Louis Becquey

automated Carnaval installation

......@@ -83,12 +83,14 @@ Get the latest version of the HL and IL module models from the [BGSU website](ht
### CARNAVAL DATA
You first need to have the 'networkx' package installed for Python 3. Then just run the script `./scripts/transform_CaRNAval_pickle.py`, this will create files into `./data/modules/RIN/Subfiles` :
You first need to have the `unzip` command installed on your machine and the `networkx` package installed for Python 3. Then just run the script `Install_CaRNAval_RINs.py`, this will create files into `./data/modules/RIN/Subfiles` :
```bash
cd scripts
python3 transform_CaRNAval_pickle.py
python3 Install_CaRNAval_RINs.py
```
If you do not have the unzip command, download and extract manually the [CaRNAval dataset](http://carnaval.lri.fr/carnaval_dataset.zip) and place the files `RIN.py` and `CaRNAval_1_as_dictionnary.nxpickled` in the folder `data/modules/RIN/`, and run the python script.
### DEPENDENCIES
- Make sure you have Python 3.7+ and a C++ compiler (tested with GCC and clang) installed on your distribution. Use a recent one, we use the 2017 C++ standard. The compilation will not work with Ubuntu 16's GCC 5.4 for example.
- Install automake, libeigen3-dev, libboost-program-options-dev and libboost-filesystem-dev, or equivalent packages in your distribution (Eigen 3 and Boost headers).
......
......@@ -14,7 +14,7 @@ from ast import literal_eval
try:
cmd_opts, cmd_args = getopt.getopt( sys.argv[1:],
"bc:f:hi:jl:no:O:pt:v",
[ "verbose","rna3dmotifs","3dmotifatlas","jar3d","bayespairing","patternmatch","func=",
[ "verbose","rna3dmotifs","3dmotifatlas","carnaval","jar3d","bayespairing","patternmatch","func=",
"help","version","seq=","modules-path=", "jar3d-exec=", "bypdir=", "biorseo-dir=", "first-objective=","output=","theta=",
"interrupt-limit=", "outputf="])
except getopt.GetoptError as err:
......@@ -242,7 +242,7 @@ class BiorseoInstance:
self.type = "jar3d"
elif opt == "-b" or opt == "--bayespairing":
self.type = "byp"
elif opt == "--rin":
elif opt == "--carnaval":
self.modules = "rin"
elif opt == "--rna3dmotifs":
self.modules = "desc"
......@@ -792,7 +792,7 @@ class BiorseoInstance:
# define job list
for instance in RNAcontainer:
executable = self.biorseo_dir + "/bin/biorseo"
executable = self.biorseo_dir + "bin/biorseo"
fastafile = self.temp_dir+instance.header+".fa"
method_type = ""
priority = 1
......
......@@ -20,13 +20,6 @@
using namespace boost::filesystem;
using namespace std;
using std::abs;
using std::cerr;
using std::cout;
using std::endl;
using std::make_pair;
using std::vector;
char MOIP::obj_function_nbr_ = 'A';
uint MOIP::obj_to_solve_ = 1;
double MOIP::precision_ = 1e-5;
......@@ -207,70 +200,61 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
}
else if (source == "rinfolder")
{
vector<Motif> motifs;
string valid_path = source_path;
string reversed_rna = rna_.get_seq();
std::reverse(reversed_rna.begin(), reversed_rna.end());
if (valid_path.back() != '/') valid_path.push_back('/');
if (verbose) cout << "loading RIN motifs from " << valid_path << "..." << endl;
size_t number_of_files = (size_t) std::distance(std::filesystem::directory_iterator{source_path}, std::filesystem::directory_iterator{});
if (verbose) cout << "Number of files : " << number_of_files << endl;
size_t number_of_files = (size_t) distance(directory_iterator{source_path}, directory_iterator{});
if (verbose) cout << "\t> Number of files : " << number_of_files << endl;
for (size_t i=0; i<number_of_files; i++) //337 is the number of RINs in CaRNAval
{
motifs.push_back(Motif()) ;
motifs.back().load_from_txt(valid_path, i);
Motif m = Motif(source_path, i)
vector<string> vc;
string motif_seq = "" ;
string motif_seq = "";
for (Component component : motifs.back().comp)
for (Component component : m.comp)
{
vc.push_back(component.seq_) ;
motif_seq += component.seq_ ;
vc.push_back(component.seq_);
motif_seq += component.seq_;
}
if (motif_seq.length() < 5)
{
if (verbose) std::cout << "RIN n°" << i+1 << " is too short to be considered." << std::endl ;
motifs.pop_back();
continue ;
if (verbose) cout << "RIN n°" << i+1 << " is too short to be considered." << endl;
continue;
}
if (motifs.back().links_.size() == 0)
if (m.links_.size() == 0)
{
if (verbose) std::cout << "RIN n°" << i+1 << " is not considered for not constraining the secondary structure." << std::endl ;
motifs.pop_back();
continue ;
if (verbose) cout << "RIN n°" << i+1 << " is not considered, because not constraining the secondary structure." << endl;
continue;
}
vector<vector<Component>> occurrences = find_next_ones_in(rna_.get_seq(), 0, vc) ;
vector<vector<Component>> r_occurrences = find_next_ones_in(reversed_rna, 0, vc) ;
vector<vector<Component>> occurrences = find_next_ones_in(rna_.get_seq(), 0, vc);
vector<vector<Component>> r_occurrences = find_next_ones_in(reversed_rna, 0, vc);
motifs.pop_back() ;
motifs.pop_back();
for (vector<Component> occ : occurrences)
{
motifs.push_back(Motif()) ;
motifs.back().load_from_txt(valid_path, i);
motifs.back().comp = occ ;
motifs.back().reversed_ = false ;
motifs.push_back(Motif());
m.load_from_txt(source_path, i);
m.comp = occ;
m.reversed_ = false;
bool to_keep = true;
if (!(allowed_basepair(motifs.back().comp[0].pos.first, motifs.back().comp.back().pos.second)))
if (!(allowed_basepair(m.comp[0].pos.first, m.comp.back().pos.second)))
to_keep = false;
else if (motifs.back().comp.size() != 1)
for (size_t j = 0; j < motifs.back().comp.size() - 1; j++)
if ( !(allowed_basepair(motifs.back().comp[0].pos.first, motifs.back().comp.back().pos.second)))
else if (m.comp.size() != 1)
for (size_t j = 0; j < m.comp.size() - 1; j++)
if ( !(allowed_basepair(m.comp[0].pos.first, m.comp.back().pos.second)))
{
to_keep = false;
j = motifs.back().comp.size();
j = m.comp.size();
}
if (to_keep == false)
......@@ -279,22 +263,22 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
for (vector<Component> occ : r_occurrences)
{
motifs.push_back(Motif()) ;
motifs.back().load_from_txt(valid_path, i);
motifs.back().comp = occ ;
motifs.back().reversed_ = true ;
motifs.push_back(Motif());
m.load_from_txt(source_path, i);
m.comp = occ;
m.reversed_ = true;
bool to_keep = true;
if (!(allowed_basepair(motifs.back().comp[0].pos.first, motifs.back().comp.back().pos.second)))
if (!(allowed_basepair(m.comp[0].pos.first, m.comp.back().pos.second)))
to_keep = false;
else if (motifs.back().comp.size() != 1)
for (size_t j = 0; j < motifs.back().comp.size() - 1; j++)
if ( !(allowed_basepair(motifs.back().comp[0].pos.first, motifs.back().comp.back().pos.second)))
else if (m.comp.size() != 1)
for (size_t j = 0; j < m.comp.size() - 1; j++)
if ( !(allowed_basepair(m.comp[0].pos.first, m.comp.back().pos.second)))
{
to_keep = false;
j = motifs.back().comp.size();
j = m.comp.size();
}
if (to_keep == false)
......@@ -304,7 +288,7 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
if (verbose) cout << "Done : parsed " << number_of_files << " files." << endl;
insertion_sites_ = motifs ;
insertion_sites_ = motifs;
}
else
{
......@@ -479,12 +463,12 @@ void MOIP::define_problem_constraints(string& source)
else
{
bool is_link = false ;
bool is_link = false;
for (Link link : x.links_)
if ((u==link.nts.first and v==link.nts.second) or (u==link.nts.second and v==link.nts.first))
{
is_link = true ;
break ;
is_link = true;
break;
}
if (!is_link)
......@@ -550,29 +534,29 @@ void MOIP::define_problem_constraints(string& source)
Motif& x = insertion_sites_[i];
//IloExpr c6p = IloExpr(env_);
vector<size_t> weights(x.comp.size(), 0) ;
vector<size_t> weights(x.comp.size(), 0);
vector<vector<IloExpr>> expressions(x.comp.size(), vector<IloExpr>());
size_t sum_comp_size = 0 ;
size_t sum_comp_size = 0;
for (size_t j=0; j < x.comp.size(); j++)
{
IloExpr c6 = IloExpr(env_);
bool to_insert = false ;
size_t jj ;
bool to_insert = false;
size_t jj;
for (size_t k=0; k < x.links_.size(); k++)
{
size_t ntA = x.links_[k].nts.first ;
size_t ntB = x.links_[k].nts.second ;
size_t ntA = x.links_[k].nts.first;
size_t ntB = x.links_[k].nts.second;
//check if the j component is the first to be linked in the k link
if( sum_comp_size <= ntA && ntA < sum_comp_size + x.comp[j].k )
{
size_t ntA_location = x.comp[j].pos.first + ntA - sum_comp_size ;
size_t ntB_location = -1 ;
size_t ntA_location = x.comp[j].pos.first + ntA - sum_comp_size;
size_t ntB_location = -1;
size_t sum_next_comp_size = sum_comp_size ;
size_t sum_next_comp_size = sum_comp_size;
//look for the location of the other linked nucleotide
for (jj=j; jj < x.comp.size(); jj++)
......@@ -580,48 +564,48 @@ void MOIP::define_problem_constraints(string& source)
//check if the jj component is the second to be linked in the k link
if( sum_next_comp_size <= ntB && ntB < sum_next_comp_size + x.comp[jj].k )
{
ntB_location = x.comp[jj].pos.first + ntB - sum_next_comp_size ;
break ;
ntB_location = x.comp[jj].pos.first + ntB - sum_next_comp_size;
break;
}
sum_next_comp_size += x.comp[jj].k ;
sum_next_comp_size += x.comp[jj].k;
}
if (allowed_basepair(ntA_location, ntB_location))
{
c6 += y(ntA_location, ntB_location) ;
to_insert = true ;
c6 += y(ntA_location, ntB_location);
to_insert = true;
}
else //a link is unauthorized, the component cannot be inserted
{
to_insert = false ;
break ;
to_insert = false;
break;
}
}
}
sum_comp_size += x.comp[j].k ;
sum_comp_size += x.comp[j].k;
if (to_insert)
{
if (j==jj)
{
//model_.add(C(i,j) <= c6) ;
//weights[j] += 2 ;
weights[j] += 1 ;
expressions[j].push_back(c6) ;
//model_.add(C(i,j) <= c6);
//weights[j] += 2;
weights[j] += 1;
expressions[j].push_back(c6);
//if (verbose_) cout << "\t\t" << (C(i, j) <= c6) << endl;
}
else
{
//model_.add(C(i,j) <= c6) ;
weights[j] += 1 ;
expressions[j].push_back(c6) ;
//model_.add(C(i,j) <= c6);
weights[j] += 1;
expressions[j].push_back(c6);
//if (verbose_) cout << "\t\t" << (C(i, j) <= c6) << endl;
//model_.add(C(i,jj) <= c6) ;
weights[jj] += 1 ;
expressions[jj].push_back(c6) ;
//model_.add(C(i,jj) <= c6);
weights[jj] += 1;
expressions[jj].push_back(c6);
//if (verbose_) cout << "\t\t" << (C(i, jj) <= c6) << endl;
}
}
......@@ -632,7 +616,7 @@ void MOIP::define_problem_constraints(string& source)
if (expressions[j].size() != 0)
for (size_t k=0; k<expressions[j].size(); k++)
{
model_.add( IloNum(weights[j]) * C(i,j) <= (expressions[j])[k] ) ;
model_.add( IloNum(weights[j]) * C(i,j) <= (expressions[j])[k] );
if (verbose_) cout << "\t\t" << (IloNum(weights[j]) * C(i, j) <= (expressions[j])[k]) << endl;
}
}
......
......@@ -88,106 +88,91 @@ Motif::Motif(string csv_line)
void Motif::load_from_txt(string path, int id)
Motif::Motif(string path, int id)
{
carnaval_id = to_string(1 + id) ; // Start counting at 1 to be consistant with the website numbering
atlas_id = "" ;
PDBID = "" ;
is_model_ = false ;
source_ = CARNAVAL ;
// Loads a motif from the RIN file of Carnaval
carnaval_id = to_string(1 + id); // Start counting at 1 to be consistant with the website numbering
atlas_id = "";
PDBID = "";
is_model_ = false;
source_ = CARNAVAL;
/*-----comp-----*/
std::ifstream file(path + to_string(id) + ".txt") ;
std::ifstream file(path + to_string(id) + ".txt");
if (file.is_open())
{
string line ;
std::getline(file,line) ; //skip the header_link line
string line;
std::getline(file,line) ; //get the links line
string link_str ;
size_t index = 0 ;
string nt_str ;
size_t sub_index = 0 ;
std::getline(file,line); //skip the header_link line
std::getline(file,line); //get the links line
string link_str;
size_t index = 0;
string nt_str;
size_t sub_index = 0;
while (line != "")
{
Link link ;
Link link;
//link.nts
index = line.find(";") ;
link_str = line.substr(0, index) ;
//std::cout << "link_str :" << link_str << std::endl ;
line.erase(0, index+1) ;
sub_index = link_str.find(",") ;
nt_str = link_str.substr(0, sub_index) ;
//std::cout << "nt_str :" << nt_str << std::endl ;
link_str.erase(0, sub_index+1) ;
link.nts.first = std::stoi(nt_str) ;
sub_index = link_str.find(",") ;
nt_str = link_str.substr(0, sub_index) ;
//std::cout << "nt_str :" << nt_str << std::endl ;
link_str.erase(0, sub_index+1) ;
link.nts.second = std::stoi(nt_str) ;
index = line.find(";");
link_str = line.substr(0, index);
line.erase(0, index+1);
//link.long_range
//std::cout << "link_str :" << link_str << std::endl << std::endl ;
link.long_range = (link_str == "True") ;
links_.push_back(link) ;
}
sub_index = link_str.find(",");
nt_str = link_str.substr(0, sub_index);
link_str.erase(0, sub_index+1);
link.nts.first = std::stoi(nt_str);
sub_index = link_str.find(",");
nt_str = link_str.substr(0, sub_index);
link_str.erase(0, sub_index+1);
link.nts.second = std::stoi(nt_str);
std::getline(file,line) ; //skip the header_comp line
//link.long_range
link.long_range = (link_str == "True");
string pos_str ;
string sub_pos_str ;
links_.push_back(link);
}
string k_str ;
string seq ;
std::getline(file,line); //skip the header_comp line
string pos_str, sub_pos_str, k_str, seq;
while ( std::getline(file,line) )
{
if (line == "\n") break ; //skip last line (empty)
if (line == "\n") break; //skip last line (empty)
Component c(0,0) ;
Component c(0,0);
//c.pos
index = line.find(";") ;
pos_str = line.substr(0, index) ;
line.erase(0, index+1) ;
sub_index = pos_str.find(",") ;
sub_pos_str = pos_str.substr(0, sub_index) ;
//std::cout << sub_pos_str << " ";
pos_str.erase(0, sub_index+1) ;
c.pos.first = std::stoi(sub_pos_str) ;
//std::cout << pos_str << " ";
c.pos.second = std::stoi(pos_str) ;
index = line.find(";");
pos_str = line.substr(0, index);
line.erase(0, index+1);
sub_index = pos_str.find(",");
sub_pos_str = pos_str.substr(0, sub_index);
pos_str.erase(0, sub_index+1);
c.pos.first = std::stoi(sub_pos_str);
c.pos.second = std::stoi(pos_str);
//c.k
index = line.find(";") ;
k_str = line.substr(0, index) ;
//std::cout << k_str << " ";
line.erase(0, index+1) ;
c.k = std::stoi(k_str) ;
index = line.find(";");
k_str = line.substr(0, index);
line.erase(0, index+1);
c.k = std::stoi(k_str);
//c.seq_
seq = line ;
//std::cout << line << "\n" ;
c.seq_ = seq ;
seq = line;
c.seq_ = seq;
comp.push_back(c) ;
comp.push_back(c);
}
}
else std::cout << "Motif::load_from_txt -> File not found : " + path + carnaval_id + ".txt" << std::endl ;
else cout << "RIN file not found : " + path + carnaval_id + ".txt" << endl;
}
......@@ -263,60 +248,60 @@ char Motif::is_valid_DESC(const string& descfile)
vector<Motif> Motif::RIN_list(const string& rna, bool reversed)
{
string used_rna = rna ;
if (reversed) std::reverse(used_rna.begin(), used_rna.end()) ;
string used_rna = rna;
if (reversed) std::reverse(used_rna.begin(), used_rna.end());
vector<Motif> res;
vector< vector<int> > comps_starts ; //list of beginning indexes by components
vector< vector<int> > comps_starts; //list of beginning indexes by components
vector<int> ks;
size_t index ;
size_t index;
for (Component& component : comp)
{
index = 0 ;
vector<int> starts ;
index = 0;
vector<int> starts;
while (index != string::npos)
{
index = used_rna.find(component.seq_, index) ;
index = used_rna.find(component.seq_, index);
starts.push_back(index);
}
comps_starts.push_back(starts) ;
ks.push_back(component.k) ;
comps_starts.push_back(starts);
ks.push_back(component.k);
}
return res ;
return res;
}
bool Motif::is_valid(const string& rna, bool reversed) //renvoyer un vecteur de motifs
{
size_t index = 0 ;
reversed_ = reversed ;
size_t index = 0;
reversed_ = reversed;
for (Component& component : comp)
{
index = rna.find(component.seq_, index) ;
index = rna.find(component.seq_, index);
if (index == string::npos) //seq_ not found
{
if (reversed) return false ; //searched through rna and reversed rna, still nothing
if (reversed) return false; //searched through rna and reversed rna, still nothing
else //try to look through the reversed sequence
{
string reversed_rna(rna) ;
std::reverse(reversed_rna.begin(), reversed_rna.end()) ;
return is_valid(reversed_rna, true) ;
string reversed_rna(rna);
std::reverse(reversed_rna.begin(), reversed_rna.end());
return is_valid(reversed_rna, true);
}
}
component.pos.first = index ;
component.pos.second = index + component.k - 1 ;
component.pos.first = index;
component.pos.second = index + component.k - 1;
}
return true ;
return true;
}
......@@ -369,7 +354,6 @@ bool is_desc_insertible(const string& descfile, const string& rna)
vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string>& vc)
{
pair<uint, uint> pos;
......
......@@ -44,7 +44,7 @@ class Motif
Motif(void);
Motif(string csv_line);
Motif(const vector<Component>& v, string PDB);
void load_from_txt(string path, int id); //full path to biorseo/data/modules/CaRNAval/Subfiles/
Motif(string path, int id); //full path to biorseo/data/modules/RIN/Subfiles/
bool is_valid(const string& rna, bool reversed);
vector<Motif> RIN_list(const string& rna, bool reversed);
static char is_valid_DESC(const string& descfile);
......
......@@ -5,7 +5,7 @@
# We do this because the official JSON file is hard to understand, and Antoine Soulé
# recommended the pickle.
import networkx, os, pickle, sys
import networkx, os, pickle, subprocess, sys
if __name__=="__main__":
......@@ -17,9 +17,14 @@ if __name__=="__main__":
try:
sys.path.append(os.path.abspath(rin_DIR))
import RIN
except:
print("File not found:" + rin_DIR + "RIN.py")
exit(1)
except ImportError:
# We have to download it
subprocess.run(["wget", '-O', '../data/modules/carnaval_dataset.zip', "http://carnaval.lri.fr/carnaval_dataset.zip"])
subprocess.run(["unzip", '-ou', '../data/modules/carnaval_dataset.zip', "carnaval_dataset/CaRNAval_1_as_dictionnary.nxpickled", "carnaval_dataset/RIN.py"])
subprocess.run(["rm", "-f", "../data/modules/RIN/", "../data/modules/carnaval_dataset.zip"])
subprocess.run(["mv", "carnaval_dataset/", "../data/modules/RIN/"])
sys.path.append(os.path.abspath(rin_DIR))
import RIN
try:
objects = []
......