Nathalie BERNARD

Ligne de commande : choix entre MEA et MFE possible

......@@ -14,7 +14,7 @@ from ast import literal_eval
try:
cmd_opts, cmd_args = getopt.getopt( sys.argv[1:],
"bc:f:hi:jl:no:O:pt:v",
[ "verbose","rna3dmotifs","3dmotifatlas","carnaval","contacts","jar3d","bayespairing","patternmatch","func=",
[ "verbose","rna3dmotifs","3dmotifatlas","carnaval","contacts","jar3d","bayespairing","patternmatch","func=", "MFE", "MEA",
"help","version","seq=","modules-path=", "jar3d-exec=", "bypdir=", "biorseo-dir=", "first-objective=","output=","theta=",
"interrupt-limit=", "outputf="])
except getopt.GetoptError as err:
......@@ -118,9 +118,10 @@ class InsertionSite:
class Job:
"""A class to store the properties of a tool execution, in order to run similar jobs in parallel."""
def __init__(self, command=None, function=None, args=None, how_many_in_parallel=0, priority=1, timeout=None):
def __init__(self, command=None, function=None, estimator=None, args=None, how_many_in_parallel=0, priority=1, timeout=None):
self.cmd_ = command
self.func_ = function
self.estimator_ = estimator
self.args_ = args
self.priority_ = priority
self.timeout_ = timeout
......@@ -150,6 +151,7 @@ class BiorseoInstance:
self.type = "dpm" # direct pattern mathcing
self.modules = "desc" # ...with Rna3dMotifs "DESC" modules
self.func = 'B' # ...and function B
self.estimator = 'b' # ...and estimator MEA"""
self.forward_options = [] # options to pass to the C++ biorseo
self.jobcount = 0
self.joblist = []
......@@ -174,15 +176,19 @@ class BiorseoInstance:
self.biorseo_dir = "/biorseo"
self.run_dir = path.dirname(path.realpath(__file__))
self.temp_dir = "temp/"
count = 0
for opt, arg in opts:
if opt == "-h" or opt == "--help":
print( "Biorseo, Bi-Objective RNA Structure Efficient Optimizer\n"
"Bio-objective integer linear programming framework to predict RNA secondary structures by including known RNA modules.\n"
"Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2018-2020\n\n")
print("Usage:\tYou must provide:\n\t1) a FASTA input file with -i,\n\t2) a module type with --rna3dmotifs, --carnaval, --contacts or --3dmotifatlas"
"\n\t3) one module placement method in { --patternmatch, --jar3d, --bayespairing }\n\t4) one scoring function with --func A, B, C or D"
"\n\n\tIf you are not using the Docker image: \n\t5) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir)")
print("Usage:\tYou must provide:\n\t1) a FASTA input file with -i,"
"\n\t2) a module type with --rna3dmotifs, --carnaval, --contacts or --3dmotifatlas"
"\n\t3) one module placement method in { --patternmatch, --jar3d, --bayespairing }"
"\n\t4) one scoring function with --func A, B, C or D"
"\n\t5) one estimator between MFE or MEA"
"\n\n\tIf you are not using the Docker image: \n\t6) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir)")
print()
print("Options:")
print("-h [ --help ]\t\t\tPrint this help message")
......@@ -202,6 +208,8 @@ class BiorseoInstance:
" Objective function to score module insertions:\n\t\t\t\t (A) insert big modules (B) insert light, high-order modules"
"\n\t\t\t\t (c) insert modules which score well with the sequence\n\t\t\t\t (D) insert light, high-order modules which score well with the sequence."
"\n\t\t\t\t C and D require cannot be used with --patternmatch.")
print("-e [ --MFE ]\t\t\tUse the estimator MFE(Minimum Free Energy)")
print("-a [ --MEA ]\t\t\tUse the estimator MEA(Maximum Expected Energy)")
print("-c [ --first-objective=… ]\t(default 1) Objective to solve in the mono-objective portions of the algorithm."
"\n\t\t\t\t (1) is the module objective given by --func, (2) is the expected accuracy of the structure.")
print("-l [ --limit=… ]\t\t(default 500) Number of solutions in the Pareto set from which"
......@@ -219,9 +227,9 @@ class BiorseoInstance:
print("--biorseo-dir=…\t\t\tPath to the BiORSEO root directory.\n\t\t\t\t Default is /biorseo, you should use this option if you run"
"\n\t\t\t\t BiORSEO from outside the docker image. Use the FULL path.")
print("\nExamples:")
print("biorseo.py -i myRNA.fa -O myResultsFolder/ --rna3dmotifs --patternmatch --func B")
print("biorseo.py -i myRNA.fa -O myResultsFolder/ --3dmotifatlas --jar3d --func B -l 800")
print("biorseo.py -i myRNA.fa -v --3dmotifatlas --bayespairing --func D")
print("biorseo.py -i myRNA.fa -O myResultsFolder/ --rna3dmotifs --patternmatch --func B --MEA")
print("biorseo.py -i myRNA.fa -O myResultsFolder/ --3dmotifatlas --jar3d --func B --MEA -l 800")
print("biorseo.py -i myRNA.fa -v --3dmotifatlas --bayespairing --func D --MEA")
print("\nThe allowed module/placement-method/function combinations are:\n")
print(" --patternmatch --bayespairing --jar3d")
print("--rna3dmotifs A. B. A. B. C. D.")
......@@ -275,6 +283,19 @@ class BiorseoInstance:
elif opt == "-l" or opt == "--interrupt-limit":
self.forward_options.append("-l")
self.forward_options.append(arg)
elif opt == "-a" or opt == "--MFA":
if count == 0:
self.forward_options.append("-a")
count = 1
else:
raise "Choose only one estimator between MEA or MFE !"
elif opt == "-e" or opt == "--MFE":
if count == 0:
self.estimator = "a"
self.forward_options.append("-e")
count = 1
else:
raise "Choose only one estimator between MEA or MFE !"
elif opt == "-v" or opt == "--verbose":
self.forward_options.append("-v")
elif opt == "-n" or opt == "--disable-pseudoknots":
......@@ -346,7 +367,8 @@ class BiorseoInstance:
if issues:
print("\nUsage:\tYou must provide:\n\t1) a FASTA input file with -i,\n\t2) one module type in { --rna3dmotifs, --carnaval, --3dmotifatlas, --contacts }"
"\n\t3) one module placement method in { --patternmatch, --jar3d, --bayespairing }\n\t4) one scoring function with --func A, B, C or D"
"\n\n\tIf you are not using the Docker image: \n\t5) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir)")
"\n\t4) one estimator for the second objective with --MFE or --MEA"
"\n\n\tIf you are not using the Docker image: \n\t6) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir)")
print("\nThe allowed module/placement-method/function combinations are:\n")
print(" --patternmatch --bayespairing --jar3d")
print("--rna3dmotifs A. B. A. B. C. D.")
......@@ -863,8 +885,12 @@ class BiorseoInstance:
command = [ executable, "-s", fastafile ]
if method_type:
command += [ method_type, csv ]
self.finalname = self.temp_dir + instance.header + ext + self.func
command += [ "-o", self.finalname, "--function", self.func ]
if self.estimator == 'a':
self.finalname = self.temp_dir + instance.header + ext + self.func + " MFE"
else:
self.finalname = self.temp_dir + instance.header + ext + self.func + " MEA"
command += ["-o", self.finalname, "--function", self.func]
command += self.forward_options
self.joblist.append(Job(command=command, priority=priority, timeout=3600, how_many_in_parallel=3))
......
......@@ -24,8 +24,9 @@ using namespace std;
using json = nlohmann::json;
char MOIP::obj_function_nbr_ = 'A';
char MOIP::obj_function2_nbr_ = 'b';
uint MOIP::obj_to_solve_ = 1;
double MOIP::precision_ = 1e-7;
double MOIP::precision_ = 1e-5;
bool MOIP::allow_pk_ = true;
uint MOIP::max_sol_nbr_ = 500;
......@@ -271,23 +272,25 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
{
for(uint j = 0; j < errors_id.size(); j++)
{
cerr << "\t> Ignoring JSON " << errors_id[j].first;
uint error = errors_id[j].second;
switch (error)
{
case 'l': cerr << ", too short to be considered."; break;
case 'x': cerr << ", sequence and secondary structure are of different size."; break;
case 'd' : cerr << ", missing header."; break;
case 'e' : cerr << ", sequence is empty."; break;
case 'f' : cerr << ", 2D is empty."; break;
case 'n' : cerr << ", brackets are not balanced."; break;
case 'k' : cerr << ", a component is too small and got removed."; break;
case 'a' : cerr << ", the number of components is different between contacts and sequence"; break;
case 'b' : cerr << ", the number of nucleotides is different between contacts and sequence"; break;
default: cerr << ", unknown reason";
if(verbose) {
cerr << "\t> Ignoring JSON " << errors_id[j].first;
uint error = errors_id[j].second;
switch (error)
{
case 'l': cerr << ", too short to be considered."; break;
case 'x': cerr << ", sequence and secondary structure are of different size."; break;
case 'd' : cerr << ", missing header."; break;
case 'e' : cerr << ", sequence is empty."; break;
case 'f' : cerr << ", 2D is empty."; break;
case 'n' : cerr << ", brackets are not balanced."; break;
case 'k' : cerr << ", a component is too small and got removed."; break;
case 'a' : cerr << ", the number of components is different between contacts and sequence"; break;
case 'b' : cerr << ", the number of nucleotides is different between contacts and sequence"; break;
default: cerr << ", unknown reason";
}
cerr << endl;
}
cerr << endl;
}
errors++;
}
......@@ -392,36 +395,40 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
}
}
// Define the MFE:
double energy[7][7] = {
{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
{0.0, 1.1, 2.1, 2.2, 1.4, 0.9, 0.6},
{0.0, 2.1, 2.4, 3.3, 2.1, 2.1, 1.4},
{0.0, 2.2, 3.3, 3.4, 2.5, 2.4, 1.5},
{0.0, 1.4, 2.1, 2.5, 1.3, 1.3, 0.5},
{0.0, 0.9, 2.1, 2.4, 1.3, 1.3, 1.0},
{0.0, 0.6, 1.4, 1.5, 0.5, 1.0, 0.3}
};
obj2 = IloExpr(env_);
for (size_t u = 0; u < rna_.get_RNA_length() - 6; u++) {
for (size_t v = u + 4; v < rna_.get_RNA_length(); v++) {
if (get_xij_index(u, v) != rna_.get_RNA_length() * rna_.get_RNA_length() + 1) {
uint type1 = rna_.get_type()[u][v];
uint type2 = rna_.get_type()[u + 1][v - 1];
obj2 += IloNum(energy[type1][type2]) * x(u, v);
{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
{0.0, 1.1, 2.1, 2.2, 1.4, 0.9, 0.6},
{0.0, 2.1, 2.4, 3.3, 2.1, 2.1, 1.4},
{0.0, 2.2, 3.3, 3.4, 2.5, 2.4, 1.5},
{0.0, 1.4, 2.1, 2.5, 1.3, 1.3, 0.5},
{0.0, 0.9, 2.1, 2.4, 1.3, 1.3, 1.0},
{0.0, 0.6, 1.4, 1.5, 0.5, 1.0, 0.3}
};
obj2 = IloExpr(env_);
switch (obj_function2_nbr_) {
case 'a':
// Define the MFE:
for (size_t u = 0; u < rna_.get_RNA_length() - 6; u++) {
for (size_t v = u + 4; v < rna_.get_RNA_length(); v++) {
if (get_xij_index(u, v) != rna_.get_RNA_length() * rna_.get_RNA_length() + 1) {
uint type1 = rna_.get_type()[u][v];
uint type2 = rna_.get_type()[u + 1][v - 1];
obj2 += IloNum(energy[type1][type2]) * x(u, v);
}
}
}
}
break;
case 'b':
// Define the expected accuracy objective function:
//MEA:
for (size_t u = 0; u < rna_.get_RNA_length() - 6; u++) {
for (size_t v = u + 4; v < rna_.get_RNA_length(); v++) {
if (allowed_basepair(u, v)) obj2 += (IloNum(rna_.get_pij(u, v)) * y(u, v));
}
}
break;
}
// Define the expected accuracy objective function:
//MEA:
/*for (size_t u = 0; u < rna_.get_RNA_length() - 6; u++) {
for (size_t v = u + 4; v < rna_.get_RNA_length(); v++) {
if (allowed_basepair(u, v)) obj2 += (IloNum(rna_.get_pij(u, v)) * y(u, v));
}
}*/
//std::cout << "\n fin \n";
}
......@@ -787,8 +794,9 @@ SecondaryStructure MOIP::solve_objective(int o, double min, double max)
void MOIP::search_between(double lambdaMin, double lambdaMax)
{
if (fabs(lambdaMin - lambdaMax) < MOIP::precision_) return;
SecondaryStructure s = solve_objective(obj_to_solve_, lambdaMin, lambdaMax);
//if (fabs(lambdaMin - lambdaMax) < MOIP::precision_) return;
if (lambdaMin - lambdaMax > 0.0) return;
SecondaryStructure s = solve_objective(obj_to_solve_, lambdaMin + MOIP::precision_, lambdaMax);
//cout << "min: " << lambdaMin << " max: " << lambdaMax << endl;
if (!s.is_empty_structure) { // A solution has been found
......@@ -817,10 +825,10 @@ void MOIP::search_between(double lambdaMin, double lambdaMax)
double max = lambdaMax;
if (verbose_)
cout << std::setprecision(-log10(precision_) + 4) << "\nSolving objective function " << obj_to_solve_
cout << std::setprecision(-log10(precision_) + 7) << "\nSolving objective function " << obj_to_solve_
<< ", on top of " << s.get_objective_score(3 - obj_to_solve_) << ": Obj" << 3 - obj_to_solve_
<< " being in [" << std::setprecision(-log10(precision_) + 4) << min << ", "
<< std::setprecision(-log10(precision_) + 4) << max << "]..." << endl;
<< " being in [" << std::setprecision(-log10(precision_) + 7) << min << ", "
<< std::setprecision(-log10(precision_) + 7) << max << "]..." << endl;
search_between(min, max);
......@@ -830,10 +838,10 @@ void MOIP::search_between(double lambdaMin, double lambdaMax)
min = lambdaMin;
max = s.get_objective_score(3 - obj_to_solve_);
if (verbose_)
cout << std::setprecision(-log10(precision_) + 4) << "\nSolving objective function " << obj_to_solve_
cout << std::setprecision(-log10(precision_) + 7) << "\nSolving objective function " << obj_to_solve_
<< ", below (or eq. to) " << max << ": Obj" << 3 - obj_to_solve_ << " being in ["
<< std::setprecision(-log10(precision_) + 4) << min << ", "
<< std::setprecision(-log10(precision_) + 4) << max << "]..." << endl;
<< std::setprecision(-log10(precision_) + 7) << min << ", "
<< std::setprecision(-log10(precision_) + 7) << max << "]..." << endl;
search_between(min, max);
}
......
......@@ -37,6 +37,7 @@ class MOIP
void forbid_solutions_between(double min, double max);
IloEnv& get_env(void);
static char obj_function_nbr_; // On what criteria do you want to insert motifs ?
static char obj_function2_nbr_; // Do you want to use MEA or MFE to determine the best energy score ?
static uint obj_to_solve_; // What objective do you prefer to solve in mono-objective portions of the algorithm ?
static double precision_; // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =(
static bool allow_pk_; // Wether we forbid pseudoknots (false) or allow them (true)
......
......@@ -61,6 +61,7 @@ int main(int argc, char* argv[])
bool verbose = false;
float theta_p_threshold;
char obj_function_nbr = 'B';
char mea_or_mfe = 'b';
list<Fasta> f;
ofstream outfile;
SecondaryStructure bestSSO1, bestSSO2;
......@@ -80,18 +81,21 @@ int main(int argc, char* argv[])
("jar3dcsv,j", po::value<string>(&motifs_path_name), "A file containing the output of JAR3D's search for motifs in the sequence, as produced by biorseo.py")
("bayespaircsv,b", po::value<string>(&motifs_path_name), "A file containing the output of BayesPairing's search for motifs in the sequence, as produced by biorseo.py")
("first-objective,c", po::value<unsigned int>(&MOIP::obj_to_solve_)->default_value(1), "Objective to solve in the mono-objective portions of the algorithm")
("first-objective,c", po::value<unsigned int>(&MOIP::obj_to_solve_)->default_value(2), "Objective to solve in the mono-objective portions of the algorithm")
("output,o", po::value<string>(&outputName), "A file to summarize the computation results")
("theta,t", po::value<float>(&theta_p_threshold)->default_value(0.001), "Pairing probability threshold to consider or not the possibility of pairing")
("function,f", po::value<char>(&obj_function_nbr)->default_value('B'), "What objective function to use to include motifs: square of motif size in nucleotides like "
"RNA-MoIP (A), light motif size + high number of components (B), site score (C), light motif size + site score + high number of components (D)")
("MFE,e", "Use as function for objective 2 MFE (Minimum Free Energy)")
("MEA,a", "Use as function for objective 2 MEA (Maximum Expected Accuracy")
("disable-pseudoknots,n", "Add constraints forbidding the formation of pseudoknots")
("limit,l", po::value<unsigned int>(&MOIP::max_sol_nbr_)->default_value(500), "Intermediate number of solutions in the Pareto set above which we give up the calculation.")
("verbose,v", "Print what is happening to stdout");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
basename = remove_ext(inputName.c_str(), '.', '/');
//theta_p_threshold = 0.01;
try {
po::store(po::parse_command_line(argc, argv, desc), vm); // can throw
......@@ -109,6 +113,11 @@ int main(int argc, char* argv[])
cout << "Biorseo v2.0, dockerized, August 2020" << endl;
return EXIT_SUCCESS;
}
if (vm.count("MFE")) mea_or_mfe = 'a';
if (vm.count("MEA")) {
mea_or_mfe = 'b';
cout << "hey" << endl;
}
if (vm.count("verbose")) verbose = true;
if (vm.count("disable-pseudoknots")) MOIP::allow_pk_ = false;
......@@ -134,6 +143,7 @@ int main(int argc, char* argv[])
}
MOIP::obj_function_nbr_ = obj_function_nbr;
MOIP::obj_function2_nbr_ = mea_or_mfe;
/* FILE PARSING */
......@@ -175,6 +185,10 @@ int main(int argc, char* argv[])
// return 0;
//test
/*string best_score;
bool flag = false;*/
if (verbose)
cout << "Solving..." << endl;
try {
......@@ -185,17 +199,22 @@ int main(int argc, char* argv[])
cout << endl << "Best solution according to objective 1 :" << bestSSO1.to_string() << endl;
cout << "Best solution according to objective 2 :" << bestSSO2.to_string() << endl;
}
//test
/*string s = bestSSO1.to_string();
string delimiter = "\t";
best_score = bestSSO1.to_string().substr(s.find(delimiter) + delimiter.size());
best_score = best_score.substr(0, best_score.find(delimiter));*/
// extend the Pareto set on top
if (MOIP::obj_to_solve_ == 1) {
myMOIP.add_solution(bestSSO1);
min = bestSSO1.get_objective_score(2) + MOIP::precision_;
max = bestSSO2.get_objective_score(2);
max = bestSSO2.get_objective_score(2) + MOIP::precision_;
if (verbose) cout << endl << "Solving obj1 on top of best solution 1." << endl;
} else {
myMOIP.add_solution(bestSSO2);
min = bestSSO2.get_objective_score(1) + MOIP::precision_;
max = bestSSO1.get_objective_score(1);
max = bestSSO1.get_objective_score(1) + MOIP::precision_;
if (verbose) cout << endl << "Solving obj2 on top of best solution 2." << endl;
}
......@@ -240,7 +259,6 @@ int main(int argc, char* argv[])
cout << "Best value for Motif insertion objective: " << bestSSO1.get_objective_score(1) << endl;
cout << "Best value for structure expected accuracy: " << bestSSO2.get_objective_score(2) << endl;
}
// Save it to file
if (vm.count("output")) {
if (verbose) cout << "Saving structures to " << outputName << "..." << endl;
......@@ -249,7 +267,20 @@ int main(int argc, char* argv[])
//cout << "----struc----" << endl << myMOIP.solution(0).to_string() << endl;
for (uint i = 0; i < myMOIP.get_n_solutions(); i++) {
outfile << myMOIP.solution(i).to_string() << endl << structure_with_contacts(myMOIP.solution(i)) << endl;
string str1 = myMOIP.solution(i).to_string();
//Check if the best score for obj2 is correctly include in the results
/*string delimiter = "\t";
string obj2 = str1.substr(str1.find(delimiter) + delimiter.size());
obj2 = obj2.substr(0, obj2.find(delimiter));
if (obj2.compare(best_score) == 0)
flag = true;*/
}
/*if (!flag)
cout << "\033[1m\033[31mBest score not find for " << outputName << " !\033[0m" << endl;
else
cout << "OK for "<< outputName << "!" << endl;*/
outfile.close();
}
......