Merge branch 'master' of https://github.com/persalteas/biominserter

Louis BECQUEY
Commit 73734e9d0b9e91817d5d0f62f49c30c191b62ec6 73734e9d 2 parents 30b097cb d26a6b5c
Showing 16 changed files with 288 additions and 195 deletions
.gitignore
README.md
biorseo.py
cppsrc/MOIP.cpp
cppsrc/MOIP.h
cppsrc/biorseo.cpp
data/sec_structs/applications.dbn
data/sec_structs/verified_tRNAs.dbn
doc/fig/Benchmark_avg.jpg
doc/fig/pseudobase.jpg
doc/fig/pseudobase.png
doc/fig/pseudobase_avg.jpg
doc/fig/pseudobase_zoom.jpg
doc/main_bioinformatics.tex
doc/supplementary_material.tex
test_on_RNAstrand.py
--- a/.gitignore
View file @73734e9
+++ b/.gitignore
View file @73734e9
@@ -19,6 +19,7 @@ bin/*
 
 # results
 results/*
+ benchmark_results/*
 log_of_the_run.sh
 logBadDesc.txt
 gurobi.log
--- a/README.md
View file @73734e9
+++ b/README.md
View file @73734e9
@@ -9,7 +9,7 @@ Contact : louis.becquey@univ-evry.fr
 1/ How it works
 ===================================
 INPUT:
- - An RNA sequence (tested with sequences ~100 bases)
+ - An RNA sequence (with 16 GB of RAM you can go up to ~230 bases)
 
 THEN
 - **Pattern-matching step** : Find all possible occurrences of known RNAmodules in the query sequence, by finding subsequences of the querythat score well with the probabilistic models of the modules (like JAR3D, or BayesPairing)
--- a/biorseo.py
View file @73734e9
+++ b/biorseo.py
View file @73734e9
@@ -21,19 +21,18 @@ bypdir = ""
 biorseoDir = "."
 exec(compile(open(biorseoDir+"/EditMe").read(), '', 'exec'))
 runDir = path.dirname(path.realpath(__file__))
- tempDir = biorseoDir + "/temp/"
- HLmotifDir = biorseoDir + "/data/modules/BGSU/HL/3.2/lib"
- ILmotifDir = biorseoDir + "/data/modules/BGSU/IL/3.2/lib"
- descfolder = biorseoDir + "/data/modules/DESC"
+ modulespath = biorseoDir + "/data/modules"
+ HLmotifDir = modulespath + "/BGSU/HL/3.2/lib"
+ ILmotifDir = modulespath + "/BGSU/IL/3.2/lib"
+ descfolder = modulespath + "/DESC"
 
 # Parse options
 try:
-     opts, args = getopt.getopt(sys.argv[1:], "hi:o:", ["rna3dmotifs","3dmotifatlas","jar3d","bayespairing","patternmatch","func="])
- except getopt.GetoptError:
-     print("Please provide arguments !")
+     opts, args = getopt.getopt(sys.argv[1:], "bc:f:hi:jl:no:pt:v", ["verbose", "rna3dmotifs","3dmotifatlas","jar3d","bayespairing","patternmatch","func=","help","version","seq=","modules-path=", "first-objective=","output=","theta=","interrupt-limit="])
+ except getopt.GetoptError as err:
+     print(err)
     sys.exit(2)
 
- 
 m = Manager()
 running_stats = m.list()
 running_stats.append(0) # n_launched
@@ -335,38 +334,75 @@ class BiorseoInstance:
         self.jobcount = 0
         self.joblist = []
         self.mode = 0 # default is single sequence mode
+         self.forward_options = []
 
         for opt, arg in opts:
-             if opt == "-h":
-                 print("biorseo.py -i myRNA.fa -o myRNA.rawB --rna3dmotifs --patternmatch --func B")
-                 print("biorseo.py -i myRNA.fa -o myRNA.jar3dB --3dmotifatlas --jar3d --func B")
-                 print("biorseo.py -i myRNA.fa -o myRNA.bgsubypD --3dmotifatlas --bayespairing --func D")
+             if opt == "-h" or opt == "--help":
+                 print(  "Biorseo, Bi-Objective RNA Structure Efficient Optimizer\n"
+                         "Bio-objective integer linear programming framework to predict RNA secondary structures by including known RNA modules.\n"
+                         "Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2019\n\n")
+                 print("Usage:\tYou must provide:\n\t1) a FASTA input file with -i,\n\t2) a module type with --rna3dmotifs or --3dmotifatlas"
+                       "\n\t3) one module placement method in { --patternmatch, --jar3d, --bayespairing }\n\t")
+                 print("Options:")
+                 print("-h [ --help ]\t\tPrint this help message")
+                 print("--version\t\t\tPrint the program version")
+                 print("-i [ --seq ]\t\tFASTA file with the query RNA sequence")
+                 print("-p [ --patternmatch ]\t\tUse regular expressions to place modules in the sequence")
+                 print("-j [ --jar3d ]\t\tUse JAR3D to place modules in the sequence (requires --3dmotifatlas)")
+                 print("-b [ --bayespairing ]\t\tUse BayesPairing to place modules in the sequence")
+                 print("-o [ --output ]\t\tFolder where to output files")
+                 print("-f [ --func ]\t\t(A, B, C or D, default is B)"
+                       "\t\t\t\tObjective function to score module insertions: (A) insert big modules (B) insert light, high-order modules"
+                       "\t\t\t\t(c) insert modules which score well with the sequence (D) insert light, high-order modules which score well with the sequence.")
+                       "\t\t\t\tC and D require cannot be used with --patternmatch.")
+ 
+                 print("biorseo.py -i myRNA.fa -o myResultsFolder/ --rna3dmotifs --patternmatch --func B")
+                 print("biorseo.py -i myRNA.fa -o myResultsFolder/ --3dmotifatlas --jar3d --func B")
+                 print("biorseo.py -i myRNA.fa --3dmotifatlas --bayespairing --func D")
                 sys.exit()
-             elif opt == "-i":
+             elif opt == "-i" or opt == "--seq":
                 self.inputfile = arg
-             elif opt == "-o":
+             elif opt == "-o" or opt == "--output":
                 self.outputf = arg # output file or folder...
                 if self.outputf[1] != '/':
                     self.outputf = getcwd() + '/' + self.outputf
                 if self.outputf[-1] != '/':
                     self.outputf = self.outputf + '/'
-             elif opt == "--func":
+             elif opt == "-f" or opt == "--func":
                 if arg in ['A', 'B', 'C', 'D']:
                     self.func = arg
                 else:
                     raise "Unknown scoring function " + arg
-             elif opt == "--patternmatch":
+             elif opt == "-p" or opt == "--patternmatch":
                 self.type = "dpm"
-             elif opt == "--jar3d":
+             elif opt == "-j" or opt == "--jar3d":
                 self.type = "jar3d"
-             elif opt == "--bayespairing":
+             elif opt == "-b" or opt == "--bayespairing":
                 self.type = "byp"
             elif opt == "--rna3dmotifs":
                 self.modules = "desc"
             elif opt == "--3dmotifatlas":
                 self.modules = "bgsu"
-             else:
-                 raise "Unknown option " + opt
+             elif opt == "--modulespath":
+                 HLmotifDir = arg + "/HL/3.2/lib"
+                 ILmotifDir = arg + "/IL/3.2/lib"
+                 descfolder = arg
+             elif opt == "--version":
+                 subprocess.call([biorseoDir+"/bin/biorseo", "--version"])
+                 exit(0)
+             elif opt == "-l" or opt == "--interrupt-limit":
+                 self.forward_options.append("-l")
+                 self.forward_options.append(arg)
+             elif opt == "-v" or opt == "--verbose":
+                 self.forward_options.append("-v")
+             elif opt == "-n" or opt == "--disable-pseudoknots":
+                 self.forward_options.append("-n")
+             elif opt == "-t" or opt == "--theta":
+                 self.forward_options.append("-t")
+                 self.forward_options.append(arg)
+             elif opt == "-c" or opt == "--first-objective":
+                 self.forward_options.append("-c")
+                 self.forward_options.append(arg)
 
         print("saving files to", self.outputf)
         # create jobs
@@ -793,7 +829,8 @@ class BiorseoInstance:
             command = [executable, "-s", fastafile ]
             if method_type:
                 command += [ method_type, csv ]
-             command += [ "-o", self.outputf + instance.header + ext + self.func, "--type", self.func ]
+             command += [ "-o", self.outputf + instance.header + ext + self.func, "--function", self.func ]
+             command += self.forward_options
             self.joblist.append(Job(command=command, priority=priority, timeout=3600, how_many_in_parallel=3))
             
 
--- a/cppsrc/MOIP.cpp
View file @73734e9
+++ b/cppsrc/MOIP.cpp
View file @73734e9
@@ -21,6 +21,7 @@ char   MOIP::obj_function_nbr_ = 'A';
 uint   MOIP::obj_to_solve_     = 1;
 double MOIP::precision_        = 1e-5;
 bool   MOIP::allow_pk_         = true;
+ uint   MOIP::max_sol_nbr_      = 500;
 
 unsigned getNumConstraints(IloModel& m)
 {
@@ -499,8 +500,8 @@ void MOIP::add_solution(const SecondaryStructure& s)
 {
     if (verbose_) cout << "\t>adding structure to Pareto set :\t" << s.to_string() << endl;
     pareto_.push_back(s);
-     if (pareto_.size() > 500) {
-         cerr << "\033[31m Quitting because combinatorial issues (>500 solutions in Pareto set). \033[0m" << endl;
+     if (pareto_.size() > max_sol_nbr_) {
+         cerr << "\033[31m Quitting because combinatorial issues (>" << max_sol_nbr_ << " solutions in Pareto set). \033[0m" << endl;
         exit(1);
     }
 }
--- a/cppsrc/MOIP.h
View file @73734e9
+++ b/cppsrc/MOIP.h
View file @73734e9
@@ -30,7 +30,8 @@ class MOIP
     static uint               obj_to_solve_;  // What objective do you prefer to solve in mono-objective portions of the algorithm ?
     static double             precision_;   // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =(
     static bool               allow_pk_;      // Wether we forbid pseudoknots (false) or allow them (true)
- 
+     static uint               max_sol_nbr_;  // Number of solutions to accept in the Pareto set before we give up the computation
+     
     private:
     bool   is_undominated_yet(const SecondaryStructure& s);
     void   define_problem_constraints(void);
--- a/cppsrc/biorseo.cpp
View file @73734e9
+++ b/cppsrc/biorseo.cpp
View file @73734e9
@@ -73,14 +73,15 @@ int main(int argc, char* argv[])
     ("version", "Print the program version")
     ("seq,s", po::value<string>(&inputName)->required(), "Fasta file containing the RNA sequence")
     ("descfolder,d", po::value<string>(&motifs_path_name), "A folder containing modules in .desc format, as produced by Djelloul & Denise's catalog program")
-     ("jar3dcsv", po::value<string>(&motifs_path_name), "A file containing the output of JAR3D's search for motifs in the sequence, as produced by test_on_RNAstrand.py")
-     ("bayespaircsv", po::value<string>(&motifs_path_name), "A file containing the output of BayesPairing's search for motifs in the sequence, as produced by test_on_RNAstrand.py")
+     ("jar3dcsv,j", po::value<string>(&motifs_path_name), "A file containing the output of JAR3D's search for motifs in the sequence, as produced by test_on_RNAstrand.py")
+     ("bayespaircsv,b", po::value<string>(&motifs_path_name), "A file containing the output of BayesPairing's search for motifs in the sequence, as produced by test_on_RNAstrand.py")
     ("first-objective,c", po::value<unsigned int>(&MOIP::obj_to_solve_)->default_value(1), "Objective to solve in the mono-objective portions of the algorithm")
     ("output,o", po::value<string>(&outputName), "A file to summarize the computation results")
     ("theta,t", po::value<float>(&theta_p_threshold)->default_value(0.001), "Pairing probability threshold to consider or not the possibility of pairing")
-     ("type,f", po::value<char>(&obj_function_nbr)->default_value('A'), "What objective function to use to include motifs: square of motif size in nucleotides like "
-     "RNA-MoIP (A), motif size + number of components (B), site score (C), motif size + site score + number of components (D)")
+     ("function,f", po::value<char>(&obj_function_nbr)->default_value('B'), "What objective function to use to include motifs: square of motif size in nucleotides like "
+     "RNA-MoIP (A), light motif size + high number of components (B), site score (C), light motif size + site score + high number of components (D)")
     ("disable-pseudoknots,n", "Add constraints forbidding the formation of pseudoknots")
+     ("limit,l", po::value<unsigned int>(&MOIP::max_sol_nbr_)->default_value(500), "Intermediate number of solutions in the Pareto set above which we give up the calculation.")
     ("verbose,v", "Print what is happening to stdout");
     po::variables_map vm;
     po::store(po::parse_command_line(argc, argv, desc), vm);
@@ -99,7 +100,7 @@ int main(int argc, char* argv[])
             return EXIT_SUCCESS;
         }
         if (vm.count("version")) {
-             cout << "Biorseo v1.0, May 2019" << endl;
+             cout << "Biorseo v1.01, June 2019" << endl;
             return EXIT_SUCCESS;
         }
         if (vm.count("verbose")) verbose = true;
@@ -112,7 +113,7 @@ int main(int argc, char* argv[])
             return EXIT_FAILURE;
         }
         if (vm.count("-d") and (obj_function_nbr == 'C' or obj_function_nbr == 'D')) {
-             cerr << "\033[31mYou must provide --jar3dcsv or --bayespaircsv to use --type C or --type D.\033[0m See "
+             cerr << "\033[31mYou must provide --jar3dcsv or --bayespaircsv to use --function C or --function D.\033[0m See "
                     "--help for more "
                     "information."
                  << endl;
--- a/data/sec_structs/applications.dbn
View file @73734e9
+++ b/data/sec_structs/applications.dbn
View file @73734e9
@@ -6,4 +6,4 @@ GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa
 (((((((...(((((((.[[..[[)))))))........((((((]]...]]))))))..))))))).
 >__'SOLUTION_STRUCTURE_OF_THE_P2B-P3_PSEUDOKNOT_FROM_HUMAN_TELOMERASE_RNA_'_(PDB_00857)
 GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA
- [[[[[[........(((((((((]]]]]]........))))))))).
\ No newline at end of file
+ [[[[[[........(((((((((]]]]]]........))))))))).
--- a/data/sec_structs/verified_tRNAs.dbn deleted 100644 → 0
View file @30b097c
+++ b/data/sec_structs/verified_tRNAs.dbn deleted 100644 → 0
View file @30b097c
- >>__'CRYSTAL_STRUCTURE_ANALYSIS_OF_A_TRNA-NEOMYCIN_COMPLEX_'_(PDB_00095)
- GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGACUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA
- (((((((..((((........))))(((((........)))))....(((((.......))))))))))))....
- >>__'YEAST_INITIATOR_TRNA_'_(PDB_00229)
- AGCGCCGUGGCGCAGUGGAAGCGCGCAGGGCUCAUAACCCUGAUGUCCUCGGAUCGAAACCGAGCGGCGCUACCA
- (((((((..((((.......))))(((((((.....)))))))....(((((.......))))))))))))....
- >>__'THE_CRYSTAL_STRUCTURE_OF_PHENYLALANYL-TRNA_SYNTHETASE_FROM_THERMUS_THERMOPHILUS_COMPLEXED_WITH_COGNATE_TRNAPHE_'_(PDB_00362)
- GCCGAGGUAGCUCAGUUGGUAGAGCAUGCGACUGAAAAUCGCAGUGUCCGCGGUUCGAUUCCGCGCCUCGGCACCA
- (((((((..((((........))))((((((.......))))))....(((((.......))))))))))))....
- >>__'CRYSTAL_STRUCTURE_OF_GLUTAMINYL-TRNA_SYNTHETASE_COMPLEXED_WITH_A_TRNA-GLN_MUTANT_AND_AN_ACTIVE-SITE_INHIBITOR_'_(PDB_00373)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAGCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..((((.......))))((((((((...))))))))..(((((.......))))))))))).....
- >>__'GLUTAMINYL-TRNA_SYNTHETASE_COMPLEXED_WITH_A_TRNA_MUTANT_AND_AN_ACTIVE_SITE_INHIBITOR_'_(PDB_00374)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAAGCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..((((.......))))((((((((...))))))))...(((((.......))))))))))).....
- >>__'CRYSTAL_STRUCTURE_OF_A_TIGHT-BINDING_GLUTAMINE_TRNA_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE_'_(PDB_00376)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..(((.........)))((((((((...))))))))...(((((.......))))))))))).....
- >>__'INSIGHTS_INTO_EDITING_FROM_AN_ILE-TRNA_SYNTHETASE_STRUCTURE_WITH_TRNA(ILE)_AND_MUPIROCIN_'_(PDB_00402)
- GGGCUUGUAGCUCAGGUGGUUAGAGCGCACCCCUGAUAAGGGUGAGGUCGGUGGUUCAAGUCCACUCAGGCCCAC
- (((((((..((((.........))))(((((((.....)))))))....(((((.......))))))))))))..
- >>__'STRUCTURAL_BASIS_OF_ANTICODON_LOOP_RECOGNITION_BY_GLUTAMINYL-TRNA_SYNTHETASE_'_(PDB_00425)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..(((.........)))((((((((...))))))))....(((((.......))))))))))).....
- >>__'STRUCTURAL_BASIS_FOR_TRANSFER_RNA_AMINOACEYLATION_BY_ESCHERICHIA_COLI_GLUTAMINYL-TRNA_SYNTHETASE_'_(PDB_00426)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..(((.........))).(((((((...))))))).....(((((.......))))))))))).....
- >>__'CRYSTAL_STRUCTURE_OF_ARCHAEAL_TYROSYL-TRNA_SYNTHETASE_COMPLEXED_WITH_TRNA(TYR)_AND_L-TYROSINE_'_(PDB_00474)
- CCGGCGGUAGUUCAGCCUGGUAGAACGGCGGACUGUAGAUCCGCAUGUCGCUGGUUCAAAUCCGGCCCGCCGGA
- (((((((..((((.........))))(((((((.....)))))))....(((((.......)))))))))))).
- >>__'CRYSTAL_STRUCTURE_OF_L-GLUTAMINE_AND_AMPCPP_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE_'_(PDB_00620)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..((((.......))))(((((((.....)))))))....(((((.......))))))))))).....
- >>__'CRYSTAL_STRUCTURE_OF_L-GLUTAMATE_AND_AMPCPP_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE_'_(PDB_00621)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..((((.......)))).(((((((...))))))).....(((((.......))))))))))).....
- >>__'CRYSTAL_STRUCTURE_OF_THE_TRNA_DOMAIN_OF_TRANSFER-MESSENGER_RNA_IN_COMPLEX_WITH_SMPB_'_(PDB_00637)
- GAUUCGACGGGGACUUCGGUCCUCGGACGCGGGUUCGAUUCCCGCUCGACGGGGACUUCGGUCCUCGGA
- ......((((((((....))))))))..(((((.......)))))...((((((((....)))))))).
- >>__'GLUTAMINYL-TRNA_SYNTHETASE_MUTANT_D235N_COMPLEXED_WITH_GLUTAMINE_TRANSFER_RNA_'_(PDB_00669)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..(((.........)))((((((((...))))))))....(((((.......))))))))))).....
- >>__'GLUTAMINYL-TRNA_SYNTHETASE_MUTANT_D235G_COMPLEXED_WITH_GLUTAMINE_TRANSFER_RNA_'_(PDB_00670)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..(((.........)))(((((((.....)))))))....(((((.......))))))))))).....
- >>__'GLUTAMINYL-TRNA_SYNTHETASE_MUTANT_I129T_COMPLEXED_WITH_GLUTAMINE_TRANSFER_RNA_'_(PDB_00671)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..(((.........)))((((((.......))))))....(((((.......))))))))))).....
- >>__'GLUTAMINYL-TRNA_SYNTHETASE_COMPLEXED_WITH_TRNA_AND_AN_AMINO_ACID_ANALOG_'_(PDB_00672)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..((((.......))))((((((((...))))))))....(((((.......))))))))))).....
- >>__'INSIGHTS_INTO_EDITING_FROM_AN_ILE-TRNA_SYNTHETASE_STRUCTURE_WITH_TRNA(ILE)_AND_MUPIROCIN_'_(PDB_00673)
- GGGCUUGUAGCUCAGGUGGUUAGAGCGCACCCCUGAUAAGGGUGAGGUCGGUGGUUCAAGUCCACUCAGGCCCAC
- (((((((..((((.........))))(((((((.....)))))))....(((((.......))))))))))))..
- >>__'INSIGHTS_INTO_EDITING_FROM_AN_ILE-TRNA_SYNTHETASE_STRUCTURE_WITH_TRNA(ILE)_AND_MUPIROCIN_'_(PDB_00674)
- GGGCUUGUAGCUCAGGUGGUUAGAGCGCACCCCUGAUAAGGGUGAGGUCGGUGGUUCAAGUCCACUCAGGCCCAC
- (((((((..((((.........))))(((((((.....)))))))....(((((.......))))))))))))..
- >>__'CRYSTAL_STRUCTURE_OF_CYSTEINYL-TRNA_SYNTHETASE_BINARY_COMPLEX_WITH_TRNACYS_'_(PDB_00742)
- GGCGCGUUAACAAAGCGGUUAUGUAGCGGAUUGCAAAUCCGUCUAGUCCGGUUCGACUCCGGAACGCGCCUCCA
- (((((((..(((.........))).((((((.....))))))....(((((.......))))))))))))....
- >>__'CRYSTAL_STRUCTURE_OF_TRNA_NUCLEOTIDYLTRANSFERASE_COMPLEXED_WITH_A_PRIMER_TRNA_AND_AN_INCOMING_ATP_ANALOG_'_(PDB_00767)
- GGCCAGGGGCGGUUCGAUUCCGCCCCUGGCCGGCCAGGGGCGGUUCGAUUCCGCCCCUGGCCACCAA
- (((((((.(((.........))).)))))))(((((((.(((.........))).))))))).....
- >>__'GLUTAMINYL-TRNA_SYNTHETASE_COMPLEXED_TO_GLUTAMINE_AND_2\'DEOXY_A76_GLUTAMINE_TRNA_'_(PDB_00901)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..((((.......)))).(((((((...))))))).....(((((.......))))))))))).....
- >>__'IF2,_IF1,_AND_TRNA_FITTED_TO_CRYO-EM_DATA_OF_E._COLI_70S_INITIATION_COMPLEX_'_(PDB_00903)
- GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA
- (((((((..((((........))))((((((.......))))))....(((((.......))))))))))))....
- >>__'STRUCTURE_OF_HUMAN_TRYPTOPHANYL-TRNA_SYNTHETASE_IN_COMPLEX_WITH_TRNA(TRP)_'_(PDB_00926)
- GACCUCGUGGCGCAAUGGUAGCGCGUCUGACUCCAGAUCAGAAGGUUGCGUGUUCGAAUCACGUCGGGGUCA
- ((.((((..(((((.....)))))...((((.....)))).......(((((.......))))))))).)).
- >>__'CRYSTAL_STRUCTURE_OF_STAPHYLOCOCCUS_AUREUS_TRNA_ADENOSINE_DEAMINASE,_TADA,_IN_COMPLEX_WITH_RNA_'_(PDB_00943)
- UUGACUACGGAUCAAUUGACUACGGAUCAAGACUACGGUUUGACUACGGAUCAA
- (((((.....)))))(((((.....))))).........(((((.....)))))
- >>__'COCRYSTAL_STRUCTURE_OF_AN_RNA_SULFURATION_ENZYME_MNMA_AND_TRNA-GLU_IN_THE_PRE-REACTION_STATE_'_(PDB_00999)
- GUCCCCUUCGUCUAGAGGCCCAGGACACCGCCCUUUCACGGCGGUAACAGGGGUUCGAAUCCCCUAGGGG
- ..((.....(((((.......)))))(((((((.....)))))))....((((.......))))...)).
- >>__'STRUCTURE_OF_HUMAN_TRYPTOPHANYL-TRNA_SYNTHETASE_IN_COMPLEX_WITH_TRNA(TRP)_'_(PDB_01002)
- GACCUCGUGGCGCAAUGGUAGCGCGUCUGACUCCAGAUCAGAAGGUUGCGUGUUCGAAUCACGUCGGGGUCACCA
- (((((((....(((.....)))...((((((.....)))))).....(((((.......))))))))))))....
- >>__'CRYSTAL_STRUCTURE_OF_ARCHAEOGLOBUS_FULGIDUS_O-PHOSPHOSERYL-_TRNA_SYNTHETASE_COMPLEXED_WITH_TRNACYS_AND_O-PHOSPHOSERINE_'_(PDB_01009)
- GCCAGGGUGGCAGAGGGGCUUUGCGGCGGACUGCAGAUCCGCUUUACCCCGGUUCGAAUCCGGGCCCUGGC
- ((((((([[(((]].......)))..((((.......))))......(((((.......))))))))))))
- >>__'CRYSTAL_STRUCTURE_OF_ARCHAEOGLOBUS_FULGIDUS_O-PHOSPHOSERYL-_TRNA_SYNTHETASE_COMPLEXED_WITH_TRNACYS_'_(PDB_01010)
- GCCAGGGUGGCAGAGGGGCUUUGCGGCGGACUGCAGAUCCGCUUUACCCCGGUUCGAAUCCGGGCCCUGGC
- ..((((([[(((]].......)))..(((((.....)))))......(((((.......))))))))))..
- >>__'CRYSTAL_STRUCTURE_OF_ARCHAEOGLOBUS_FULGIDUS_O-PHOSPHOSERYL-_TRNA_SYNTHETASE_E418N/E420N_MUTANT_COMPLEXED_WITH_TRNAOPAL_AND_O-PHOSPHOSERINE_("OPAL_COMPLEX")_'_(PDB_01011)
- GCCAGGGUGGCAGAGGGGCUUUGCGGCGGACUUCAGAUCCGCUUUACCCCGGUUCGAAUCCGGGCCCUGGC
- (((((((..(((.........))).(((((.......))))).....(((((.......))))))))))))
- >>__'CRYSTAL_STRUCTURE_OF_ARCHAEOGLOBUS_FULGIDUS_O-PHOSPHOSERYL-_TRNA_SYNTHETASE_E418N/E420N_MUTANT_COMPLEXED_WITH_TRNAAMBER_AND_O-PHOSPHOSERINE_("AMBER_COMPLEX")_'_(PDB_01012)
- GCCAGGGUGGCAGAGGGGCUUUGCGGCGGACUCUAGAUCCGCUUUACCCCGGUUCGAAUCCGGGCCCUGGC
- (((((((..(((.........))).(((((.......))))).....(((((.......))))))))))))
- >>__'CRYSTAL_STRUCTURE_OF_RNASE_Z/TRNA(THR)_COMPLEX_'_(PDB_01053)
- GCUUCCAUAGCUCAGCAGGUAGAGCGUCAGCGGUUCGAGCCCGCUUGGAAGCU
- (((((((..((((........))))...(((((.......)))))))))))).
- >>__'PHENYLALANYL-TRNA_SYNTHETASE_FROM_THERMUS_THERMOPHILUS_COMPLEXED_WITH_TRNA_AND_A_PHENYLALANYL-ADENYLATE_ANALOG_'_(PDB_01134)
- GCCGAGGUAGCUCAGUUGGUAGAGCAUGCGACUGAAAAUCGCAGUGUCCGCGGUUCGAUUCCGCGCCUCGGCACCA
- (((((((....((........))...((((((.....)))))).....((((.........)))))))))))....
- >>__'GLUTAMINYL-TRNA_SYNTHETASE_MUTANT_C229R_WITH_BOUND_ANALOG_5\'-O-[N-(L-GLUTAMINYL)-SULFAMOYL]ADENOSINE_'_(PDB_01258)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..((((.......)))).(((((((...))))))).....(((((.......))))))))))).....
- >>__'GLUTAMINYL-TRNA_SYNTHETASE_MUTANT_C229R_WITH_BOUND_ANALOG_5\'-O-[N-(L-GLUTAMYL)-SULFAMOYL]ADENOSINE_'_(PDB_01259)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
- ((((((..((((.......))))((((((((...))))))))....(((((.......))))))))))).....
--- a/doc/fig/Benchmark_avg.jpg
View file @73734e9
+++ b/doc/fig/Benchmark_avg.jpg
View file @73734e9
--- a/doc/fig/pseudobase.jpg deleted 100644 → 0
View file @30b097c
+++ b/doc/fig/pseudobase.jpg deleted 100644 → 0
View file @30b097c
--- a/doc/fig/pseudobase.png deleted 100644 → 0
View file @30b097c
+++ b/doc/fig/pseudobase.png deleted 100644 → 0
View file @30b097c
--- a/doc/fig/pseudobase_avg.jpg
View file @73734e9
+++ b/doc/fig/pseudobase_avg.jpg
View file @73734e9
--- a/doc/fig/pseudobase_zoom.jpg 0 → 100644
View file @73734e9
+++ b/doc/fig/pseudobase_zoom.jpg 0 → 100644
View file @73734e9
--- a/doc/main_bioinformatics.tex
View file @73734e9
+++ b/doc/main_bioinformatics.tex
View file @73734e9
@@ -23,9 +23,9 @@
 
 \abstract{\textbf{Motivation:} RNA loops have now been modelled and clustered from solved 3D structures into ordered collections of recurrent non-canonical interactions called "RNA modules", available in databases. This work explores what information from such modules can be used to improve secondary structure prediction.\\
 \textbf{Results:} We propose a Pareto-based method for predicting RNA secondary structures by minimizing a bi-objective both energy-based and knowledge-based potential. The tool, called \textsc{Biorseo}, outputs the secondary structures from the Pareto set. We use it to compare several approaches to predict secondary structures using inserted RNA modules information: two different module data sources, Rna3Dmotifs and The RNA 3D Motif Atlas, and different ways to score the module insertions are compared: module size, module complexity, or module probability according to models like JAR3D and BayesPairing. We benchmark them against 344 known secondary structures. Some of the tested methods present a good performance, especially on structures containing pseudoknots. They are compared to state of the art tools for secondary structure prediction.\\
- \textbf{Availability:} The software is freely provided for Linux on \href{https://github.com/persalteas/biorseo/}{GitHub (https://github.com/persalteas/biorseo/)}, with the datasets. \\
+ \textbf{Availability:} The software is freely provided for Linux download on \href{https://evryrna.ibisc.univ-evry.fr/evryrna/biorseo/}{EvryRNA}, with the datasets. \\
 \textbf{Contact:} \href{louis.becquey@univ-evry.fr}{louis.becquey@univ-evry.fr}\\
- \textbf{Supplementary information:} Supplementary data are available at \textit{Bioinformatics}
+ \textbf{Supplementary information:} Appendices A,B and C are available at \textit{Bioinformatics}
 online.}
 
 \maketitle
@@ -59,7 +59,8 @@ One hypothesis about RNA-MoIP's lack of performance is that it cannot distinguis
 
 To test this hypothesis, we design a method which builds a 2D structure by simultaneously placing base-pairs and modules in a single step, taking into account two objectives: the expected accuracy of the structure in the equilibrium ensemble fold, and a custom function that reflects the number and quality of inserted modules (several models are studied). This method leads to our new tool Biorseo (Bi-Objective RNA Structure Efficient Optimizer). Our approach avoids using a weighted linear combination of the objectives as done in RNA-MoIP (which can miss interesting solutions so-called \textit{non-supported} solutions). In this paper, we use a bi-objective Pareto-based approach, i.e. we identify all the non-dominated structures (the structures for which no other structure scores better on the two objectives).
 
- In the next section, this article presents the module models sources, insertion models and objective functions, and the procedure to compare them. Then we present a benchmark of all those variants against reference tools in Section \ref{sec:results}, using a reference dataset (verified structures from the RNA-Strand database). Three well-known reference RNAs are predicted and used to discuss the differences between the methods: \textit{E. coli}'s Gln tRNA, the Guanine riboswitch, and the human telomerase's pseudoknot. Finally, we recommend two prediction methods, and conclude.
+ In the next section, this article presents the module models sources, insertion models and objective functions, and the procedure to compare them. Then we present a benchmark of all those variants against reference tools in Section \ref{sec:results}, using a reference dataset (verified structures from the RNA-Strand database). Three well-known reference RNAs are predicted and used to discuss the differences between the methods. %: \textit{E. coli}'s Gln tRNA, the Guanine riboswitch, and the human telomerase's pseudoknot. 
+ Finally, we recommend two prediction methods, and conclude.
 
 \begin{figure*}[t]
    \includegraphics[width=\textwidth]{fig/graph_abstract.jpg} 
@@ -79,7 +80,7 @@ Our main procedure is the following:
  \item \textbf{Optimization step:} Find a secondary structure that satisfies as much as possible both the expected accuracy of the structure and a criterion taking into account module inclusions, by solving a bi-objective integer linear programming problem, using the previous constraints defined in the previous step.
 \end{itemize}
   
- The linear integer programming framework used to define the constraints and solve the resulting optimization problem is similar to previous works like IPknot, Biokop or RNA-MoIP~(\citealp{sato_ipknot:_2011,legendre_bi-objective_2018,reinharz_towards_2012}), but involves new constraints detailed in supplementary material. 
+ The linear integer programming framework used to define the constraints and solve the resulting optimization problem is similar to previous works like IPknot, Biokop or RNA-MoIP~(\citealp{sato_ipknot:_2011,legendre_bi-objective_2018,reinharz_towards_2012}), but involves new constraints detailed in appendix A. 
 Figure \ref{fig:pipeline} summarizes the procedure on a graphical pipeline.
    
 \subsection{Pattern matching step}\label{sec:models}
@@ -93,8 +94,8 @@ Several methods have been proposed to tackle the issue of finding if a sequence 
 
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
- \subsection{Constraints definition step and integer programming model} \label{sec:ip}
- The full list of variables we used to model the problem in an integer linear program and the linear formulation of each constraint are detailed in supplementary material. Here we propose different objective functions to maximize, whose performances are compared in section \ref{sec:results}.
+ \subsection{Constraints definition step and IP model} \label{sec:ip}
+ The full list of variables we used to model the problem in an integer linear program and the linear formulation of each constraint are detailed in Appendix A. Here we propose different objective functions to maximize, whose performances are compared in section \ref{sec:results}.
 
 \paragraph{Notations} ~ We call a \textit{component} a piece of strand which forms an unpaired portion of a module. Components of a module are linked together by canonical base-pairs at their extremities to form a loop. Let $x$ be a module which could be inserted at some defined position in the sequence. Let $\|x\|$ bet the number of components of this module, and $k_{x,i}$ the nucleotide count of the $i$th component of $x$. When a scoring model is used (JAR3D or BayesPairing), we denote $p(x)$ the score value of $x$ inserted at the defined position. Let $p_{uv}$ be the probability for nucleotides $u$ and $v$ (with $v>u+3$) to form a canonical base-pair. We use NUPACK's dynamic programming scheme~(\citealp{dirksAlgorithmComputingNucleic2004}), which supports pseudoknots, to compute such probabilities. We denote $y^u_v$ the binary decision variable indicating that these nucleotides do form a canonical base pair, and $C^x_1$ the decision binary variable indicating whether the module $x$ will be inserted or not. The resolution of the linear program outputs solutions by fixing definitive values for the different $y^u_v$ and $C^x_1$. 
 
@@ -105,7 +106,7 @@ Let $X$ be the set of all our decision variables, then the different objective f
 \begin{equation} f_{1C}(X) = \sum_{x} p(x) \times C^x_1 \label{eq:C}\end{equation}
 \begin{equation}f_{1D}(X) = \sum_{x} \left[ \frac{\|x\|}{\log_2(\sum_{i=1}^{\|x\|}k_{x,i})} \times p(x) \times C^x_1 \right]\label{eq:D}\end{equation}
 
- Regarding the second objective, aimed at maximizing the expected accuracy of the structures, we use $f_2(X) = \sum y^u_v \times p_{uv} \times I[p_{uv}>\theta]$. As first proposed by~(\citealp{sato_ipknot:_2011}), $f_2$ uses a parameter $\theta$ to ignore very unlikely base-pairs. This prevents the explosion of the number of variables and allows a fast resolution of the IP problem.
+ Regarding the second objective, aimed at maximizing the expected accuracy of the structures, we use $f_2(X) = \sum y^u_v \times p_{uv} \times I[p_{uv}>\theta]$. As first proposed by~(\citealp{sato_ipknot:_2011}), $f_2$ uses a parameter $\theta = 0.001$ to ignore very unlikely base-pairs. This prevents the explosion of the number of variables and allows a fast resolution of the IP problem.
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
@@ -159,67 +160,53 @@ $s$:= maximize($f_1$, $\lambda_{min}$, $\lambda_{max}$, F)\;
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Results}\label{sec:results}
- 
 \begin{figure*}[!tbp]
    \includegraphics[width=\textwidth]{fig/Benchmark.jpg}
    \caption{Boxplots of the best MCC over the proposed solutions for each of the RNAs, for all method variants. Line (A) shows the methods that cannot find pseudoknots: RNAsubopt, RNA-MoIP, and the 14 variants of Biorseo's bi-objective methods with a constraint that explicitly forbids pseudoknots. Line (B) shows methods which allow their prediction: Biokop, and the 14 variants without the no-pseudoknot constraint. The left block gathers methods which use module data from Rna3Dmotifs (\citealp{djelloul_automated_2008}). The right one gathers those which use modules from the RNA 3D Motif Atlas (\citealp{petrov_automated_2013}). Boxplots surrounded by a dotted red frame use direct pattern matching to detect insertion sites, but do not score the sites. Those surrounded by a continuous blue frame score the sites with JAR3D (\citealp{zirbel_identifying_2015}) to score modules on loop sequences found by RNAsubopt. The remaining surrounded by a dashed green frame use the BayesPairing score~(\citealp{sarrazin2019automated}).}
    \label{fig:upgrades}
 \end{figure*}
- 
 All the methods introduced return an ensemble of possible secondary structures for a given input sequence. We compare them in a benchmark over short RNA structures and a few well-known application cases. 
- 
 \subsection{Benchmark protocol} \label{sec:bench}
- We repeated the benchmark twice: first, by forbidding explicitly the formation of pseudoknots with additional constraints (for fair comparison with RNA-MoIP). Then, a second one without such limitation, to reach maximum performance.
 
 \paragraph{Materials}
- The tool has been conceived to be used by a regular scientist on a desktop computer. All computations were run on a workstation with AMD Ryzen 2700X (16 threads @4.3GHz) CPU, 16 GB of RAM. A prediction typically takes a few seconds, sometimes minutes. The time required grows with both the nucleotide count and the number of loops. The objective functions $f_{1A}$ and $f_{1B}$ were sometimes not discriminative enough and equally ranked a large number of module propositions, leading to combinatorial issues. For that reason, we arbitrarily stopped the jobs exceeding 500 structures in the Pareto set, because they would require over 2 hours of computation time to complete. 
+ The tool can be used by a regular scientist on a desktop computer. All computations were performed on a workstation (16 threads @4.3GHz) CPU, 16 GB of RAM. The RAM typically limits the size of the RNAs the methods can process. RNAs up to 230 bases are fine in our case. A prediction typically takes a few seconds, sometimes minutes. The time required grows with both the nucleotide count and the number of loops. The objective functions $f_{1A}$ and $f_{1B}$ were sometimes not discriminative enough and equally ranked a large number of module propositions, leading to combinatorial issues. For that reason, we arbitrarily stopped the jobs exceeding 500 structures in the Pareto set, because they would require over 2 hours of computation time to complete. 
 
 \paragraph{Data sources} \label{sec:data}
- A set of RNA secondary structures was extracted from the RNA-Strand database ~(\citealp{andronescu2008rna}). We selected the RNAs for which experimental proof of the structure exists, with size varying between 10 and 100 nucleotides. Sequences containing consensus letters, for example R for a purine (A or G), or modified nucleotides (P, T, I in our case) were discarded. %In addition, we add a collection of 264 pseudoknotted RNAs taken from the Pseudobase database~(\citealp{van2000pseudobase}), covering all pseudoknot families, of length between 10 and 100 nucleotides. 
- The final dataset contains 334 secondary structures of various RNA families, 74 of them containing pseudoknots. Due to the combinatorial issues described above, only 291 (resp. 294) of them have been predicted by all the proposed methods (the missing ones often being a combination of direct pattern-matching or JAR3D with $f_{1A}$ or $f_{1B}$) when forbidding (resp. allowing) pseudoknots. We will use these set of results for comparison.
+ A first dataset of RNA secondary structures was extracted from the RNA-Strand database ~(\citealp{andronescu2008rna}). We selected the RNAs for which experimental proof of the structure exists, with size varying between 10 and 100 nucleotides. Sequences containing modified nucleotides (P, T, I in our case) were discarded. The resulting set contains 334 secondary structures of various RNA families, 74 of them containing pseudoknots. We repeated the experiments twice: first, by forbidding explicitly the formation of pseudoknots with additional constraints (for fair comparison with RNA-MoIP). Then, a second one without such limitation, to reach maximum performance. Due to the combinatorial issues described above, only 291 (resp. 294) of the RNAs have been predicted by all the proposed methods (the missing ones often being a combination of direct pattern-matching or JAR3D with $f_{1A}$ or $f_{1B}$) when forbidding (resp. allowing) pseudoknots. We will use these set of results for comparison. \\
+ In addition, we add a second collection of 264 pseudoknotted-only RNAs from the Pseudobase database~(\citealp{van2000pseudobase}), covering all pseudoknot families, of length also comprised between 10 and 100 nucleotides.\\
+ To complete the large benchmark, we have a deeper look at very-well known structures to check if relevant combinations of models are still able to predict them correctly.
+ We used a Gln tRNA from E. coli (RNA-Strand code PDB\_00376), a Guanine riboswitch (RNA-Strand code PDB\_01023), and the pseudoknot of the human telomerase (PDB\_00857). The tRNA is unpseudoknotted, the G riboswitch contains a hard-to-predict HHH type pseudoknot, and the telomerase pseudoknot is a simple H type pseudoknot.
 
- In addition to the large benchmark, we have a deeper look at very-well known structures to check if relevant combinations of models are still able to predict them correctly.
- We used a Gln tRNA from E. coli (RNA-Strand code PDB\_00376), a Guanine riboswitch (RNA-Strand code PDB\_01023), and the pseudoknot of the human telomerase (PDB\_00857). The tRNA contains a tiny, improbable pseudoknot. The G riboswitch is unpseudoknotted, and the telomerase pseudoknot is a simple H type pseudoknot.
- 
-  
 \paragraph{Reference comparison methods}
 To study the usefulness of the data sources, objective functions, and module placement methods, we added state-of-the art tools to the comparison. The same RNA sequences were submitted to RNA-MoIP for direct performance comparison. We used RNAsubopt as a reference method without pseudoknot support, because it is fast, widely used, easy to understand and returns several solutions. We used Biokop, the bi-objective integer programming framework, as a reference method for prediction of secondary structures with pseudoknots. Both tools over-perform other state-of-the-art tools in their respective categories, see the appropriate papers (\citealp{lorenz2011viennarna, legendre_bi-objective_2018}) for more benchmarks against other tools.
 
- \paragraph{Metrics} ~ We compute the Matthews correlation coefficient (MCC) between the real secondary structure and every proposition. The coefficient is defined as
+ \paragraph{Metrics} ~ We compute the Matthews correlation coefficient (MCC) between the real secondary structure and every proposed structure. The coefficient is defined as
 \begin{equation}
    MCC = \frac{TP. TN - FP. FN}{\sqrt{(TP+FP)(TP+FN)(TN+FP)(TN+FN)}}. \label{eq:MCC}
 \end{equation}
- Then, we keep the best MCC value found as a metric of the method's performance. This reflects if the true structure is included or not in the Pareto set. Averaging over the structures would be a nonsense, because some RNAs may exist in several meta-stable states which are very different in their list of base-pairs. Here, we measure if one of the states, which is reported in RNA-Strand as "true" structure, has been found. The choice of MCC over accuracy or F1 score is justified by the very large difference between the size of the classes: there exist much more negative base-pairs (pairs of nucleotides that do not interact) than positive ones in any secondary structure.
- 
- \begin{table*}[!t]
- \processtable{MCC results for study cases. Pseudoknots are allowed. \label{Tab:01}} {
- \begin{tabular}{@{}rlllllll@{}}\toprule & RNAsubopt & RNA-MoIP & BiokoP & Rna3Dmotifs & Rna3Dmotifs & RNA 3D Motif Atlas & RNA 3D Motif Atlas\\
-  & & & & + Direct P.M. & + BayesPairing & + JAR3D & + BayesPairing \\\midrule
- PDB\_00376 & 0.68 & 0.68 & 0.67 & 0.72 (A,B) & 0.74 (B,C,D), 0.71 (A) & 0.74 (A,C,D), 0.72 (B) & 0.76 (\textit{all})\\
- PDB\_01023 & 0.86 & 0.86 & 0.59 & 0.79 (A,B) & 0.29 (\textit{all}) & 0.82 (\textit{all}) & 0.82 (\textit{all})\\
- PDB\_00857 & 0.77 & 0.77 & 1.0 & 0.97 (B), 0.77 (A) & 0.97 (\textit{all}) & 0.97 (\textit{all}), & 0.97 (\textit{all})\\\botrule
- \end{tabular}}{The first line gives the results for E.\textit{coli} tRNA Gln  (PDB\_00376), the second line for the glycine riboswitch (PDB\_01023), and the third for the human telomerase's pseudoknot (PDB\_00857). We observe that the best structure is often the same when we use the different objective functions $f_{1A}, f_{1B}, f_{1C}, f_{1D}$, but the rest of the set can be different.}
- \end{table*}
- 
+ Then, we keep the best MCC value found over the set of proposed structures as a metric of the method's performance. This reflects if the true structure is included or not in the Pareto set. Here, we measure if one of the states, which is reported in RNA-Strand as "true" structure, has been found. For comprehensiveness, results with average MCC are also provided in Appendix B, but it is hard to interpret what this average MCC represents. The choice of MCC over accuracy or F1 score is justified by the very large difference between the size of the classes: there exist much more negative base-pairs (pairs of nucleotides that do not interact) than positive ones in any secondary structure.
 
 \subsection{Benchmark results}
 Performance results under the form of best MCC are summarized in Figure \ref{fig:upgrades}.
 Majority of the RNAs were predicted with similar performance among the methods, including methods that do not use module information. 
 
- No data source, nor objective function taken alone performs significantly better than the other ones. No one distinguishes itself alone to improve the performance. 
+ No data source, nor objective function taken alone performs significantly better than the other ones. 
 
- \paragraph{Methods without pseudoknots, comparison to RNAsubopt} ~ No method reaches RNAsubopt's scores. The most performing model is the use of The RNA 3D Motif Atlas modules, placed with JAR3D, and scored only with the JAR3D score ($f_{1C}$). We also notice that the 4 models (including this one) which use JAR3D return very small sets of results, most of them being one optimal solution, while RNAsubopt returns from one to ten solutions (with our dataset and default settings).
+ \paragraph{Methods without pseudoknots, comparison to RNAsubopt} ~ No method reaches RNAsubopt's scores. The most performing model is the use of the RNA 3D Motif Atlas modules, placed with JAR3D, and scored only with the JAR3D score ($f_{1C}$). We also notice that the 4 models (including this one) which use JAR3D return very small sets of results, most of them being one optimal solution, while RNAsubopt returns from one to ten solutions (with our dataset and default settings).
 
- \paragraph{With pseudoknots, comparison to Biokop} ~ The number of solutions returned doubles for every method compared to its no-pseudoknot version. Most of the RNAs are predicted with small knots when the method allows it. As the bottom line of Figure \ref{fig:upgrades} shows, the methods which use BayesPairing do not find as many right structures than  Biokop, which performs better even without module information. On the other hand, the methods which use direct pattern-matching (like RNA-MoIP did) and the four ones which use RNAsubopt+JAR3D reach higher performance. The largest improvements concern Biorseo + Rna3Dmotifs + $f_{1B}$ and Biorseo + The RNA 3D Motif Atlas + JAR3D + $f_{1B}$. We apply Wilcoxon signed rank tests (non-parametric test for paired samples) to assert these methods distributions of results differ from Biokop (null hypothesis: "The position parameter of the distribution of the differences between the two paired samples is null"): p-values are $1.5\times 10^{-2}$ for the first method using Rna3Dmotifs, and $2.5\times 10^{-3}$ for the second with JAR3D. Other methods have more statistically significant differences with BiokoP, but smaller improvements in average, so we do not provide all the less interesting statistical tests results.
+ \paragraph{With pseudoknots, comparison to Biokop} ~ The number of solutions returned doubles for every method compared to its no-pseudoknot version. Most of the RNAs are predicted with small knots as the method allows it. As the bottom line of Figure \ref{fig:upgrades} shows, the methods which use BayesPairing do not find as many right structures than  Biokop, which performs better even without module information. On the other hand, the methods which use direct pattern-matching (like RNA-MoIP did) and the four ones which use RNAsubopt+JAR3D reach higher performance. The largest improvements concern Biorseo + Rna3Dmotifs + $f_{1B}$ and Biorseo + The RNA 3D Motif Atlas + JAR3D + $f_{1B}$. We apply Wilcoxon signed rank tests (non-parametric test for paired samples) to assert these methods distributions of results differ from Biokop (null hypothesis: "The position parameter of the distribution of the differences between the two paired samples is null"): p-values are $1.5\times 10^{-2}$ for the first method using Rna3Dmotifs, and $2.5\times 10^{-3}$ for the second with JAR3D. Other methods have smaller improvements in average, so we do not provide all the statistical tests results.
 
- \subsection{Results of the study cases}
- The results about very well described structures (Table~\ref{Tab:01}) are consistent with the general benchmark. Biorseo used with Rna3Dmotifs and direct pattern matching predicts 2D structures as well as RNAsubopt. So does it when used with the RNA 3D Motif Atlas and JAR3D.
- BayesPairing has a low performance when used with Rna3Dmotifs and a surprising good performance with the Motif Atlas, which cannot be generalized given the larger benchmark data.
+ \begin{figure}[t]
+ \centerline{\includegraphics[width=\linewidth]{fig/pseudobase_zoom.jpg}} 
+ \caption{Boxplots of the best MCC found by the different predictors, on the dataset of 264 pseudoknotted-only RNAs from Pseudobase~(\citealp{van2000pseudobase}. Some computations are incomplete; Biokop succeeded for 201 RNAs, JAR3D+$f_{1A}$ for 249, and JAR3D+$f_{1B}$ for 248. \label{fig:pseudobase}}
+ \end{figure}
+ The results on the dataset of pseudoknotted-only structures are presented on Figure~\ref{fig:pseudobase}. Whatever the Biorseo variant, the median best MCC is over 0.8, which is a significant improvement compared to Biokop (0.73, with larger variance). However, the variants are very similar and again, no module source, pattern-matching method nor objective function distinguishes itself.
 
- The tRNA is an example of structure which is approximately correctly predicted by Biorseo, as well as other tools.
- The G riboswitch allows to show that Biorseo is less likely to insert false positive pseudo-knots than Biokop (which inserted one, resulting in a low score, structures not shown). 
+ \subsection{Results of the study cases}
+ The results about very well described structures are consistent with the general benchmark. Biorseo used with Rna3Dmotifs and direct pattern matching predicts 2D structures as well as RNAsubopt. So does it when used with the RNA 3D Motif Atlas and JAR3D. BayesPairing has a low performance when used with Rna3Dmotifs and a surprising good performance with the Motif Atlas, which cannot be generalized given the larger benchmark data.
+ The tRNA is an example of structure which is approximately correctly predicted by Biorseo, as well as other tools. The G riboswitch allows to show that Biorseo is less likely to insert false positive pseudo-knots than Biokop (which inserted several, resulting in a low score). 
 The telomerase pseudoknot is correctly predicted by all methods that support pseudoknots, including Biorseo. 
- 
+ Detailed results including structures, number of solutions and computation times are provided in appendix C. 
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -227,15 +214,15 @@ The telomerase pseudoknot is correctly predicted by all methods that support pse
 
 \paragraph{Without pseudoknots, comparison to RNA-MoIP} ~  An interesting point is the improvement between RNA-MoIP and the bi-objective method which uses direct-pattern matching to spot insertion sites and $f_{1A}$ to score the insertions. This method only differs from RNA-MoIP because it is bi-objective. The improvement is small, but statistically significant with a Wilcoxon's signed rank test p-value of 0.024. Then, we can conclude that the Pareto approach really improves the structure prediction by itself. This result supports our hypothesis about RNA-MoIP breaking important basepairs. Unfortunately, this improvement in average is counterbalanced by an increase in variance.
 
- \paragraph{Regarding pseudoknots} ~ The support of pseudoknots allows a small increase in performance, because we return more solutions, some with pseudoknots, and some without. As we are looking at the maximum MCC here, the appropriate solution has been selected for each RNA. 
- Biorseo's reduced number of false positive pseudoknots compared to Biokop can be explained directly by the insertion of modules in loops: as we forbid explicitly any base-pair on nucleotides inside an inserted module's component, we prevent known loops to form pseudoknots. The drawback of this selectivity is the low ability to predict kissing hairpins (HHH type pseudoknots), precisely because they require loops to interact.
- However, pseudoknot prediction quality is difficult to assess with a metric like MCC, because a pseudoknot could be involved in only two or three base-pairs. Finding them or not does not alter much the MCC even if the structure is much more right or wrong from a biological point of view. Unfortunately, no automated assertion method exist yet.  A more accurate description of pseudoknot prediction performance would have required manual validation of every occurrence in every structure, which is too much work to achieve on such large datasets.
+ \paragraph{Regarding pseudoknots} ~ The support of pseudoknots allows an increase in performance, because we return more solutions, some with pseudoknots, and some without. As we are looking at the maximum MCC here, the appropriate solution has been selected for each RNA. 
+ Biorseo's reduced number of false positive pseudoknots compared to Biokop can be explained directly by the insertion of modules in loops: as we forbid explicitly any base-pair on nucleotides inside an inserted module's component, we prevent known loops to form pseudoknots. The drawback of this selectivity is the low ability to predict kissing hairpins (HHH type pseudoknots), precisely because they require loops to interact, which happened with our G riboswitch prediction. But for general purpose, given the performance difference between Figures \ref{fig:upgrades} and \ref{fig:pseudobase}, it seems that Biorseo is better on pseudoknotted RNAs.
+ However, pseudoknot prediction quality is difficult to assess with a metric like MCC, because a pseudoknot could be involved in only two or three base-pairs. Finding them or not does not alter much the MCC even if the structure is much more right or wrong from a biological point of view. Unfortunately, no automated verification method exist yet. A more accurate description of pseudoknot prediction performance would have required manual validation of every occurrence in every structure, which is too much work to achieve on such large datasets.
 
 \paragraph{On the objective functions} ~ 
- Regarding objective functions to include modules, the different criteria proposed seem to give comparable results at first sight regarding the average performance and the dispersion. However, an important difference between $f_{1A}$, $f_{1B}$ on one side, and $f_{1C}$, $f_{1D}$ on the other side, is about the computation time. As $f_{1A}$, $f_{1B}$ do not use a score to rank potential module insertion sites, every modules of the same size can be equally inserted. When the RNA presents several loops, the combinatorial possibilities grow fast with the number of modules in the dataset. Therefore, the number of undominated solutions can reach several hundreds or thousands even for short sequences. As explained in section \ref{sec:bench}, some of the computations never ended because of combinatorial issues with those objectives. Such large Pareto sets are not informative for our application, because they consist in very redundant secondary structures with different module references, which are counted only for one secondary structure solution at the end. On the other hand, $f_{1C}$ and $f_{1D}$ require the run of an additional tool (JAR3D or BayesPairing) to score the insertion sites. Given an RNA length, a compromise must be found .
+ Regarding objective functions to include modules, the different criteria proposed seem to give comparable results at first sight regarding the average performance and the dispersion. However, an important difference between $f_{1A}$, $f_{1B}$ on one side, and $f_{1C}$, $f_{1D}$ on the other side, is about the computation time. As $f_{1A}$, $f_{1B}$ do not use a score to rank potential module insertion sites, every modules of the same size can be equally inserted. When the RNA presents several loops, the combinatorial possibilities grow fast with the number of modules in the dataset. Therefore, the number of undominated solutions can reach several hundreds or thousands even for short sequences. As evocated in section \ref{sec:bench}, some of the computations never ended because of combinatorial issues with those objectives. Such large Pareto sets are not informative for our application, because they consist in very redundant secondary structures with different module references, which are counted only for one secondary structure solution at the end. On the other hand, $f_{1C}$ and $f_{1D}$ require the run of an additional tool (JAR3D or BayesPairing) to score the insertion sites. Given an RNA, a compromise must be found according to its length and amount of loops.
 
 \paragraph{The bias with JAR3D} ~ One should keep in mind that JAR3D takes as input the sequences of RNA loops to score modules against them. We detected the loops in the RNA sequence with RNAsubopt. This use of JAR3D is biased, because we score modules on sequence portions that we already know unlikely to form stems and likely to form loops. Therefore, the information brought by the insertion of a module is low. 
- Then, the enthusiasm about the bi-objective method with JAR3D and $f_{1C}$ (without pseudoknots) has to be moderate. It actually outputs almost the same secondary structures than RNAsubopt, discarding certain ones sometimes. As that method was the only interesting result without pseudoknots, we can argue that including known modules is not a general way to improve secondary structure prediction without pseudoknots. For every method, the average best MCC is below RNAsubopt, and the performance gain obtained on some structures is counterbalanced by the loss on approximately the same number of RNAs or more.
+ Then, the enthusiasm about the bi-objective method with JAR3D and $f_{1C}$ (without pseudoknots) has to be moderate. It actually outputs almost the same secondary structures than RNAsubopt, discarding certain ones sometimes. As that method was the only interesting result without pseudoknots, we can argue that including known modules is not a general way to improve secondary structure prediction without pseudoknots. For every method, the median best MCC is below RNAsubopt, and the performance gain obtained on some structures is counterbalanced by the loss on approximately the same number of RNAs or more.
 
 We also observe that using The RNA 3D Motif Atlas with JAR3D has a significantly different behavior than the other methods: first, it returns a very small number of solutions (1 or 2 most of the time). Then, the best structure is almost every-time the one that has the higher number of modules, while it is not the case for the other methods. This is a good point for method JAR3D-$f_{1C}$ which performs almost as well as RNAsubopt by returning only one or two structures. An explanation is that JAR3D is selective of a few module insertion sites, sites that were first perfectly predicted to be loops by RNAsubopt (as discussed earlier). This confirms the use of module information is not always relevant and that the energy criteria brings almost all the information. The modules only sometimes allow to reduce the number of solutions.
 
@@ -253,11 +240,13 @@ For that reason, we recommend three models : if the user does not expect pseudok
 We developed a general bi-objective method to benchmark different sources of RNA module models (the RNA 3D Motif Atlas and Rna3Dmotifs), different methods to place them in sequences (direct pattern matching, BayesPairing, and JAR3D), and different scoring functions. The bi-objective method uses the expected accuracy of the structure, and the previous scoring functions to select relevant secondary structures.
    
 The results show that no data source prevails. They also show that the use of module information is irrelevant to predict structures without pseudoknots. 
- The real interest would be when looking for potential pseudoknots, where several of our methods can improve the prediction performance (and computation times) compared to state-of-the-art tools.
+ The real interest would be when looking for potential pseudoknots, where several of our methods improve the prediction performance (and computation times) compared to state-of-the-art tools.
+ 
+ Some of our models over-perform RNA-MoIP, a previous attempt to predict better secondary structures using module information from Rna3Dmotifs and a linear combination of two objectives into a scoring function. Our simplest best-performing new method could be interpreted as an upgraded RNA-MoIP with a real bi-objective framework and a better module insertion objective, which predicts the base pairs and the module insertions in a row, preventing the insertion to break important base-pairs.
 
- Some of our models over-perform RNA-MoIP, a previous attempt to predict better secondary structures using module information from Rna3Dmotifs and a linear combination of two objectives into a scoring function. Our simplest best-performing new method could be interpreted as an upgraded RNA-MoIP with updated data (there is a 10-fold increase in the number of solved RNA crystal structures between 2008 and 2018) and a real bi-objective framework, which predicts the base pairs and the module insertions in a row, preventing the insertion to break important base-pairs.
+ All Biorseo variants are available as a web service or for download on the EvryRNA website.  
 
- Improvement perspectives now rely on the hope than newer databases like CaRNAval~(\citealp{reinharz2018mining}), containing more recent and more diverse module information, to really bring more information to assist the energy criteria.\\
+ Improvement perspectives now rely on the hope than newer databases like CaRNAval~(\citealp{reinharz2018mining}), containing more recent and more diverse module information (there is a 10-fold increase in the number of solved RNA crystal structures between the original Rna3Dmotifs dataset from 2008, and 2018), to really bring more information to assist the energy criteria.\\
 
 \bibliographystyle{natbib}
 %\bibliographystyle{achemnat}
--- a/doc/supplementary_material.tex
View file @73734e9
+++ b/doc/supplementary_material.tex
View file @73734e9
 \documentclass{article}
 \usepackage[utf8]{inputenc}
+ \usepackage{graphicx}
 \usepackage{amsmath}
 \usepackage{stmaryrd}   % llbracket, rrbracket
 \usepackage{siunitx}	% SI units
@@ -17,7 +18,7 @@ The constraints have been rewritten by us, but are inspired by works like IPknot
 \paragraph{Extended notations} ~ Here we repeat the definition of the variables that we already used in the article, and we use a few more, that also are defined:\\
 Let $n$ be the number of nucleotides in the query RNA sequence $s$.\\
 Let $M$ be the set of modules that could be inserted in $s$.\\
- Let $x$ be a module of $M$, $\|x\|$ be the number of distinct components of $x$, and $p(x)$ the associated score of insertion given by JAR3D for that motif inserted at a particular position.\\
+ Let $x$ be a module of $M$, $\|x\|$ be the number of distinct components of $x$, and $p(x)$ the associated score of insertion given by JAR3D or BayesPairing for that motif inserted at a particular position.\\
 Let $P_{x,i}$ be the position in $s$ where we can insert the $i$th component of module $x$.\\
 As the same module model can be inserted several times in $s$, several different $x$ modules in $M$ may refer to the same theoretical module, but inserted at different positions.\\
 Let $k_{x,i}$ be the size in nucleotides of that $i$th component of $x$.\\
@@ -114,4 +115,179 @@ We do it by adding iteratively, for every structure $s^*$ found, the following c
 \end{equation}
 
 It ensures that at least one of the decision variables differs from $s^*$.
+ 
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ \section{Average MCC of the method's variants}
+ Instead of looking at the best MCC to see if the true structure has been found in the Pareto set, one can look at the average MCC over the Pareto set.
+ 
+ We provide such results to satisfy the reader's curiosity, but this average is hard to interpret. 
+ The Pareto set is supposed to propose several solutions that could be several meta-stable state, but there is no reason for these states to be close one to another, nor to be close to the "true" structure that has been observed and saved in the database.
+ A possible interpretation is the average distance of the meta-stable states to the "true" structure, if and only if we assume the predictions are correct.
+ 
+ \hspace{-1cm}
+ \includegraphics[width=\textwidth]{fig/Benchmark_avg.jpg}
+ 
+ \hspace{-1cm}
+ \includegraphics[width=1.05\textwidth]{fig/pseudobase_avg.jpg}
+ 
+ (A) is the RNAstrand dataset for methods which do not support pseudoknots (computations succeeded for all methods for 291 RNAs), (B) is the same dataset but with pseudoknot support (294 RNAs), and (C) is the Pseudobase dataset of 264 RNAs.
+ 
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ \section{Study cases results}
+ Here we provide the structures and statistics of the 3 well known RNAs that we studied in detail.
+ 
+ \subsection{General statistics}
+ 
+ The first line in Table \ref{Tab:01} gives the results for E.\textit{coli} tRNA Gln  (PDB\_00376), the second line for the glycine riboswitch (PDB\_01023), and the third for the human telomerase's pseudoknot (PDB\_00857). We observe that the best structure is often the same accross the different objective functions $f_{1A}, f_{1B}, f_{1C}, f_{1D}$, but the rest of the set can be different in number of solutions and diversity.
+ 
+ \begin{table*}[h]
+ \caption{Best MCC results for study cases. Pseudoknots are allowed. \label{Tab:01}}
+ \vspace{5mm}
+ \begin{tabular}{@{}r|lll|@{}} & RNAsubopt & RNA-MoIP & BiokoP \\
+  & & & \\\hline
+ PDB\_00376 & 0.68 & 0.68 & 0.67 \\
+ PDB\_01023 & 0.86 & 0.86 & 0.59 \\
+ PDB\_00857 & 0.77 & 0.77 & 1.0 \\
+ \end{tabular}
+ \vspace{5mm}
+ \vfill
+ 
+ \begin{tabular}{@{}r|llll|@{}} &  Rna3Dmotifs & Rna3Dmotifs & RNA 3D Motif Atlas & RNA 3D Motif Atlas\\
+  & + Direct P.M. & + BayesPairing & + JAR3D & + BayesPairing \\\hline
+ PDB\_00376 &  0.72 (A,B) & 0.74 (B,C,D), 0.71 (A) & 0.74 (A,C,D), 0.72 (B) & 0.76 (\textit{all})\\
+ PDB\_01023 &  0.79 (A,B) & 0.29 (\textit{all}) & 0.82 (\textit{all}) & 0.82 (\textit{all})\\
+ PDB\_00857 &  0.97 (B), 0.77 (A) & 0.97 (\textit{all}) & 0.97 (\textit{all}), & 0.97 (\textit{all})\\
+ \end{tabular}
+ \end{table*}
+ 
+ Detailed results are given below for each RNA. The number of solutions and computation times are also reported. Note that these cases are small RNAs, resulting in both small number of solutions and small times. The times are the "Real" time spent, therefore you should use the same 16-thread CPU to reproduce them, because there are several multi-threaded parts in the process. They also are very dependant of the I/O delays. Especially with methods reading modules from disk, you may want to use a very fast storage device (e.g. NVMe SSD NAND storage) to increase the speed.
+ 
+ \newpage
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ \subsection{E. coli's Gln tRNA}
+ \paragraph{Sequence (FASTA format)} ~ 
+ 
+ \texttt{>>'CRYSTAL STRUCTURE OF A TIGHT-BINDING GLUTAMINE TRNA BOUND TO GLUTAMINE AMINOACYL TRNA SYNTHETASE ' (PDB 00376)\\
+ GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA}
+ 
+ \paragraph{Referenced "true" structure in RNA-Strand (PDB 00376)} ~ 
+ 
+ \texttt{((((((..(((.........)))((((((((...))))))))...(((((.......))))))))))).....}
+ 
+ \paragraph{Best prediction results} ~ 
+ 
+ {\scriptsize
+ \begin{tabular}{rlccr}
+ Method & Best secondary structure & best MCC & N solutions & time (s)\\
+ \hline
+ True structure:	& \texttt{((((((..(((.........)))((((((((...))))))))...(((((.......))))))))))).....} & & & \\
+ RNAsubopt:&	 \texttt{(((((((.(((....)))..(((.(((((.......)))))..)))((((.......))))))))))).....} &0.68 &4 & 0.01\\
+ Biokop	:&	 \texttt{[[[[[[((((...))))...(((.((((([[[....)))))....(((((...]]].)))))]]]]]].))).}& 0.67 &30& 10.3\\
+ RNA-MoIP	:&	 \texttt{((((((..((......))...((.(((((.......)))))..))..((.........))..)))))).....}& 0.67 &4& 0.01+5.0\\
+ Direct P.M.-A	:&	 \texttt{((((((((((...))))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.72 &8& 7.7\\
+ Direct P.M.-B	:	& \texttt{((((((((((...))))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.72 &11& 7.9\\
+ BayesPairing-A:&	 \texttt{(((((((((((....)))......(((((.......))))).)).((((.........)))))))))).....} &0.71 &7& 74+9.9\\
+ BayesPairing-B:&	 \texttt{(((((((((((....)))......(((((.......))))).)).(((((.......))))))))))).....} &0.74 &8& 74+15.5\\
+ BayesPairing-C:	& \texttt{(((((((((((....)))......(((((.......))))).)).(((((.......))))))))))).....} &0.74 &9& 74+8.6\\
+ BayesPairing-D:	& \texttt{(((((((((((....)))......(((((.......))))).)).(((((.......))))))))))).....} &0.74 &10& 74+9.2\\
+ JAR3D-A	:	& \texttt{(((((((((((....)))......(((((.......))))).)).(((((.......))))))))))).....} &0.74 &3& 0.01+1.3+7.9\\
+ JAR3D-B	:	& \texttt{((((((((((...))))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.72 &3& 0.01+1.3+8.9\\
+ JAR3D-C	:&	 \texttt{(((((((((((....)))......(((((.......))))).)).(((((.......))))))))))).....} &0.74 &5& 0.01+1.3+7.7\\
+ JAR3D-D	:&	 \texttt{(((((((((((....)))......(((((.......))))).)).(((((.......))))))))))).....} &0.74 &5& 0.01+1.3+7.9\\
+ BGSU-BPairing-A:	& \texttt{((((((((.......))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.76 &6& 61+8.7\\
+ BGSU-BPairing-B:	& \texttt{((((((((.......))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.76 &10& 61+12.1\\
+ BGSU-BPairing-C:&	 \texttt{((((((((.......))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.76 &4& 61+7.5\\
+ BGSU-BPairing-D:	& \texttt{((((((((.......))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.76& 4& 61+7.2\\
+ \end{tabular}}
+ 
+ \paragraph{Notes} ~
+ 
+ Note that BiokoP inserts a false-positive pseudoknot, while the Biorseo variants do not. If we look at our two recommended methods, here is an example of difference in the number of solutions : Biorseo+Rna3Dmotifs+Direct Pattern Matching + function B return 11 solutions, while Biorseo+The RNA 3D Motif Atlas+JAR3D+function B returns only 3, with the same best MCC of 0.72.
+ 
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ \subsection{G Riboswitch}
+ 
+ \paragraph{Sequence (FASTA format)} ~ 
+ 
+ \texttt{>  'GUANINE RIBOSWITCH U22C, A52G MUTANT BOUND TO HYPOXANTHINE ' (PDB 01023)\\
+ GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa}
+ 
+ \paragraph{Referenced "true" structure in RNA-Strand (PDB 01023)} ~ 
+ 
+ \texttt{(((((((...(((((((.[[..[[)))))))........((((((]]...]]))))))..))))))).}
+ 
+ \paragraph{Best prediction results} ~ 
+ 
+ {\scriptsize
+ \begin{tabular}{rlccr}
+ Method & Best secondary structure & best MCC & N solutions & time (s)\\
+ \hline
+ True structure:	 & \texttt{(((((((...(((((((.[[..[[)))))))........((((((]]...]]))))))..))))))).} & & & \\
+ RNAsubopt:	 & \texttt{(((((((.....(((((.......)))))..........((((((.......))))))..))))))).} & 0.86 & 3 & 0.01 \\
+ Biokop	:	 & \texttt{(((((([[[.[[(((((]][[[[[))))).(((...[[[(((]]]]]]]]..]]])))))))))))).} & 0.59 & 4 & 58.2 \\
+ RNA-MoIP	:	 & \texttt{(((((((.....(((((.......)))))..........(((((.........)))))..))))))).} & 0.84 & 3 & 0.01+4.1\\
+ Direct P.M.-A	:	 & \texttt{(((((((.....(((((.......)))))..((....))(((((.........)))))..))))))).} & 0.79 & 15 & 4.3 \\
+ Direct P.M.-B	:	 & \texttt{((((.((.....(((((.......))))).((...))..((((((.......))))))..)).)))).} & 0.79 & 18 & 9.0 \\
+ BayesPairing-A:	 & \texttt{...............(((((((((.((....))......((((((.......)))))).)))))))))} & 0.29 & 4 & 53+8.3\\
+ BayesPairing-B:	 & \texttt{...............(((((((((.((....))......((((((.......)))))).)))))))))} & 0.29 & 4 & 53+8.4\\
+ BayesPairing-C:	 & \texttt{...............(((((((((.((....))......((((((.......)))))).)))))))))} & 0.29 & 3 & 53+5.8\\
+ BayesPairing-D:	 & \texttt{...............(((((((((.((....))......((((((.......)))))).)))))))))} & 0.29 & 3 & 53+5.6\\
+ JAR3D-A	:	 & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 4 & 0.01+1.2+30.3\\
+ JAR3D-B	:	 & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 5 & 0.01+1.2+23.8\\
+ JAR3D-C	:	 & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 2 & 0.01+1.2+4.7\\
+ JAR3D-D	:	 & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 2 & 0.01+1.2+4.6\\
+ BGSU-BPairing-A:	 & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 9 & 58+6.2\\
+ BGSU-BPairing-B:	 & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 9 & 58+8.4\\
+ BGSU-BPairing-C:	 & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 9 & 58+6.7\\
+ BGSU-BPairing-D:	 & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 9 & 58+8.1\\
+ \end{tabular}}
+ 
+ \paragraph{Notes} ~
+ 
+ On one side, here again, BiokoP predicts too many knots. The RNA only contains one kissing-hairpin HHH pseudoknot. On the other side, as expected, this HHH knot is not detected by Biorseo.
+ 
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ \subsection{Human telomerase's RNA pseudoknot}
+ 
+ \paragraph{Sequence (FASTA format)} ~ 
+ 
+ \texttt{>  'SOLUTION STRUCTURE OF THE P2B-P3 PSEUDOKNOT FROM HUMAN TELOMERASE RNA ' (PDB 00857)\\
+ GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA}
+ 
+ \paragraph{Referenced "true" structure in RNA-Strand (PDB 00857)} ~ 
+ 
+ \texttt{[[[[[[........(((((((((]]]]]]........))))))))).}
+ 
+ \paragraph{Best prediction results} ~ 
+ 
+ {\scriptsize
+ \begin{tabular}{rlccr}
+ Method & Best secondary structure & best MCC & N solutions & time (s)\\
+ \hline
+ True structure:	 & \texttt{[[[[[[........(((((((((]]]]]]........)))))))))} & & & \\
+ RNAsubopt:	 & \texttt{..............(((((((((..............))))))))).} &  0.77 & 3 & 0.06\\
+ Biokop	:	 & \texttt{[[[[[[........(((((((((]]]]]]........))))))))).} &  1.00 & 1 & 4.7\\
+ RNA-MoIP	:	 & \texttt{..............(((((((((..............))))))))).} &  0.77 & 3 & 0.06+3.3\\
+ Direct P.M.-A	:	 & \texttt{..............(((((((((..............))))))))).} &  0.77 & 3 & 0.8\\
+ Direct P.M.-B	:	 & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} &  0.97 & 7 & 0.7\\
+ BayesPairing-A:	 & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} &  0.97 & 2 & 71+1.0\\
+ BayesPairing-B:	 & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} &  0.97 & 2 & 71+0.6\\
+ BayesPairing-C:	 & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} &  0.97 & 2 & 71+0.6\\
+ BayesPairing-D:	 & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} &  0.97 & 2 & 71+0.6\\
+ JAR3D-A	:	 & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} &  0.97 & 2 & 0.06+1.3+0.8\\
+ JAR3D-B	:	 & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} &  0.97 & 2 & 0.06+1.3+0.6\\
+ JAR3D-C	:	 & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} &  0.97 & 2 & 0.06+1.3+0.6\\
+ JAR3D-D	:	 & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} &  0.97 & 2 & 0.06+1.3+0.6\\
+ BGSU-BPairing-A:	 & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} &  0.97 & 2 & 57.7+0.5\\
+ BGSU-BPairing-B:	 & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} &  0.97 & 2 &  57.7+0.5\\
+ BGSU-BPairing-C:	 & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} &  0.97 & 2 & 57.7+0.5\\
+ BGSU-BPairing-D:	 & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} &  0.97 & 2 & 57.7+0.5\\
+ \end{tabular}}
+ 
+ 
+ \paragraph{Notes} ~
+ 
+ The methods which support pseudoknots are able to predict it correctly.
 \end{document}
\ No newline at end of file
--- a/test_on_RNAstrand.py
View file @73734e9
+++ b/test_on_RNAstrand.py
View file @73734e9
@@ -315,7 +315,7 @@ def launch_BayesPairing(module_type, seq_, header_, basename):
     rna.close()
 
 def launch_RNAMoIP_worker(x):
-     RNAMoIP = "../RNAMoIP/RNAMoIP.py"
+     RNAMoIP = biorseoDir + "/../RNAMoIP/Src/RNAMoIP.py"
     logfile = open("log_of_the_run.sh", 'a')
     logfile.write(" ".join(["gurobi.sh", RNAMoIP, "-s", '"' +x[1]+'"', "-ss", '"'+x[0].strip()+'"', "-d", descfolder]))
     logfile.write("\n")
@@ -369,14 +369,6 @@ def launch_RNAMoIP(seq_, header_, basename):
         rna.write(p+'\t'+str(n)+'\t'+str(s)+'\n')
     rna.close()
 
- def launch_pKiss(seq_, header_, basename):
-     json = "{\"pkiss_input_rna_sequences\":\">%s\r\n%s\",\"paramset\":{\"pkiss_parameter_absoluteDeviation\":\"0.5\",\"pkiss_parameter_maxKnotSize\":\"3.0\",\"pkiss_parameter_windowSize\":\"1.0\",\"pkiss_parameter_param\":\"rna_andronescu2007\"}}" %(header_, seq_)
-     cmd = "curl -X POST -d @[[%s]] http://bibiserv2.cebitec.uni-bielefeld.de:80/rest/pkiss/pkiss_function_subopt/request -H \"Content-Type: application/json\"" % json
-     logfile = open("log_of_the_run.sh", 'a')
-     logfile.write(cmd+"\n")
-     logfile.close()
-     print(cmd)
- 
 def mattews_corr_coeff(tp, tn, fp, fn):
     if (tp+fp == 0):
         print("We have an issue : no positives detected ! (linear structure)")