Louis BECQUEY
...@@ -19,6 +19,7 @@ bin/* ...@@ -19,6 +19,7 @@ bin/*
19 19
20 # results 20 # results
21 results/* 21 results/*
22 +benchmark_results/*
22 log_of_the_run.sh 23 log_of_the_run.sh
23 logBadDesc.txt 24 logBadDesc.txt
24 gurobi.log 25 gurobi.log
......
...@@ -9,7 +9,7 @@ Contact : louis.becquey@univ-evry.fr ...@@ -9,7 +9,7 @@ Contact : louis.becquey@univ-evry.fr
9 1/ How it works 9 1/ How it works
10 =================================== 10 ===================================
11 INPUT: 11 INPUT:
12 -- An RNA sequence (tested with sequences ~100 bases) 12 +- An RNA sequence (with 16 GB of RAM you can go up to ~230 bases)
13 13
14 THEN 14 THEN
15 - **Pattern-matching step** : Find all possible occurrences of known RNA modules in the query sequence, by finding subsequences of the query that score well with the probabilistic models of the modules (like JAR3D, or BayesPairing) 15 - **Pattern-matching step** : Find all possible occurrences of known RNA modules in the query sequence, by finding subsequences of the query that score well with the probabilistic models of the modules (like JAR3D, or BayesPairing)
......
...@@ -21,19 +21,18 @@ bypdir = "" ...@@ -21,19 +21,18 @@ bypdir = ""
21 biorseoDir = "." 21 biorseoDir = "."
22 exec(compile(open(biorseoDir+"/EditMe").read(), '', 'exec')) 22 exec(compile(open(biorseoDir+"/EditMe").read(), '', 'exec'))
23 runDir = path.dirname(path.realpath(__file__)) 23 runDir = path.dirname(path.realpath(__file__))
24 -tempDir = biorseoDir + "/temp/" 24 +modulespath = biorseoDir + "/data/modules"
25 -HLmotifDir = biorseoDir + "/data/modules/BGSU/HL/3.2/lib" 25 +HLmotifDir = modulespath + "/BGSU/HL/3.2/lib"
26 -ILmotifDir = biorseoDir + "/data/modules/BGSU/IL/3.2/lib" 26 +ILmotifDir = modulespath + "/BGSU/IL/3.2/lib"
27 -descfolder = biorseoDir + "/data/modules/DESC" 27 +descfolder = modulespath + "/DESC"
28 28
29 # Parse options 29 # Parse options
30 try: 30 try:
31 - opts, args = getopt.getopt(sys.argv[1:], "hi:o:", ["rna3dmotifs","3dmotifatlas","jar3d","bayespairing","patternmatch","func="]) 31 + opts, args = getopt.getopt(sys.argv[1:], "bc:f:hi:jl:no:pt:v", ["verbose", "rna3dmotifs","3dmotifatlas","jar3d","bayespairing","patternmatch","func=","help","version","seq=","modules-path=", "first-objective=","output=","theta=","interrupt-limit="])
32 -except getopt.GetoptError: 32 +except getopt.GetoptError as err:
33 - print("Please provide arguments !") 33 + print(err)
34 sys.exit(2) 34 sys.exit(2)
35 35
36 -
37 m = Manager() 36 m = Manager()
38 running_stats = m.list() 37 running_stats = m.list()
39 running_stats.append(0) # n_launched 38 running_stats.append(0) # n_launched
...@@ -335,38 +334,75 @@ class BiorseoInstance: ...@@ -335,38 +334,75 @@ class BiorseoInstance:
335 self.jobcount = 0 334 self.jobcount = 0
336 self.joblist = [] 335 self.joblist = []
337 self.mode = 0 # default is single sequence mode 336 self.mode = 0 # default is single sequence mode
337 + self.forward_options = []
338 338
339 for opt, arg in opts: 339 for opt, arg in opts:
340 - if opt == "-h": 340 + if opt == "-h" or opt == "--help":
341 - print("biorseo.py -i myRNA.fa -o myRNA.rawB --rna3dmotifs --patternmatch --func B") 341 + print( "Biorseo, Bi-Objective RNA Structure Efficient Optimizer\n"
342 - print("biorseo.py -i myRNA.fa -o myRNA.jar3dB --3dmotifatlas --jar3d --func B") 342 + "Bio-objective integer linear programming framework to predict RNA secondary structures by including known RNA modules.\n"
343 - print("biorseo.py -i myRNA.fa -o myRNA.bgsubypD --3dmotifatlas --bayespairing --func D") 343 + "Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2019\n\n")
344 + print("Usage:\tYou must provide:\n\t1) a FASTA input file with -i,\n\t2) a module type with --rna3dmotifs or --3dmotifatlas"
345 + "\n\t3) one module placement method in { --patternmatch, --jar3d, --bayespairing }\n\t")
346 + print("Options:")
347 + print("-h [ --help ]\t\tPrint this help message")
348 + print("--version\t\t\tPrint the program version")
349 + print("-i [ --seq ]\t\tFASTA file with the query RNA sequence")
350 + print("-p [ --patternmatch ]\t\tUse regular expressions to place modules in the sequence")
351 + print("-j [ --jar3d ]\t\tUse JAR3D to place modules in the sequence (requires --3dmotifatlas)")
352 + print("-b [ --bayespairing ]\t\tUse BayesPairing to place modules in the sequence")
353 + print("-o [ --output ]\t\tFolder where to output files")
354 + print("-f [ --func ]\t\t(A, B, C or D, default is B)"
355 + "\t\t\t\tObjective function to score module insertions: (A) insert big modules (B) insert light, high-order modules"
356 + "\t\t\t\t(c) insert modules which score well with the sequence (D) insert light, high-order modules which score well with the sequence.")
357 + "\t\t\t\tC and D require cannot be used with --patternmatch.")
358 +
359 + print("biorseo.py -i myRNA.fa -o myResultsFolder/ --rna3dmotifs --patternmatch --func B")
360 + print("biorseo.py -i myRNA.fa -o myResultsFolder/ --3dmotifatlas --jar3d --func B")
361 + print("biorseo.py -i myRNA.fa --3dmotifatlas --bayespairing --func D")
344 sys.exit() 362 sys.exit()
345 - elif opt == "-i": 363 + elif opt == "-i" or opt == "--seq":
346 self.inputfile = arg 364 self.inputfile = arg
347 - elif opt == "-o": 365 + elif opt == "-o" or opt == "--output":
348 self.outputf = arg # output file or folder... 366 self.outputf = arg # output file or folder...
349 if self.outputf[1] != '/': 367 if self.outputf[1] != '/':
350 self.outputf = getcwd() + '/' + self.outputf 368 self.outputf = getcwd() + '/' + self.outputf
351 if self.outputf[-1] != '/': 369 if self.outputf[-1] != '/':
352 self.outputf = self.outputf + '/' 370 self.outputf = self.outputf + '/'
353 - elif opt == "--func": 371 + elif opt == "-f" or opt == "--func":
354 if arg in ['A', 'B', 'C', 'D']: 372 if arg in ['A', 'B', 'C', 'D']:
355 self.func = arg 373 self.func = arg
356 else: 374 else:
357 raise "Unknown scoring function " + arg 375 raise "Unknown scoring function " + arg
358 - elif opt == "--patternmatch": 376 + elif opt == "-p" or opt == "--patternmatch":
359 self.type = "dpm" 377 self.type = "dpm"
360 - elif opt == "--jar3d": 378 + elif opt == "-j" or opt == "--jar3d":
361 self.type = "jar3d" 379 self.type = "jar3d"
362 - elif opt == "--bayespairing": 380 + elif opt == "-b" or opt == "--bayespairing":
363 self.type = "byp" 381 self.type = "byp"
364 elif opt == "--rna3dmotifs": 382 elif opt == "--rna3dmotifs":
365 self.modules = "desc" 383 self.modules = "desc"
366 elif opt == "--3dmotifatlas": 384 elif opt == "--3dmotifatlas":
367 self.modules = "bgsu" 385 self.modules = "bgsu"
368 - else: 386 + elif opt == "--modulespath":
369 - raise "Unknown option " + opt 387 + HLmotifDir = arg + "/HL/3.2/lib"
388 + ILmotifDir = arg + "/IL/3.2/lib"
389 + descfolder = arg
390 + elif opt == "--version":
391 + subprocess.call([biorseoDir+"/bin/biorseo", "--version"])
392 + exit(0)
393 + elif opt == "-l" or opt == "--interrupt-limit":
394 + self.forward_options.append("-l")
395 + self.forward_options.append(arg)
396 + elif opt == "-v" or opt == "--verbose":
397 + self.forward_options.append("-v")
398 + elif opt == "-n" or opt == "--disable-pseudoknots":
399 + self.forward_options.append("-n")
400 + elif opt == "-t" or opt == "--theta":
401 + self.forward_options.append("-t")
402 + self.forward_options.append(arg)
403 + elif opt == "-c" or opt == "--first-objective":
404 + self.forward_options.append("-c")
405 + self.forward_options.append(arg)
370 406
371 print("saving files to", self.outputf) 407 print("saving files to", self.outputf)
372 # create jobs 408 # create jobs
...@@ -793,7 +829,8 @@ class BiorseoInstance: ...@@ -793,7 +829,8 @@ class BiorseoInstance:
793 command = [executable, "-s", fastafile ] 829 command = [executable, "-s", fastafile ]
794 if method_type: 830 if method_type:
795 command += [ method_type, csv ] 831 command += [ method_type, csv ]
796 - command += [ "-o", self.outputf + instance.header + ext + self.func, "--type", self.func ] 832 + command += [ "-o", self.outputf + instance.header + ext + self.func, "--function", self.func ]
833 + command += self.forward_options
797 self.joblist.append(Job(command=command, priority=priority, timeout=3600, how_many_in_parallel=3)) 834 self.joblist.append(Job(command=command, priority=priority, timeout=3600, how_many_in_parallel=3))
798 835
799 836
......
...@@ -21,6 +21,7 @@ char MOIP::obj_function_nbr_ = 'A'; ...@@ -21,6 +21,7 @@ char MOIP::obj_function_nbr_ = 'A';
21 uint MOIP::obj_to_solve_ = 1; 21 uint MOIP::obj_to_solve_ = 1;
22 double MOIP::precision_ = 1e-5; 22 double MOIP::precision_ = 1e-5;
23 bool MOIP::allow_pk_ = true; 23 bool MOIP::allow_pk_ = true;
24 +uint MOIP::max_sol_nbr_ = 500;
24 25
25 unsigned getNumConstraints(IloModel& m) 26 unsigned getNumConstraints(IloModel& m)
26 { 27 {
...@@ -499,8 +500,8 @@ void MOIP::add_solution(const SecondaryStructure& s) ...@@ -499,8 +500,8 @@ void MOIP::add_solution(const SecondaryStructure& s)
499 { 500 {
500 if (verbose_) cout << "\t>adding structure to Pareto set :\t" << s.to_string() << endl; 501 if (verbose_) cout << "\t>adding structure to Pareto set :\t" << s.to_string() << endl;
501 pareto_.push_back(s); 502 pareto_.push_back(s);
502 - if (pareto_.size() > 500) { 503 + if (pareto_.size() > max_sol_nbr_) {
503 - cerr << "\033[31m Quitting because combinatorial issues (>500 solutions in Pareto set). \033[0m" << endl; 504 + cerr << "\033[31m Quitting because combinatorial issues (>" << max_sol_nbr_ << " solutions in Pareto set). \033[0m" << endl;
504 exit(1); 505 exit(1);
505 } 506 }
506 } 507 }
......
...@@ -30,7 +30,8 @@ class MOIP ...@@ -30,7 +30,8 @@ class MOIP
30 static uint obj_to_solve_; // What objective do you prefer to solve in mono-objective portions of the algorithm ? 30 static uint obj_to_solve_; // What objective do you prefer to solve in mono-objective portions of the algorithm ?
31 static double precision_; // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =( 31 static double precision_; // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =(
32 static bool allow_pk_; // Wether we forbid pseudoknots (false) or allow them (true) 32 static bool allow_pk_; // Wether we forbid pseudoknots (false) or allow them (true)
33 - 33 + static uint max_sol_nbr_; // Number of solutions to accept in the Pareto set before we give up the computation
34 +
34 private: 35 private:
35 bool is_undominated_yet(const SecondaryStructure& s); 36 bool is_undominated_yet(const SecondaryStructure& s);
36 void define_problem_constraints(void); 37 void define_problem_constraints(void);
......
...@@ -73,14 +73,15 @@ int main(int argc, char* argv[]) ...@@ -73,14 +73,15 @@ int main(int argc, char* argv[])
73 ("version", "Print the program version") 73 ("version", "Print the program version")
74 ("seq,s", po::value<string>(&inputName)->required(), "Fasta file containing the RNA sequence") 74 ("seq,s", po::value<string>(&inputName)->required(), "Fasta file containing the RNA sequence")
75 ("descfolder,d", po::value<string>(&motifs_path_name), "A folder containing modules in .desc format, as produced by Djelloul & Denise's catalog program") 75 ("descfolder,d", po::value<string>(&motifs_path_name), "A folder containing modules in .desc format, as produced by Djelloul & Denise's catalog program")
76 - ("jar3dcsv", po::value<string>(&motifs_path_name), "A file containing the output of JAR3D's search for motifs in the sequence, as produced by test_on_RNAstrand.py") 76 + ("jar3dcsv,j", po::value<string>(&motifs_path_name), "A file containing the output of JAR3D's search for motifs in the sequence, as produced by test_on_RNAstrand.py")
77 - ("bayespaircsv", po::value<string>(&motifs_path_name), "A file containing the output of BayesPairing's search for motifs in the sequence, as produced by test_on_RNAstrand.py") 77 + ("bayespaircsv,b", po::value<string>(&motifs_path_name), "A file containing the output of BayesPairing's search for motifs in the sequence, as produced by test_on_RNAstrand.py")
78 ("first-objective,c", po::value<unsigned int>(&MOIP::obj_to_solve_)->default_value(1), "Objective to solve in the mono-objective portions of the algorithm") 78 ("first-objective,c", po::value<unsigned int>(&MOIP::obj_to_solve_)->default_value(1), "Objective to solve in the mono-objective portions of the algorithm")
79 ("output,o", po::value<string>(&outputName), "A file to summarize the computation results") 79 ("output,o", po::value<string>(&outputName), "A file to summarize the computation results")
80 ("theta,t", po::value<float>(&theta_p_threshold)->default_value(0.001), "Pairing probability threshold to consider or not the possibility of pairing") 80 ("theta,t", po::value<float>(&theta_p_threshold)->default_value(0.001), "Pairing probability threshold to consider or not the possibility of pairing")
81 - ("type,f", po::value<char>(&obj_function_nbr)->default_value('A'), "What objective function to use to include motifs: square of motif size in nucleotides like " 81 + ("function,f", po::value<char>(&obj_function_nbr)->default_value('B'), "What objective function to use to include motifs: square of motif size in nucleotides like "
82 - "RNA-MoIP (A), motif size + number of components (B), site score (C), motif size + site score + number of components (D)") 82 + "RNA-MoIP (A), light motif size + high number of components (B), site score (C), light motif size + site score + high number of components (D)")
83 ("disable-pseudoknots,n", "Add constraints forbidding the formation of pseudoknots") 83 ("disable-pseudoknots,n", "Add constraints forbidding the formation of pseudoknots")
84 + ("limit,l", po::value<unsigned int>(&MOIP::max_sol_nbr_)->default_value(500), "Intermediate number of solutions in the Pareto set above which we give up the calculation.")
84 ("verbose,v", "Print what is happening to stdout"); 85 ("verbose,v", "Print what is happening to stdout");
85 po::variables_map vm; 86 po::variables_map vm;
86 po::store(po::parse_command_line(argc, argv, desc), vm); 87 po::store(po::parse_command_line(argc, argv, desc), vm);
...@@ -99,7 +100,7 @@ int main(int argc, char* argv[]) ...@@ -99,7 +100,7 @@ int main(int argc, char* argv[])
99 return EXIT_SUCCESS; 100 return EXIT_SUCCESS;
100 } 101 }
101 if (vm.count("version")) { 102 if (vm.count("version")) {
102 - cout << "Biorseo v1.0, May 2019" << endl; 103 + cout << "Biorseo v1.01, June 2019" << endl;
103 return EXIT_SUCCESS; 104 return EXIT_SUCCESS;
104 } 105 }
105 if (vm.count("verbose")) verbose = true; 106 if (vm.count("verbose")) verbose = true;
...@@ -112,7 +113,7 @@ int main(int argc, char* argv[]) ...@@ -112,7 +113,7 @@ int main(int argc, char* argv[])
112 return EXIT_FAILURE; 113 return EXIT_FAILURE;
113 } 114 }
114 if (vm.count("-d") and (obj_function_nbr == 'C' or obj_function_nbr == 'D')) { 115 if (vm.count("-d") and (obj_function_nbr == 'C' or obj_function_nbr == 'D')) {
115 - cerr << "\033[31mYou must provide --jar3dcsv or --bayespaircsv to use --type C or --type D.\033[0m See " 116 + cerr << "\033[31mYou must provide --jar3dcsv or --bayespaircsv to use --function C or --function D.\033[0m See "
116 "--help for more " 117 "--help for more "
117 "information." 118 "information."
118 << endl; 119 << endl;
......
...@@ -6,4 +6,4 @@ GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa ...@@ -6,4 +6,4 @@ GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa
6 (((((((...(((((((.[[..[[)))))))........((((((]]...]]))))))..))))))). 6 (((((((...(((((((.[[..[[)))))))........((((((]]...]]))))))..))))))).
7 >__'SOLUTION_STRUCTURE_OF_THE_P2B-P3_PSEUDOKNOT_FROM_HUMAN_TELOMERASE_RNA_'_(PDB_00857) 7 >__'SOLUTION_STRUCTURE_OF_THE_P2B-P3_PSEUDOKNOT_FROM_HUMAN_TELOMERASE_RNA_'_(PDB_00857)
8 GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA 8 GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA
9 -[[[[[[........(((((((((]]]]]]........))))))))).
...\ No newline at end of file ...\ No newline at end of file
9 +[[[[[[........(((((((((]]]]]]........))))))))).
......
1 ->>__'CRYSTAL_STRUCTURE_ANALYSIS_OF_A_TRNA-NEOMYCIN_COMPLEX_'_(PDB_00095)
2 -GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGACUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA
3 -(((((((..((((........))))(((((........)))))....(((((.......))))))))))))....
4 ->>__'YEAST_INITIATOR_TRNA_'_(PDB_00229)
5 -AGCGCCGUGGCGCAGUGGAAGCGCGCAGGGCUCAUAACCCUGAUGUCCUCGGAUCGAAACCGAGCGGCGCUACCA
6 -(((((((..((((.......))))(((((((.....)))))))....(((((.......))))))))))))....
7 ->>__'THE_CRYSTAL_STRUCTURE_OF_PHENYLALANYL-TRNA_SYNTHETASE_FROM_THERMUS_THERMOPHILUS_COMPLEXED_WITH_COGNATE_TRNAPHE_'_(PDB_00362)
8 -GCCGAGGUAGCUCAGUUGGUAGAGCAUGCGACUGAAAAUCGCAGUGUCCGCGGUUCGAUUCCGCGCCUCGGCACCA
9 -(((((((..((((........))))((((((.......))))))....(((((.......))))))))))))....
10 ->>__'CRYSTAL_STRUCTURE_OF_GLUTAMINYL-TRNA_SYNTHETASE_COMPLEXED_WITH_A_TRNA-GLN_MUTANT_AND_AN_ACTIVE-SITE_INHIBITOR_'_(PDB_00373)
11 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAGCGAGGUUCGAAUCCUCGUACCCCAGCCA
12 -((((((..((((.......))))((((((((...))))))))..(((((.......))))))))))).....
13 ->>__'GLUTAMINYL-TRNA_SYNTHETASE_COMPLEXED_WITH_A_TRNA_MUTANT_AND_AN_ACTIVE_SITE_INHIBITOR_'_(PDB_00374)
14 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAAGCGAGGUUCGAAUCCUCGUACCCCAGCCA
15 -((((((..((((.......))))((((((((...))))))))...(((((.......))))))))))).....
16 ->>__'CRYSTAL_STRUCTURE_OF_A_TIGHT-BINDING_GLUTAMINE_TRNA_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE_'_(PDB_00376)
17 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA
18 -((((((..(((.........)))((((((((...))))))))...(((((.......))))))))))).....
19 ->>__'INSIGHTS_INTO_EDITING_FROM_AN_ILE-TRNA_SYNTHETASE_STRUCTURE_WITH_TRNA(ILE)_AND_MUPIROCIN_'_(PDB_00402)
20 -GGGCUUGUAGCUCAGGUGGUUAGAGCGCACCCCUGAUAAGGGUGAGGUCGGUGGUUCAAGUCCACUCAGGCCCAC
21 -(((((((..((((.........))))(((((((.....)))))))....(((((.......))))))))))))..
22 ->>__'STRUCTURAL_BASIS_OF_ANTICODON_LOOP_RECOGNITION_BY_GLUTAMINYL-TRNA_SYNTHETASE_'_(PDB_00425)
23 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
24 -((((((..(((.........)))((((((((...))))))))....(((((.......))))))))))).....
25 ->>__'STRUCTURAL_BASIS_FOR_TRANSFER_RNA_AMINOACEYLATION_BY_ESCHERICHIA_COLI_GLUTAMINYL-TRNA_SYNTHETASE_'_(PDB_00426)
26 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
27 -((((((..(((.........))).(((((((...))))))).....(((((.......))))))))))).....
28 ->>__'CRYSTAL_STRUCTURE_OF_ARCHAEAL_TYROSYL-TRNA_SYNTHETASE_COMPLEXED_WITH_TRNA(TYR)_AND_L-TYROSINE_'_(PDB_00474)
29 -CCGGCGGUAGUUCAGCCUGGUAGAACGGCGGACUGUAGAUCCGCAUGUCGCUGGUUCAAAUCCGGCCCGCCGGA
30 -(((((((..((((.........))))(((((((.....)))))))....(((((.......)))))))))))).
31 ->>__'CRYSTAL_STRUCTURE_OF_L-GLUTAMINE_AND_AMPCPP_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE_'_(PDB_00620)
32 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
33 -((((((..((((.......))))(((((((.....)))))))....(((((.......))))))))))).....
34 ->>__'CRYSTAL_STRUCTURE_OF_L-GLUTAMATE_AND_AMPCPP_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE_'_(PDB_00621)
35 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
36 -((((((..((((.......)))).(((((((...))))))).....(((((.......))))))))))).....
37 ->>__'CRYSTAL_STRUCTURE_OF_THE_TRNA_DOMAIN_OF_TRANSFER-MESSENGER_RNA_IN_COMPLEX_WITH_SMPB_'_(PDB_00637)
38 -GAUUCGACGGGGACUUCGGUCCUCGGACGCGGGUUCGAUUCCCGCUCGACGGGGACUUCGGUCCUCGGA
39 -......((((((((....))))))))..(((((.......)))))...((((((((....)))))))).
40 ->>__'GLUTAMINYL-TRNA_SYNTHETASE_MUTANT_D235N_COMPLEXED_WITH_GLUTAMINE_TRANSFER_RNA_'_(PDB_00669)
41 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
42 -((((((..(((.........)))((((((((...))))))))....(((((.......))))))))))).....
43 ->>__'GLUTAMINYL-TRNA_SYNTHETASE_MUTANT_D235G_COMPLEXED_WITH_GLUTAMINE_TRANSFER_RNA_'_(PDB_00670)
44 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
45 -((((((..(((.........)))(((((((.....)))))))....(((((.......))))))))))).....
46 ->>__'GLUTAMINYL-TRNA_SYNTHETASE_MUTANT_I129T_COMPLEXED_WITH_GLUTAMINE_TRANSFER_RNA_'_(PDB_00671)
47 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
48 -((((((..(((.........)))((((((.......))))))....(((((.......))))))))))).....
49 ->>__'GLUTAMINYL-TRNA_SYNTHETASE_COMPLEXED_WITH_TRNA_AND_AN_AMINO_ACID_ANALOG_'_(PDB_00672)
50 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
51 -((((((..((((.......))))((((((((...))))))))....(((((.......))))))))))).....
52 ->>__'INSIGHTS_INTO_EDITING_FROM_AN_ILE-TRNA_SYNTHETASE_STRUCTURE_WITH_TRNA(ILE)_AND_MUPIROCIN_'_(PDB_00673)
53 -GGGCUUGUAGCUCAGGUGGUUAGAGCGCACCCCUGAUAAGGGUGAGGUCGGUGGUUCAAGUCCACUCAGGCCCAC
54 -(((((((..((((.........))))(((((((.....)))))))....(((((.......))))))))))))..
55 ->>__'INSIGHTS_INTO_EDITING_FROM_AN_ILE-TRNA_SYNTHETASE_STRUCTURE_WITH_TRNA(ILE)_AND_MUPIROCIN_'_(PDB_00674)
56 -GGGCUUGUAGCUCAGGUGGUUAGAGCGCACCCCUGAUAAGGGUGAGGUCGGUGGUUCAAGUCCACUCAGGCCCAC
57 -(((((((..((((.........))))(((((((.....)))))))....(((((.......))))))))))))..
58 ->>__'CRYSTAL_STRUCTURE_OF_CYSTEINYL-TRNA_SYNTHETASE_BINARY_COMPLEX_WITH_TRNACYS_'_(PDB_00742)
59 -GGCGCGUUAACAAAGCGGUUAUGUAGCGGAUUGCAAAUCCGUCUAGUCCGGUUCGACUCCGGAACGCGCCUCCA
60 -(((((((..(((.........))).((((((.....))))))....(((((.......))))))))))))....
61 ->>__'CRYSTAL_STRUCTURE_OF_TRNA_NUCLEOTIDYLTRANSFERASE_COMPLEXED_WITH_A_PRIMER_TRNA_AND_AN_INCOMING_ATP_ANALOG_'_(PDB_00767)
62 -GGCCAGGGGCGGUUCGAUUCCGCCCCUGGCCGGCCAGGGGCGGUUCGAUUCCGCCCCUGGCCACCAA
63 -(((((((.(((.........))).)))))))(((((((.(((.........))).))))))).....
64 ->>__'GLUTAMINYL-TRNA_SYNTHETASE_COMPLEXED_TO_GLUTAMINE_AND_2\'DEOXY_A76_GLUTAMINE_TRNA_'_(PDB_00901)
65 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
66 -((((((..((((.......)))).(((((((...))))))).....(((((.......))))))))))).....
67 ->>__'IF2,_IF1,_AND_TRNA_FITTED_TO_CRYO-EM_DATA_OF_E._COLI_70S_INITIATION_COMPLEX_'_(PDB_00903)
68 -GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA
69 -(((((((..((((........))))((((((.......))))))....(((((.......))))))))))))....
70 ->>__'STRUCTURE_OF_HUMAN_TRYPTOPHANYL-TRNA_SYNTHETASE_IN_COMPLEX_WITH_TRNA(TRP)_'_(PDB_00926)
71 -GACCUCGUGGCGCAAUGGUAGCGCGUCUGACUCCAGAUCAGAAGGUUGCGUGUUCGAAUCACGUCGGGGUCA
72 -((.((((..(((((.....)))))...((((.....)))).......(((((.......))))))))).)).
73 ->>__'CRYSTAL_STRUCTURE_OF_STAPHYLOCOCCUS_AUREUS_TRNA_ADENOSINE_DEAMINASE,_TADA,_IN_COMPLEX_WITH_RNA_'_(PDB_00943)
74 -UUGACUACGGAUCAAUUGACUACGGAUCAAGACUACGGUUUGACUACGGAUCAA
75 -(((((.....)))))(((((.....))))).........(((((.....)))))
76 ->>__'COCRYSTAL_STRUCTURE_OF_AN_RNA_SULFURATION_ENZYME_MNMA_AND_TRNA-GLU_IN_THE_PRE-REACTION_STATE_'_(PDB_00999)
77 -GUCCCCUUCGUCUAGAGGCCCAGGACACCGCCCUUUCACGGCGGUAACAGGGGUUCGAAUCCCCUAGGGG
78 -..((.....(((((.......)))))(((((((.....)))))))....((((.......))))...)).
79 ->>__'STRUCTURE_OF_HUMAN_TRYPTOPHANYL-TRNA_SYNTHETASE_IN_COMPLEX_WITH_TRNA(TRP)_'_(PDB_01002)
80 -GACCUCGUGGCGCAAUGGUAGCGCGUCUGACUCCAGAUCAGAAGGUUGCGUGUUCGAAUCACGUCGGGGUCACCA
81 -(((((((....(((.....)))...((((((.....)))))).....(((((.......))))))))))))....
82 ->>__'CRYSTAL_STRUCTURE_OF_ARCHAEOGLOBUS_FULGIDUS_O-PHOSPHOSERYL-_TRNA_SYNTHETASE_COMPLEXED_WITH_TRNACYS_AND_O-PHOSPHOSERINE_'_(PDB_01009)
83 -GCCAGGGUGGCAGAGGGGCUUUGCGGCGGACUGCAGAUCCGCUUUACCCCGGUUCGAAUCCGGGCCCUGGC
84 -((((((([[(((]].......)))..((((.......))))......(((((.......))))))))))))
85 ->>__'CRYSTAL_STRUCTURE_OF_ARCHAEOGLOBUS_FULGIDUS_O-PHOSPHOSERYL-_TRNA_SYNTHETASE_COMPLEXED_WITH_TRNACYS_'_(PDB_01010)
86 -GCCAGGGUGGCAGAGGGGCUUUGCGGCGGACUGCAGAUCCGCUUUACCCCGGUUCGAAUCCGGGCCCUGGC
87 -..((((([[(((]].......)))..(((((.....)))))......(((((.......))))))))))..
88 ->>__'CRYSTAL_STRUCTURE_OF_ARCHAEOGLOBUS_FULGIDUS_O-PHOSPHOSERYL-_TRNA_SYNTHETASE_E418N/E420N_MUTANT_COMPLEXED_WITH_TRNAOPAL_AND_O-PHOSPHOSERINE_("OPAL_COMPLEX")_'_(PDB_01011)
89 -GCCAGGGUGGCAGAGGGGCUUUGCGGCGGACUUCAGAUCCGCUUUACCCCGGUUCGAAUCCGGGCCCUGGC
90 -(((((((..(((.........))).(((((.......))))).....(((((.......))))))))))))
91 ->>__'CRYSTAL_STRUCTURE_OF_ARCHAEOGLOBUS_FULGIDUS_O-PHOSPHOSERYL-_TRNA_SYNTHETASE_E418N/E420N_MUTANT_COMPLEXED_WITH_TRNAAMBER_AND_O-PHOSPHOSERINE_("AMBER_COMPLEX")_'_(PDB_01012)
92 -GCCAGGGUGGCAGAGGGGCUUUGCGGCGGACUCUAGAUCCGCUUUACCCCGGUUCGAAUCCGGGCCCUGGC
93 -(((((((..(((.........))).(((((.......))))).....(((((.......))))))))))))
94 ->>__'CRYSTAL_STRUCTURE_OF_RNASE_Z/TRNA(THR)_COMPLEX_'_(PDB_01053)
95 -GCUUCCAUAGCUCAGCAGGUAGAGCGUCAGCGGUUCGAGCCCGCUUGGAAGCU
96 -(((((((..((((........))))...(((((.......)))))))))))).
97 ->>__'PHENYLALANYL-TRNA_SYNTHETASE_FROM_THERMUS_THERMOPHILUS_COMPLEXED_WITH_TRNA_AND_A_PHENYLALANYL-ADENYLATE_ANALOG_'_(PDB_01134)
98 -GCCGAGGUAGCUCAGUUGGUAGAGCAUGCGACUGAAAAUCGCAGUGUCCGCGGUUCGAUUCCGCGCCUCGGCACCA
99 -(((((((....((........))...((((((.....)))))).....((((.........)))))))))))....
100 ->>__'GLUTAMINYL-TRNA_SYNTHETASE_MUTANT_C229R_WITH_BOUND_ANALOG_5\'-O-[N-(L-GLUTAMINYL)-SULFAMOYL]ADENOSINE_'_(PDB_01258)
101 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
102 -((((((..((((.......)))).(((((((...))))))).....(((((.......))))))))))).....
103 ->>__'GLUTAMINYL-TRNA_SYNTHETASE_MUTANT_C229R_WITH_BOUND_ANALOG_5\'-O-[N-(L-GLUTAMYL)-SULFAMOYL]ADENOSINE_'_(PDB_01259)
104 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGCAUUCCGAGGUUCGAAUCCUCGUACCCCAGCCA
105 -((((((..((((.......))))((((((((...))))))))....(((((.......))))))))))).....

142 KB | W: | H:

188 KB | W: | H:

  • 2-up
  • Swipe
  • Onion skin

114 KB | W: | H:

125 KB | W: | H:

  • 2-up
  • Swipe
  • Onion skin
...@@ -23,9 +23,9 @@ ...@@ -23,9 +23,9 @@
23 23
24 \abstract{\textbf{Motivation:} RNA loops have now been modelled and clustered from solved 3D structures into ordered collections of recurrent non-canonical interactions called "RNA modules", available in databases. This work explores what information from such modules can be used to improve secondary structure prediction.\\ 24 \abstract{\textbf{Motivation:} RNA loops have now been modelled and clustered from solved 3D structures into ordered collections of recurrent non-canonical interactions called "RNA modules", available in databases. This work explores what information from such modules can be used to improve secondary structure prediction.\\
25 \textbf{Results:} We propose a Pareto-based method for predicting RNA secondary structures by minimizing a bi-objective both energy-based and knowledge-based potential. The tool, called \textsc{Biorseo}, outputs the secondary structures from the Pareto set. We use it to compare several approaches to predict secondary structures using inserted RNA modules information: two different module data sources, Rna3Dmotifs and The RNA 3D Motif Atlas, and different ways to score the module insertions are compared: module size, module complexity, or module probability according to models like JAR3D and BayesPairing. We benchmark them against 344 known secondary structures. Some of the tested methods present a good performance, especially on structures containing pseudoknots. They are compared to state of the art tools for secondary structure prediction.\\ 25 \textbf{Results:} We propose a Pareto-based method for predicting RNA secondary structures by minimizing a bi-objective both energy-based and knowledge-based potential. The tool, called \textsc{Biorseo}, outputs the secondary structures from the Pareto set. We use it to compare several approaches to predict secondary structures using inserted RNA modules information: two different module data sources, Rna3Dmotifs and The RNA 3D Motif Atlas, and different ways to score the module insertions are compared: module size, module complexity, or module probability according to models like JAR3D and BayesPairing. We benchmark them against 344 known secondary structures. Some of the tested methods present a good performance, especially on structures containing pseudoknots. They are compared to state of the art tools for secondary structure prediction.\\
26 -\textbf{Availability:} The software is freely provided for Linux on \href{https://github.com/persalteas/biorseo/}{GitHub (https://github.com/persalteas/biorseo/)}, with the datasets. \\ 26 +\textbf{Availability:} The software is freely provided for Linux download on \href{https://evryrna.ibisc.univ-evry.fr/evryrna/biorseo/}{EvryRNA}, with the datasets. \\
27 \textbf{Contact:} \href{mailto:louis.becquey@univ-evry.fr}{louis.becquey@univ-evry.fr}\\ 27 \textbf{Contact:} \href{mailto:louis.becquey@univ-evry.fr}{louis.becquey@univ-evry.fr}\\
28 -\textbf{Supplementary information:} Supplementary data are available at \textit{Bioinformatics} 28 +\textbf{Supplementary information:} Appendices A, B and C are available at \textit{Bioinformatics}
29 online.} 29 online.}
30 30
31 \maketitle 31 \maketitle
...@@ -59,7 +59,8 @@ One hypothesis about RNA-MoIP's lack of performance is that it cannot distinguis ...@@ -59,7 +59,8 @@ One hypothesis about RNA-MoIP's lack of performance is that it cannot distinguis
59 59
60 To test this hypothesis, we design a method which builds a 2D structure by simultaneously placing base-pairs and modules in a single step, taking into account two objectives: the expected accuracy of the structure in the equilibrium ensemble fold, and a custom function that reflects the number and quality of inserted modules (several models are studied). This method leads to our new tool Biorseo (Bi-Objective RNA Structure Efficient Optimizer). Our approach avoids using a weighted linear combination of the objectives as done in RNA-MoIP (which can miss interesting solutions, the so-called \textit{non-supported} solutions). In this paper, we use a bi-objective Pareto-based approach, i.e. we identify all the non-dominated structures (the structures for which no other structure scores better on the two objectives). 60 To test this hypothesis, we design a method which builds a 2D structure by simultaneously placing base-pairs and modules in a single step, taking into account two objectives: the expected accuracy of the structure in the equilibrium ensemble fold, and a custom function that reflects the number and quality of inserted modules (several models are studied). This method leads to our new tool Biorseo (Bi-Objective RNA Structure Efficient Optimizer). Our approach avoids using a weighted linear combination of the objectives as done in RNA-MoIP (which can miss interesting solutions, the so-called \textit{non-supported} solutions). In this paper, we use a bi-objective Pareto-based approach, i.e. we identify all the non-dominated structures (the structures for which no other structure scores better on the two objectives).
61 61
62 -In the next section, this article presents the module models sources, insertion models and objective functions, and the procedure to compare them. Then we present a benchmark of all those variants against reference tools in Section \ref{sec:results}, using a reference dataset (verified structures from the RNA-Strand database). Three well-known reference RNAs are predicted and used to discuss the differences between the methods: \textit{E. coli}'s Gln tRNA, the Guanine riboswitch, and the human telomerase's pseudoknot. Finally, we recommend two prediction methods, and conclude. 62 +In the next section, this article presents the module models sources, insertion models and objective functions, and the procedure to compare them. Then we present a benchmark of all those variants against reference tools in Section \ref{sec:results}, using a reference dataset (verified structures from the RNA-Strand database). Three well-known reference RNAs are predicted and used to discuss the differences between the methods. %: \textit{E. coli}'s Gln tRNA, the Guanine riboswitch, and the human telomerase's pseudoknot.
63 +Finally, we recommend two prediction methods, and conclude.
63 64
64 \begin{figure*}[t] 65 \begin{figure*}[t]
65 \includegraphics[width=\textwidth]{fig/graph_abstract.jpg} 66 \includegraphics[width=\textwidth]{fig/graph_abstract.jpg}
...@@ -79,7 +80,7 @@ Our main procedure is the following: ...@@ -79,7 +80,7 @@ Our main procedure is the following:
79 \item \textbf{Optimization step:} Find a secondary structure that satisfies as much as possible both the expected accuracy of the structure and a criterion taking into account module inclusions, by solving a bi-objective integer linear programming problem, using the constraints defined in the previous step. 80 \item \textbf{Optimization step:} Find a secondary structure that satisfies as much as possible both the expected accuracy of the structure and a criterion taking into account module inclusions, by solving a bi-objective integer linear programming problem, using the constraints defined in the previous step.
80 \end{itemize} 81 \end{itemize}
81 82
82 -The linear integer programming framework used to define the constraints and solve the resulting optimization problem is similar to previous works like IPknot, Biokop or RNA-MoIP~(\citealp{sato_ipknot:_2011,legendre_bi-objective_2018,reinharz_towards_2012}), but involves new constraints detailed in supplementary material. 83 +The linear integer programming framework used to define the constraints and solve the resulting optimization problem is similar to previous works like IPknot, Biokop or RNA-MoIP~(\citealp{sato_ipknot:_2011,legendre_bi-objective_2018,reinharz_towards_2012}), but involves new constraints detailed in appendix A.
83 Figure \ref{fig:pipeline} summarizes the procedure on a graphical pipeline. 84 Figure \ref{fig:pipeline} summarizes the procedure on a graphical pipeline.
84 85
85 \subsection{Pattern matching step}\label{sec:models} 86 \subsection{Pattern matching step}\label{sec:models}
...@@ -93,8 +94,8 @@ Several methods have been proposed to tackle the issue of finding if a sequence ...@@ -93,8 +94,8 @@ Several methods have been proposed to tackle the issue of finding if a sequence
93 94
94 95
95 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 96 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
96 -\subsection{Constraints definition step and integer programming model} \label{sec:ip} 97 +\subsection{Constraints definition step and IP model} \label{sec:ip}
97 -The full list of variables we used to model the problem in an integer linear program and the linear formulation of each constraint are detailed in supplementary material. Here we propose different objective functions to maximize, whose performances are compared in section \ref{sec:results}. 98 +The full list of variables we used to model the problem in an integer linear program and the linear formulation of each constraint are detailed in Appendix A. Here we propose different objective functions to maximize, whose performances are compared in section \ref{sec:results}.
98 99
99 \paragraph{Notations} ~ We call a \textit{component} a piece of strand which forms an unpaired portion of a module. Components of a module are linked together by canonical base-pairs at their extremities to form a loop. Let $x$ be a module which could be inserted at some defined position in the sequence. Let $\|x\|$ be the number of components of this module, and $k_{x,i}$ the nucleotide count of the $i$th component of $x$. When a scoring model is used (JAR3D or BayesPairing), we denote $p(x)$ the score value of $x$ inserted at the defined position. Let $p_{uv}$ be the probability for nucleotides $u$ and $v$ (with $v>u+3$) to form a canonical base-pair. We use NUPACK's dynamic programming scheme~(\citealp{dirksAlgorithmComputingNucleic2004}), which supports pseudoknots, to compute such probabilities. We denote $y^u_v$ the binary decision variable indicating that these nucleotides do form a canonical base pair, and $C^x_1$ the decision binary variable indicating whether the module $x$ will be inserted or not. The resolution of the linear program outputs solutions by fixing definitive values for the different $y^u_v$ and $C^x_1$. 100 \paragraph{Notations} ~ We call a \textit{component} a piece of strand which forms an unpaired portion of a module. Components of a module are linked together by canonical base-pairs at their extremities to form a loop. Let $x$ be a module which could be inserted at some defined position in the sequence. Let $\|x\|$ be the number of components of this module, and $k_{x,i}$ the nucleotide count of the $i$th component of $x$. When a scoring model is used (JAR3D or BayesPairing), we denote $p(x)$ the score value of $x$ inserted at the defined position. Let $p_{uv}$ be the probability for nucleotides $u$ and $v$ (with $v>u+3$) to form a canonical base-pair. We use NUPACK's dynamic programming scheme~(\citealp{dirksAlgorithmComputingNucleic2004}), which supports pseudoknots, to compute such probabilities. 
We denote $y^u_v$ the binary decision variable indicating that these nucleotides do form a canonical base pair, and $C^x_1$ the decision binary variable indicating whether the module $x$ will be inserted or not. The resolution of the linear program outputs solutions by fixing definitive values for the different $y^u_v$ and $C^x_1$.
100 101
...@@ -105,7 +106,7 @@ Let $X$ be the set of all our decision variables, then the different objective f ...@@ -105,7 +106,7 @@ Let $X$ be the set of all our decision variables, then the different objective f
105 \begin{equation} f_{1C}(X) = \sum_{x} p(x) \times C^x_1 \label{eq:C}\end{equation} 106 \begin{equation} f_{1C}(X) = \sum_{x} p(x) \times C^x_1 \label{eq:C}\end{equation}
106 \begin{equation}f_{1D}(X) = \sum_{x} \left[ \frac{\|x\|}{\log_2(\sum_{i=1}^{\|x\|}k_{x,i})} \times p(x) \times C^x_1 \right]\label{eq:D}\end{equation} 107 \begin{equation}f_{1D}(X) = \sum_{x} \left[ \frac{\|x\|}{\log_2(\sum_{i=1}^{\|x\|}k_{x,i})} \times p(x) \times C^x_1 \right]\label{eq:D}\end{equation}
107 108
108 -Regarding the second objective, aimed at maximizing the expected accuracy of the structures, we use $f_2(X) = \sum y^u_v \times p_{uv} \times I[p_{uv}>\theta]$. As first proposed by~(\citealp{sato_ipknot:_2011}), $f_2$ uses a parameter $\theta$ to ignore very unlikely base-pairs. This prevents the explosion of the number of variables and allows a fast resolution of the IP problem. 109 +Regarding the second objective, aimed at maximizing the expected accuracy of the structures, we use $f_2(X) = \sum y^u_v \times p_{uv} \times I[p_{uv}>\theta]$. As first proposed by~(\citealp{sato_ipknot:_2011}), $f_2$ uses a parameter $\theta = 0.001$ to ignore very unlikely base-pairs. This prevents the explosion of the number of variables and allows a fast resolution of the IP problem.
109 110
110 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 111 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
111 112
...@@ -159,67 +160,53 @@ $s$:= maximize($f_1$, $\lambda_{min}$, $\lambda_{max}$, F)\; ...@@ -159,67 +160,53 @@ $s$:= maximize($f_1$, $\lambda_{min}$, $\lambda_{max}$, F)\;
159 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 160 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
160 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 161 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
161 \section{Results}\label{sec:results} 162 \section{Results}\label{sec:results}
162 -
163 \begin{figure*}[!tbp] 163 \begin{figure*}[!tbp]
164 \includegraphics[width=\textwidth]{fig/Benchmark.jpg} 164 \includegraphics[width=\textwidth]{fig/Benchmark.jpg}
165 \caption{Boxplots of the best MCC over the proposed solutions for each of the RNAs, for all method variants. Line (A) shows the methods that cannot find pseudoknots: RNAsubopt, RNA-MoIP, and the 14 variants of Biorseo's bi-objective methods with a constraint that explicitly forbids pseudoknots. Line (B) shows methods which allow their prediction: Biokop, and the 14 variants without the no-pseudoknot constraint. The left block gathers methods which use module data from Rna3Dmotifs (\citealp{djelloul_automated_2008}). The right one gathers those which use modules from the RNA 3D Motif Atlas (\citealp{petrov_automated_2013}). Boxplots surrounded by a dotted red frame use direct pattern matching to detect insertion sites, but do not score the sites. Those surrounded by a continuous blue frame score the sites with JAR3D (\citealp{zirbel_identifying_2015}) to score modules on loop sequences found by RNAsubopt. The remaining surrounded by a dashed green frame use the BayesPairing score~(\citealp{sarrazin2019automated}).} 165 \caption{Boxplots of the best MCC over the proposed solutions for each of the RNAs, for all method variants. Line (A) shows the methods that cannot find pseudoknots: RNAsubopt, RNA-MoIP, and the 14 variants of Biorseo's bi-objective methods with a constraint that explicitly forbids pseudoknots. Line (B) shows methods which allow their prediction: Biokop, and the 14 variants without the no-pseudoknot constraint. The left block gathers methods which use module data from Rna3Dmotifs (\citealp{djelloul_automated_2008}). The right one gathers those which use modules from the RNA 3D Motif Atlas (\citealp{petrov_automated_2013}). Boxplots surrounded by a dotted red frame use direct pattern matching to detect insertion sites, but do not score the sites. Those surrounded by a continuous blue frame score the sites with JAR3D (\citealp{zirbel_identifying_2015}) to score modules on loop sequences found by RNAsubopt. 
The remaining surrounded by a dashed green frame use the BayesPairing score~(\citealp{sarrazin2019automated}).}
166 \label{fig:upgrades} 166 \label{fig:upgrades}
167 \end{figure*} 167 \end{figure*}
168 -
169 All the methods introduced return an ensemble of possible secondary structures for a given input sequence. We compare them in a benchmark over short RNA structures and a few well-known application cases. 168 All the methods introduced return an ensemble of possible secondary structures for a given input sequence. We compare them in a benchmark over short RNA structures and a few well-known application cases.
170 -
171 \subsection{Benchmark protocol} \label{sec:bench} 169 \subsection{Benchmark protocol} \label{sec:bench}
172 -We repeated the benchmark twice: first, by forbidding explicitly the formation of pseudoknots with additional constraints (for fair comparison with RNA-MoIP). Then, a second one without such limitation, to reach maximum performance.
173 170
174 \paragraph{Materials} 171 \paragraph{Materials}
175 -The tool has been conceived to be used by a regular scientist on a desktop computer. All computations were run on a workstation with AMD Ryzen 2700X (16 threads @4.3GHz) CPU, 16 GB of RAM. A prediction typically takes a few seconds, sometimes minutes. The time required grows with both the nucleotide count and the number of loops. The objective functions $f_{1A}$ and $f_{1B}$ were sometimes not discriminative enough and equally ranked a large number of module propositions, leading to combinatorial issues. For that reason, we arbitrarily stopped the jobs exceeding 500 structures in the Pareto set, because they would require over 2 hours of computation time to complete. 172 +The tool can be used by a regular scientist on a desktop computer. All computations were performed on a workstation (16 threads @4.3GHz) CPU, 16 GB of RAM. The RAM typically limits the size of the RNAs the methods can process. RNAs up to 230 bases are fine in our case. A prediction typically takes a few seconds, sometimes minutes. The time required grows with both the nucleotide count and the number of loops. The objective functions $f_{1A}$ and $f_{1B}$ were sometimes not discriminative enough and equally ranked a large number of module propositions, leading to combinatorial issues. For that reason, we arbitrarily stopped the jobs exceeding 500 structures in the Pareto set, because they would require over 2 hours of computation time to complete.
176 173
177 \paragraph{Data sources} \label{sec:data} 174 \paragraph{Data sources} \label{sec:data}
178 -A set of RNA secondary structures was extracted from the RNA-Strand database ~(\citealp{andronescu2008rna}). We selected the RNAs for which experimental proof of the structure exists, with size varying between 10 and 100 nucleotides. Sequences containing consensus letters, for example R for a purine (A or G), or modified nucleotides (P, T, I in our case) were discarded. %In addition, we add a collection of 264 pseudoknotted RNAs taken from the Pseudobase database~(\citealp{van2000pseudobase}), covering all pseudoknot families, of length between 10 and 100 nucleotides. 175 +A first dataset of RNA secondary structures was extracted from the RNA-Strand database ~(\citealp{andronescu2008rna}). We selected the RNAs for which experimental proof of the structure exists, with size varying between 10 and 100 nucleotides. Sequences containing modified nucleotides (P, T, I in our case) were discarded. The resulting set contains 334 secondary structures of various RNA families, 74 of them containing pseudoknots. We repeated the experiments twice: first, by forbidding explicitly the formation of pseudoknots with additional constraints (for fair comparison with RNA-MoIP). Then, a second one without such limitation, to reach maximum performance. Due to the combinatorial issues described above, only 291 (resp. 294) of the RNAs have been predicted by all the proposed methods (the missing ones often being a combination of direct pattern-matching or JAR3D with $f_{1A}$ or $f_{1B}$) when forbidding (resp. allowing) pseudoknots. We will use these sets of results for comparison. \\
179 -The final dataset contains 334 secondary structures of various RNA families, 74 of them containing pseudoknots. Due to the combinatorial issues described above, only 291 (resp. 294) of them have been predicted by all the proposed methods (the missing ones often being a combination of direct pattern-matching or JAR3D with $f_{1A}$ or $f_{1B}$) when forbidding (resp. allowing) pseudoknots. We will use these set of results for comparison. 176 +In addition, we add a second collection of 264 pseudoknotted-only RNAs from the Pseudobase database~(\citealp{van2000pseudobase}), covering all pseudoknot families, of length also comprised between 10 and 100 nucleotides.\\
177 +To complete the large benchmark, we have a deeper look at very-well known structures to check if relevant combinations of models are still able to predict them correctly.
178 +We used a Gln tRNA from E. coli (RNA-Strand code PDB\_00376), a Guanine riboswitch (RNA-Strand code PDB\_01023), and the pseudoknot of the human telomerase (PDB\_00857). The tRNA is unpseudoknotted, the G riboswitch contains a hard-to-predict HHH type pseudoknot, and the telomerase pseudoknot is a simple H type pseudoknot.
180 179
181 -In addition to the large benchmark, we have a deeper look at very-well known structures to check if relevant combinations of models are still able to predict them correctly.
182 -We used a Gln tRNA from E. coli (RNA-Strand code PDB\_00376), a Guanine riboswitch (RNA-Strand code PDB\_01023), and the pseudoknot of the human telomerase (PDB\_00857). The tRNA contains a tiny, improbable pseudoknot. The G riboswitch is unpseudoknotted, and the telomerase pseudoknot is a simple H type pseudoknot.
183 -
184 -
185 \paragraph{Reference comparison methods} 180 \paragraph{Reference comparison methods}
186 To study the usefulness of the data sources, objective functions, and module placement methods, we added state-of-the-art tools to the comparison. The same RNA sequences were submitted to RNA-MoIP for direct performance comparison. We used RNAsubopt as a reference method without pseudoknot support, because it is fast, widely used, easy to understand and returns several solutions. We used Biokop, the bi-objective integer programming framework, as a reference method for prediction of secondary structures with pseudoknots. Both tools outperform other state-of-the-art tools in their respective categories, see the appropriate papers (\citealp{lorenz2011viennarna, legendre_bi-objective_2018}) for more benchmarks against other tools. 181 To study the usefulness of the data sources, objective functions, and module placement methods, we added state-of-the-art tools to the comparison. The same RNA sequences were submitted to RNA-MoIP for direct performance comparison. We used RNAsubopt as a reference method without pseudoknot support, because it is fast, widely used, easy to understand and returns several solutions. We used Biokop, the bi-objective integer programming framework, as a reference method for prediction of secondary structures with pseudoknots. Both tools outperform other state-of-the-art tools in their respective categories, see the appropriate papers (\citealp{lorenz2011viennarna, legendre_bi-objective_2018}) for more benchmarks against other tools.
187 182
188 -\paragraph{Metrics} ~ We compute the Matthews correlation coefficient (MCC) between the real secondary structure and every proposition. The coefficient is defined as 183 +\paragraph{Metrics} ~ We compute the Matthews correlation coefficient (MCC) between the real secondary structure and every proposed structure. The coefficient is defined as
189 \begin{equation} 184 \begin{equation}
190 MCC = \frac{TP. TN - FP. FN}{\sqrt{(TP+FP)(TP+FN)(TN+FP)(TN+FN)}}. \label{eq:MCC} 185 MCC = \frac{TP. TN - FP. FN}{\sqrt{(TP+FP)(TP+FN)(TN+FP)(TN+FN)}}. \label{eq:MCC}
191 \end{equation} 186 \end{equation}
192 -Then, we keep the best MCC value found as a metric of the method's performance. This reflects if the true structure is included or not in the Pareto set. Averaging over the structures would be a nonsense, because some RNAs may exist in several meta-stable states which are very different in their list of base-pairs. Here, we measure if one of the states, which is reported in RNA-Strand as "true" structure, has been found. The choice of MCC over accuracy or F1 score is justified by the very large difference between the size of the classes: there exist much more negative base-pairs (pairs of nucleotides that do not interact) than positive ones in any secondary structure. 187 +Then, we keep the best MCC value found over the set of proposed structures as a metric of the method's performance. This reflects if the true structure is included or not in the Pareto set. Here, we measure if one of the states, which is reported in RNA-Strand as "true" structure, has been found. For comprehensiveness, results with average MCC are also provided in Appendix B, but it is hard to interpret what this average MCC represents. The choice of MCC over accuracy or F1 score is justified by the very large difference between the size of the classes: there exist much more negative base-pairs (pairs of nucleotides that do not interact) than positive ones in any secondary structure.
193 -
194 -\begin{table*}[!t]
195 -\processtable{MCC results for study cases. Pseudoknots are allowed. \label{Tab:01}} {
196 -\begin{tabular}{@{}rlllllll@{}}\toprule & RNAsubopt & RNA-MoIP & BiokoP & Rna3Dmotifs & Rna3Dmotifs & RNA 3D Motif Atlas & RNA 3D Motif Atlas\\
197 - & & & & + Direct P.M. & + BayesPairing & + JAR3D & + BayesPairing \\\midrule
198 -PDB\_00376 & 0.68 & 0.68 & 0.67 & 0.72 (A,B) & 0.74 (B,C,D), 0.71 (A) & 0.74 (A,C,D), 0.72 (B) & 0.76 (\textit{all})\\
199 -PDB\_01023 & 0.86 & 0.86 & 0.59 & 0.79 (A,B) & 0.29 (\textit{all}) & 0.82 (\textit{all}) & 0.82 (\textit{all})\\
200 -PDB\_00857 & 0.77 & 0.77 & 1.0 & 0.97 (B), 0.77 (A) & 0.97 (\textit{all}) & 0.97 (\textit{all}), & 0.97 (\textit{all})\\\botrule
201 -\end{tabular}}{The first line gives the results for E.\textit{coli} tRNA Gln (PDB\_00376), the second line for the glycine riboswitch (PDB\_01023), and the third for the human telomerase's pseudoknot (PDB\_00857). We observe that the best structure is often the same when we use the different objective functions $f_{1A}, f_{1B}, f_{1C}, f_{1D}$, but the rest of the set can be different.}
202 -\end{table*}
203 -
204 188
205 \subsection{Benchmark results} 189 \subsection{Benchmark results}
206 Performance results under the form of best MCC are summarized in Figure \ref{fig:upgrades}. 190 Performance results under the form of best MCC are summarized in Figure \ref{fig:upgrades}.
207 The majority of the RNAs were predicted with similar performance among the methods, including methods that do not use module information. 191 The majority of the RNAs were predicted with similar performance among the methods, including methods that do not use module information.
208 192
209 -No data source, nor objective function taken alone performs significantly better than the other ones. No one distinguishes itself alone to improve the performance. 193 +No data source, nor objective function taken alone performs significantly better than the other ones.
210 194
211 -\paragraph{Methods without pseudoknots, comparison to RNAsubopt} ~ No method reaches RNAsubopt's scores. The most performing model is the use of The RNA 3D Motif Atlas modules, placed with JAR3D, and scored only with the JAR3D score ($f_{1C}$). We also notice that the 4 models (including this one) which use JAR3D return very small sets of results, most of them being one optimal solution, while RNAsubopt returns from one to ten solutions (with our dataset and default settings). 195 +\paragraph{Methods without pseudoknots, comparison to RNAsubopt} ~ No method reaches RNAsubopt's scores. The most performing model is the use of the RNA 3D Motif Atlas modules, placed with JAR3D, and scored only with the JAR3D score ($f_{1C}$). We also notice that the 4 models (including this one) which use JAR3D return very small sets of results, most of them being one optimal solution, while RNAsubopt returns from one to ten solutions (with our dataset and default settings).
212 196
213 -\paragraph{With pseudoknots, comparison to Biokop} ~ The number of solutions returned doubles for every method compared to its no-pseudoknot version. Most of the RNAs are predicted with small knots when the method allows it. As the bottom line of Figure \ref{fig:upgrades} shows, the methods which use BayesPairing do not find as many right structures than Biokop, which performs better even without module information. On the other hand, the methods which use direct pattern-matching (like RNA-MoIP did) and the four ones which use RNAsubopt+JAR3D reach higher performance. The largest improvements concern Biorseo + Rna3Dmotifs + $f_{1B}$ and Biorseo + The RNA 3D Motif Atlas + JAR3D + $f_{1B}$. We apply Wilcoxon signed rank tests (non-parametric test for paired samples) to assert these methods distributions of results differ from Biokop (null hypothesis: "The position parameter of the distribution of the differences between the two paired samples is null"): p-values are $1.5\times 10^{-2}$ for the first method using Rna3Dmotifs, and $2.5\times 10^{-3}$ for the second with JAR3D. Other methods have more statistically significant differences with BiokoP, but smaller improvements in average, so we do not provide all the less interesting statistical tests results. 197 +\paragraph{With pseudoknots, comparison to Biokop} ~ The number of solutions returned doubles for every method compared to its no-pseudoknot version. Most of the RNAs are predicted with small knots as the method allows it. As the bottom line of Figure \ref{fig:upgrades} shows, the methods which use BayesPairing do not find as many right structures as Biokop, which performs better even without module information. On the other hand, the methods which use direct pattern-matching (like RNA-MoIP did) and the four ones which use RNAsubopt+JAR3D reach higher performance. The largest improvements concern Biorseo + Rna3Dmotifs + $f_{1B}$ and Biorseo + The RNA 3D Motif Atlas + JAR3D + $f_{1B}$. 
We apply Wilcoxon signed rank tests (non-parametric test for paired samples) to assert these methods distributions of results differ from Biokop (null hypothesis: "The position parameter of the distribution of the differences between the two paired samples is null"): p-values are $1.5\times 10^{-2}$ for the first method using Rna3Dmotifs, and $2.5\times 10^{-3}$ for the second with JAR3D. Other methods have smaller improvements in average, so we do not provide all the statistical tests results.
214 198
215 -\subsection{Results of the study cases} 199 +\begin{figure}[t]
216 -The results about very well described structures (Table~\ref{Tab:01}) are consistent with the general benchmark. Biorseo used with Rna3Dmotifs and direct pattern matching predicts 2D structures as well as RNAsubopt. So does it when used with the RNA 3D Motif Atlas and JAR3D. 200 +\centerline{\includegraphics[width=\linewidth]{fig/pseudobase_zoom.jpg}}
217 -BayesPairing has a low performance when used with Rna3Dmotifs and a surprising good performance with the Motif Atlas, which cannot be generalized given the larger benchmark data. 201 +\caption{Boxplots of the best MCC found by the different predictors, on the dataset of 264 pseudoknotted-only RNAs from Pseudobase~(\citealp{van2000pseudobase}). Some computations are incomplete; Biokop succeeded for 201 RNAs, JAR3D+$f_{1A}$ for 249, and JAR3D+$f_{1B}$ for 248. \label{fig:pseudobase}}
202 +\end{figure}
203 +The results on the dataset of pseudoknotted-only structures are presented on Figure~\ref{fig:pseudobase}. Whatever the Biorseo variant, the median best MCC is over 0.8, which is a significant improvement compared to Biokop (0.73, with larger variance). However, the variants are very similar and again, no module source, pattern-matching method nor objective function distinguishes itself.
218 204
219 -The tRNA is an example of structure which is approximately correctly predicted by Biorseo, as well as other tools. 205 +\subsection{Results of the study cases}
220 -The G riboswitch allows to show that Biorseo is less likely to insert false positive pseudo-knots than Biokop (which inserted one, resulting in a low score, structures not shown). 206 +The results about very well described structures are consistent with the general benchmark. Biorseo used with Rna3Dmotifs and direct pattern matching predicts 2D structures as well as RNAsubopt. So does it when used with the RNA 3D Motif Atlas and JAR3D. BayesPairing has a low performance when used with Rna3Dmotifs and a surprising good performance with the Motif Atlas, which cannot be generalized given the larger benchmark data.
207 +The tRNA is an example of structure which is approximately correctly predicted by Biorseo, as well as other tools. The G riboswitch allows to show that Biorseo is less likely to insert false positive pseudo-knots than Biokop (which inserted several, resulting in a low score).
221 The telomerase pseudoknot is correctly predicted by all methods that support pseudoknots, including Biorseo. 208 The telomerase pseudoknot is correctly predicted by all methods that support pseudoknots, including Biorseo.
222 - 209 +Detailed results including structures, number of solutions and computation times are provided in appendix C.
223 210
224 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 211 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
225 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 212 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
...@@ -227,15 +214,15 @@ The telomerase pseudoknot is correctly predicted by all methods that support pse ...@@ -227,15 +214,15 @@ The telomerase pseudoknot is correctly predicted by all methods that support pse
227 214
228 \paragraph{Without pseudoknots, comparison to RNA-MoIP} ~ An interesting point is the improvement between RNA-MoIP and the bi-objective method which uses direct-pattern matching to spot insertion sites and $f_{1A}$ to score the insertions. This method only differs from RNA-MoIP because it is bi-objective. The improvement is small, but statistically significant with a Wilcoxon's signed rank test p-value of 0.024. Then, we can conclude that the Pareto approach really improves the structure prediction by itself. This result supports our hypothesis about RNA-MoIP breaking important basepairs. Unfortunately, this improvement in average is counterbalanced by an increase in variance. 215 \paragraph{Without pseudoknots, comparison to RNA-MoIP} ~ An interesting point is the improvement between RNA-MoIP and the bi-objective method which uses direct-pattern matching to spot insertion sites and $f_{1A}$ to score the insertions. This method only differs from RNA-MoIP because it is bi-objective. The improvement is small, but statistically significant with a Wilcoxon's signed rank test p-value of 0.024. Then, we can conclude that the Pareto approach really improves the structure prediction by itself. This result supports our hypothesis about RNA-MoIP breaking important basepairs. Unfortunately, this improvement in average is counterbalanced by an increase in variance.
229 216
230 -\paragraph{Regarding pseudoknots} ~ The support of pseudoknots allows a small increase in performance, because we return more solutions, some with pseudoknots, and some without. As we are looking at the maximum MCC here, the appropriate solution has been selected for each RNA. 217 +\paragraph{Regarding pseudoknots} ~ The support of pseudoknots allows an increase in performance, because we return more solutions, some with pseudoknots, and some without. As we are looking at the maximum MCC here, the appropriate solution has been selected for each RNA.
231 -Biorseo's reduced number of false positive pseudoknots compared to Biokop can be explained directly by the insertion of modules in loops: as we forbid explicitly any base-pair on nucleotides inside an inserted module's component, we prevent known loops to form pseudoknots. The drawback of this selectivity is the low ability to predict kissing hairpins (HHH type pseudoknots), precisely because they require loops to interact. 218 +Biorseo's reduced number of false positive pseudoknots compared to Biokop can be explained directly by the insertion of modules in loops: as we forbid explicitly any base-pair on nucleotides inside an inserted module's component, we prevent known loops to form pseudoknots. The drawback of this selectivity is the low ability to predict kissing hairpins (HHH type pseudoknots), precisely because they require loops to interact, which happened with our G riboswitch prediction. But for general purpose, given the performance difference between Figures \ref{fig:upgrades} and \ref{fig:pseudobase}, it seems that Biorseo is better on pseudoknotted RNAs.
232 -However, pseudoknot prediction quality is difficult to assess with a metric like MCC, because a pseudoknot could be involved in only two or three base-pairs. Finding them or not does not alter much the MCC even if the structure is much more right or wrong from a biological point of view. Unfortunately, no automated assertion method exist yet. A more accurate description of pseudoknot prediction performance would have required manual validation of every occurrence in every structure, which is too much work to achieve on such large datasets. 219 +However, pseudoknot prediction quality is difficult to assess with a metric like MCC, because a pseudoknot could involve only two or three base-pairs. Finding them or not does not alter the MCC much even if the structure is much more right or wrong from a biological point of view. Unfortunately, no automated verification method exists yet. A more accurate description of pseudoknot prediction performance would have required manual validation of every occurrence in every structure, which is too much work to achieve on such large datasets.
233 220
234 \paragraph{On the objective functions} ~ 221 \paragraph{On the objective functions} ~
235 -Regarding objective functions to include modules, the different criteria proposed seem to give comparable results at first sight regarding the average performance and the dispersion. However, an important difference between $f_{1A}$, $f_{1B}$ on one side, and $f_{1C}$, $f_{1D}$ on the other side, is about the computation time. As $f_{1A}$, $f_{1B}$ do not use a score to rank potential module insertion sites, every modules of the same size can be equally inserted. When the RNA presents several loops, the combinatorial possibilities grow fast with the number of modules in the dataset. Therefore, the number of undominated solutions can reach several hundreds or thousands even for short sequences. As explained in section \ref{sec:bench}, some of the computations never ended because of combinatorial issues with those objectives. Such large Pareto sets are not informative for our application, because they consist in very redundant secondary structures with different module references, which are counted only for one secondary structure solution at the end. On the other hand, $f_{1C}$ and $f_{1D}$ require the run of an additional tool (JAR3D or BayesPairing) to score the insertion sites. Given an RNA length, a compromise must be found . 222 +Regarding objective functions to include modules, the different criteria proposed seem to give comparable results at first sight regarding the average performance and the dispersion. However, an important difference between $f_{1A}$, $f_{1B}$ on one side, and $f_{1C}$, $f_{1D}$ on the other side, is about the computation time. As $f_{1A}$, $f_{1B}$ do not use a score to rank potential module insertion sites, every modules of the same size can be equally inserted. When the RNA presents several loops, the combinatorial possibilities grow fast with the number of modules in the dataset. Therefore, the number of undominated solutions can reach several hundreds or thousands even for short sequences. 
As mentioned in section \ref{sec:bench}, some of the computations never ended because of combinatorial issues with those objectives. Such large Pareto sets are not informative for our application, because they consist of very redundant secondary structures with different module references, which are counted only for one secondary structure solution at the end. On the other hand, $f_{1C}$ and $f_{1D}$ require the run of an additional tool (JAR3D or BayesPairing) to score the insertion sites. Given an RNA, a compromise must be found according to its length and number of loops.
236 223
237 \paragraph{The bias with JAR3D} ~ One should keep in mind that JAR3D takes as input the sequences of RNA loops to score modules against them. We detected the loops in the RNA sequence with RNAsubopt. This use of JAR3D is biased, because we score modules on sequence portions that we already know unlikely to form stems and likely to form loops. Therefore, the information brought by the insertion of a module is low. 224 \paragraph{The bias with JAR3D} ~ One should keep in mind that JAR3D takes as input the sequences of RNA loops to score modules against them. We detected the loops in the RNA sequence with RNAsubopt. This use of JAR3D is biased, because we score modules on sequence portions that we already know unlikely to form stems and likely to form loops. Therefore, the information brought by the insertion of a module is low.
238 -Then, the enthusiasm about the bi-objective method with JAR3D and $f_{1C}$ (without pseudoknots) has to be moderate. It actually outputs almost the same secondary structures than RNAsubopt, discarding certain ones sometimes. As that method was the only interesting result without pseudoknots, we can argue that including known modules is not a general way to improve secondary structure prediction without pseudoknots. For every method, the average best MCC is below RNAsubopt, and the performance gain obtained on some structures is counterbalanced by the loss on approximately the same number of RNAs or more. 225 +Then, the enthusiasm about the bi-objective method with JAR3D and $f_{1C}$ (without pseudoknots) has to be moderated. It actually outputs almost the same secondary structures as RNAsubopt, discarding certain ones sometimes. As that method was the only interesting result without pseudoknots, we can argue that including known modules is not a general way to improve secondary structure prediction without pseudoknots. For every method, the median best MCC is below RNAsubopt, and the performance gain obtained on some structures is counterbalanced by the loss on approximately the same number of RNAs or more.
239 226
240 We also observe that using The RNA 3D Motif Atlas with JAR3D has a significantly different behavior than the other methods: first, it returns a very small number of solutions (1 or 2 most of the time). Then, the best structure is almost every-time the one that has the higher number of modules, while it is not the case for the other methods. This is a good point for method JAR3D-$f_{1C}$ which performs almost as well as RNAsubopt by returning only one or two structures. An explanation is that JAR3D is selective of a few module insertion sites, sites that were first perfectly predicted to be loops by RNAsubopt (as discussed earlier). This confirms the use of module information is not always relevant and that the energy criteria brings almost all the information. The modules only sometimes allow to reduce the number of solutions. 227 We also observe that using The RNA 3D Motif Atlas with JAR3D has a significantly different behavior than the other methods: first, it returns a very small number of solutions (1 or 2 most of the time). Then, the best structure is almost every-time the one that has the higher number of modules, while it is not the case for the other methods. This is a good point for method JAR3D-$f_{1C}$ which performs almost as well as RNAsubopt by returning only one or two structures. An explanation is that JAR3D is selective of a few module insertion sites, sites that were first perfectly predicted to be loops by RNAsubopt (as discussed earlier). This confirms the use of module information is not always relevant and that the energy criteria brings almost all the information. The modules only sometimes allow to reduce the number of solutions.
241 228
...@@ -253,11 +240,13 @@ For that reason, we recommend three models : if the user does not expect pseudok ...@@ -253,11 +240,13 @@ For that reason, we recommend three models : if the user does not expect pseudok
253 We developed a general bi-objective method to benchmark different sources of RNA module models (the RNA 3D Motif Atlas and Rna3Dmotifs), different methods to place them in sequences (direct pattern matching, BayesPairing, and JAR3D), and different scoring functions. The bi-objective method uses the expected accuracy of the structure, and the previous scoring functions to select relevant secondary structures. 240 We developed a general bi-objective method to benchmark different sources of RNA module models (the RNA 3D Motif Atlas and Rna3Dmotifs), different methods to place them in sequences (direct pattern matching, BayesPairing, and JAR3D), and different scoring functions. The bi-objective method uses the expected accuracy of the structure, and the previous scoring functions to select relevant secondary structures.
254 241
255 The results show that no data source prevails. They also show that the use of module information is irrelevant to predict structures without pseudoknots. 242 The results show that no data source prevails. They also show that the use of module information is irrelevant to predict structures without pseudoknots.
256 -The real interest would be when looking for potential pseudoknots, where several of our methods can improve the prediction performance (and computation times) compared to state-of-the-art tools. 243 +The real interest would be when looking for potential pseudoknots, where several of our methods improve the prediction performance (and computation times) compared to state-of-the-art tools.
244 +
245 +Some of our models outperform RNA-MoIP, a previous attempt to predict better secondary structures using module information from Rna3Dmotifs and a linear combination of two objectives into a scoring function. Our simplest best-performing new method could be interpreted as an upgraded RNA-MoIP with a real bi-objective framework and a better module insertion objective, which predicts the base pairs and the module insertions in a row, preventing the insertions from breaking important base-pairs.
257 246
258 -Some of our models over-perform RNA-MoIP, a previous attempt to predict better secondary structures using module information from Rna3Dmotifs and a linear combination of two objectives into a scoring function. Our simplest best-performing new method could be interpreted as an upgraded RNA-MoIP with updated data (there is a 10-fold increase in the number of solved RNA crystal structures between 2008 and 2018) and a real bi-objective framework, which predicts the base pairs and the module insertions in a row, preventing the insertion to break important base-pairs. 247 +All Biorseo variants are available as a web service or for download on the EvryRNA website.
259 248
260 -Improvement perspectives now rely on the hope than newer databases like CaRNAval~(\citealp{reinharz2018mining}), containing more recent and more diverse module information, to really bring more information to assist the energy criteria.\\ 249 +Improvement perspectives now rely on the hope that newer databases like CaRNAval~(\citealp{reinharz2018mining}), containing more recent and more diverse module information (there is a 10-fold increase in the number of solved RNA crystal structures between the original Rna3Dmotifs dataset from 2008, and 2018), will really bring more information to assist the energy criteria.\\
261 250
262 \bibliographystyle{natbib} 251 \bibliographystyle{natbib}
263 %\bibliographystyle{achemnat} 252 %\bibliographystyle{achemnat}
......
1 \documentclass{article} 1 \documentclass{article}
2 \usepackage[utf8]{inputenc} 2 \usepackage[utf8]{inputenc}
3 +\usepackage{graphicx}
3 \usepackage{amsmath} 4 \usepackage{amsmath}
4 \usepackage{stmaryrd} % llbracket, rrbracket 5 \usepackage{stmaryrd} % llbracket, rrbracket
5 \usepackage{siunitx} % SI units 6 \usepackage{siunitx} % SI units
...@@ -17,7 +18,7 @@ The constraints have been rewritten by us, but are inspired by works like IPknot ...@@ -17,7 +18,7 @@ The constraints have been rewritten by us, but are inspired by works like IPknot
17 \paragraph{Extended notations} ~ Here we repeat the definition of the variables that we already used in the article, and we use a few more, that also are defined:\\ 18 \paragraph{Extended notations} ~ Here we repeat the definition of the variables that we already used in the article, and we use a few more, that also are defined:\\
18 Let $n$ be the number of nucleotides in the query RNA sequence $s$.\\ 19 Let $n$ be the number of nucleotides in the query RNA sequence $s$.\\
19 Let $M$ be the set of modules that could be inserted in $s$.\\ 20 Let $M$ be the set of modules that could be inserted in $s$.\\
20 -Let $x$ be a module of $M$, $\|x\|$ be the number of distinct components of $x$, and $p(x)$ the associated score of insertion given by JAR3D for that motif inserted at a particular position.\\ 21 +Let $x$ be a module of $M$, $\|x\|$ be the number of distinct components of $x$, and $p(x)$ the associated score of insertion given by JAR3D or BayesPairing for that motif inserted at a particular position.\\
21 Let $P_{x,i}$ be the position in $s$ where we can insert the $i$th component of module $x$.\\ 22 Let $P_{x,i}$ be the position in $s$ where we can insert the $i$th component of module $x$.\\
22 As the same module model can be inserted several times in $s$, several different $x$ modules in $M$ may refer to the same theoretical module, but inserted at different positions.\\ 23 As the same module model can be inserted several times in $s$, several different $x$ modules in $M$ may refer to the same theoretical module, but inserted at different positions.\\
23 Let $k_{x,i}$ be the size in nucleotides of that $i$th component of $x$.\\ 24 Let $k_{x,i}$ be the size in nucleotides of that $i$th component of $x$.\\
...@@ -114,4 +115,179 @@ We do it by adding iteratively, for every structure $s^*$ found, the following c ...@@ -114,4 +115,179 @@ We do it by adding iteratively, for every structure $s^*$ found, the following c
114 \end{equation} 115 \end{equation}
115 116
116 It ensures that at least one of the decision variables differs from $s^*$. 117 It ensures that at least one of the decision variables differs from $s^*$.
118 +
119 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
120 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
121 +\section{Average MCC of the method's variants}
122 +Instead of looking at the best MCC to see if the true structure has been found in the Pareto set, one can look at the average MCC over the Pareto set.
123 +
124 +We provide such results to satisfy the reader's curiosity, but this average is hard to interpret.
125 +The Pareto set is supposed to propose several solutions that could be several meta-stable states, but there is no reason for these states to be close to one another, nor to be close to the "true" structure that has been observed and saved in the database.
126 +A possible interpretation is the average distance of the meta-stable states to the "true" structure, if and only if we assume the predictions are correct.
127 +
128 +\hspace{-1cm}
129 +\includegraphics[width=\textwidth]{fig/Benchmark_avg.jpg}
130 +
131 +\hspace{-1cm}
132 +\includegraphics[width=1.05\textwidth]{fig/pseudobase_avg.jpg}
133 +
134 +(A) is the RNAstrand dataset for methods which do not support pseudoknots (computations succeeded for all methods for 291 RNAs), (B) is the same dataset but with pseudoknot support (294 RNAs), and (C) is the Pseudobase dataset of 264 RNAs.
135 +
136 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
137 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
138 +\section{Study cases results}
139 +Here we provide the structures and statistics of the 3 well known RNAs that we studied in detail.
140 +
141 +\subsection{General statistics}
142 +
143 +The first line in Table \ref{Tab:01} gives the results for E.\textit{coli} tRNA Gln (PDB\_00376), the second line for the guanine riboswitch (PDB\_01023), and the third for the human telomerase's pseudoknot (PDB\_00857). We observe that the best structure is often the same across the different objective functions $f_{1A}, f_{1B}, f_{1C}, f_{1D}$, but the rest of the set can be different in number of solutions and diversity.
144 +
145 +\begin{table*}[h]
146 +\caption{Best MCC results for study cases. Pseudoknots are allowed. \label{Tab:01}}
147 +\vspace{5mm}
148 +\begin{tabular}{@{}r|lll|@{}} & RNAsubopt & RNA-MoIP & BiokoP \\
149 + & & & \\\hline
150 +PDB\_00376 & 0.68 & 0.68 & 0.67 \\
151 +PDB\_01023 & 0.86 & 0.86 & 0.59 \\
152 +PDB\_00857 & 0.77 & 0.77 & 1.0 \\
153 +\end{tabular}
154 +\vspace{5mm}
155 +\vfill
156 +
157 +\begin{tabular}{@{}r|llll|@{}} & Rna3Dmotifs & Rna3Dmotifs & RNA 3D Motif Atlas & RNA 3D Motif Atlas\\
158 + & + Direct P.M. & + BayesPairing & + JAR3D & + BayesPairing \\\hline
159 +PDB\_00376 & 0.72 (A,B) & 0.74 (B,C,D), 0.71 (A) & 0.74 (A,C,D), 0.72 (B) & 0.76 (\textit{all})\\
160 +PDB\_01023 & 0.79 (A,B) & 0.29 (\textit{all}) & 0.82 (\textit{all}) & 0.82 (\textit{all})\\
161 +PDB\_00857 & 0.97 (B), 0.77 (A) & 0.97 (\textit{all}) & 0.97 (\textit{all}) & 0.97 (\textit{all})\\
162 +\end{tabular}
163 +\end{table*}
164 +
165 +Detailed results are given below for each RNA. The number of solutions and computation times are also reported. Note that these cases are small RNAs, resulting in both a small number of solutions and short computation times. The times are the "Real" time spent, therefore you should use the same 16-thread CPU to reproduce them, because there are several multi-threaded parts in the process. They are also very dependent on the I/O delays. Especially with methods reading modules from disk, you may want to use a very fast storage device (e.g. NVMe SSD NAND storage) to increase the speed.
166 +
167 +\newpage
168 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
169 +\subsection{E. coli's Gln tRNA}
170 +\paragraph{Sequence (FASTA format)} ~
171 +
172 +\texttt{>>'CRYSTAL STRUCTURE OF A TIGHT-BINDING GLUTAMINE TRNA BOUND TO GLUTAMINE AMINOACYL TRNA SYNTHETASE ' (PDB 00376)\\
173 +GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA}
174 +
175 +\paragraph{Referenced "true" structure in RNA-Strand (PDB 00376)} ~
176 +
177 +\texttt{((((((..(((.........)))((((((((...))))))))...(((((.......))))))))))).....}
178 +
179 +\paragraph{Best prediction results} ~
180 +
181 +{\scriptsize
182 +\begin{tabular}{rlccr}
183 +Method & Best secondary structure & best MCC & N solutions & time (s)\\
184 +\hline
185 +True structure: & \texttt{((((((..(((.........)))((((((((...))))))))...(((((.......))))))))))).....} & & & \\
186 +RNAsubopt:& \texttt{(((((((.(((....)))..(((.(((((.......)))))..)))((((.......))))))))))).....} &0.68 &4 & 0.01\\
187 +Biokop :& \texttt{[[[[[[((((...))))...(((.((((([[[....)))))....(((((...]]].)))))]]]]]].))).}& 0.67 &30& 10.3\\
188 +RNA-MoIP :& \texttt{((((((..((......))...((.(((((.......)))))..))..((.........))..)))))).....}& 0.67 &4& 0.01+5.0\\
189 +Direct P.M.-A :& \texttt{((((((((((...))))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.72 &8& 7.7\\
190 +Direct P.M.-B : & \texttt{((((((((((...))))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.72 &11& 7.9\\
191 +BayesPairing-A:& \texttt{(((((((((((....)))......(((((.......))))).)).((((.........)))))))))).....} &0.71 &7& 74+9.9\\
192 +BayesPairing-B:& \texttt{(((((((((((....)))......(((((.......))))).)).(((((.......))))))))))).....} &0.74 &8& 74+15.5\\
193 +BayesPairing-C: & \texttt{(((((((((((....)))......(((((.......))))).)).(((((.......))))))))))).....} &0.74 &9& 74+8.6\\
194 +BayesPairing-D: & \texttt{(((((((((((....)))......(((((.......))))).)).(((((.......))))))))))).....} &0.74 &10& 74+9.2\\
195 +JAR3D-A : & \texttt{(((((((((((....)))......(((((.......))))).)).(((((.......))))))))))).....} &0.74 &3& 0.01+1.3+7.9\\
196 +JAR3D-B : & \texttt{((((((((((...))))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.72 &3& 0.01+1.3+8.9\\
197 +JAR3D-C :& \texttt{(((((((((((....)))......(((((.......))))).)).(((((.......))))))))))).....} &0.74 &5& 0.01+1.3+7.7\\
198 +JAR3D-D :& \texttt{(((((((((((....)))......(((((.......))))).)).(((((.......))))))))))).....} &0.74 &5& 0.01+1.3+7.9\\
199 +BGSU-BPairing-A: & \texttt{((((((((.......))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.76 &6& 61+8.7\\
200 +BGSU-BPairing-B: & \texttt{((((((((.......))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.76 &10& 61+12.1\\
201 +BGSU-BPairing-C:& \texttt{((((((((.......))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.76 &4& 61+7.5\\
202 +BGSU-BPairing-D: & \texttt{((((((((.......))....((.(((((.......)))))..))(((((.......))))))))))).....} &0.76& 4& 61+7.2\\
203 +\end{tabular}}
204 +
205 +\paragraph{Notes} ~
206 +
207 +Note that BiokoP inserts a false-positive pseudoknot, while the Biorseo variants do not. If we look at our two recommended methods, here is an example of difference in the number of solutions : Biorseo+Rna3Dmotifs+Direct Pattern Matching + function B returns 11 solutions, while Biorseo+The RNA 3D Motif Atlas+JAR3D+function B returns only 3, with the same best MCC of 0.72.
208 +
209 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
210 +\subsection{G Riboswitch}
211 +
212 +\paragraph{Sequence (FASTA format)} ~
213 +
214 +\texttt{> 'GUANINE RIBOSWITCH U22C, A52G MUTANT BOUND TO HYPOXANTHINE ' (PDB 01023)\\
215 +GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa}
216 +
217 +\paragraph{Referenced "true" structure in RNA-Strand (PDB 01023)} ~
218 +
219 +\texttt{(((((((...(((((((.[[..[[)))))))........((((((]]...]]))))))..))))))).}
220 +
221 +\paragraph{Best prediction results} ~
222 +
223 +{\scriptsize
224 +\begin{tabular}{rlccr}
225 +Method & Best secondary structure & best MCC & N solutions & time (s)\\
226 +\hline
227 +True structure: & \texttt{(((((((...(((((((.[[..[[)))))))........((((((]]...]]))))))..))))))).} & & & \\
228 +RNAsubopt: & \texttt{(((((((.....(((((.......)))))..........((((((.......))))))..))))))).} & 0.86 & 3 & 0.01 \\
229 +Biokop : & \texttt{(((((([[[.[[(((((]][[[[[))))).(((...[[[(((]]]]]]]]..]]])))))))))))).} & 0.59 & 4 & 58.2 \\
230 +RNA-MoIP : & \texttt{(((((((.....(((((.......)))))..........(((((.........)))))..))))))).} & 0.84 & 3 & 0.01+4.1\\
231 +Direct P.M.-A : & \texttt{(((((((.....(((((.......)))))..((....))(((((.........)))))..))))))).} & 0.79 & 15 & 4.3 \\
232 +Direct P.M.-B : & \texttt{((((.((.....(((((.......))))).((...))..((((((.......))))))..)).)))).} & 0.79 & 18 & 9.0 \\
233 +BayesPairing-A: & \texttt{...............(((((((((.((....))......((((((.......)))))).)))))))))} & 0.29 & 4 & 53+8.3\\
234 +BayesPairing-B: & \texttt{...............(((((((((.((....))......((((((.......)))))).)))))))))} & 0.29 & 4 & 53+8.4\\
235 +BayesPairing-C: & \texttt{...............(((((((((.((....))......((((((.......)))))).)))))))))} & 0.29 & 3 & 53+5.8\\
236 +BayesPairing-D: & \texttt{...............(((((((((.((....))......((((((.......)))))).)))))))))} & 0.29 & 3 & 53+5.6\\
237 +JAR3D-A : & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 4 & 0.01+1.2+30.3\\
238 +JAR3D-B : & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 5 & 0.01+1.2+23.8\\
239 +JAR3D-C : & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 2 & 0.01+1.2+4.7\\
240 +JAR3D-D : & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 2 & 0.01+1.2+4.6\\
241 +BGSU-BPairing-A: & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 9 & 58+6.2\\
242 +BGSU-BPairing-B: & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 9 & 58+8.4\\
243 +BGSU-BPairing-C: & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 9 & 58+6.7\\
244 +BGSU-BPairing-D: & \texttt{(((((((.....(((((.......)))))..((....))((((((.......))))))..))))))).} & 0.82 & 9 & 58+8.1\\
245 +\end{tabular}}
246 +
247 +\paragraph{Notes} ~
248 +
249 +On one side, here again, BiokoP predicts too many knots. The RNA only contains one kissing-hairpin HHH pseudoknot. On the other side, as expected, this HHH knot is not detected by Biorseo.
250 +
251 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
252 +\subsection{Human telomerase's RNA pseudoknot}
253 +
254 +\paragraph{Sequence (FASTA format)} ~
255 +
256 +\texttt{> 'SOLUTION STRUCTURE OF THE P2B-P3 PSEUDOKNOT FROM HUMAN TELOMERASE RNA ' (PDB 00857)\\
257 +GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA}
258 +
259 +\paragraph{Referenced "true" structure in RNA-Strand (PDB 00857)} ~
260 +
261 +\texttt{[[[[[[........(((((((((]]]]]]........))))))))).}
262 +
263 +\paragraph{Best prediction results} ~
264 +
265 +{\scriptsize
266 +\begin{tabular}{rlccr}
267 +Method & Best secondary structure & best MCC & N solutions & time (s)\\
268 +\hline
269 +True structure: & \texttt{[[[[[[........(((((((((]]]]]]........))))))))).} & & & \\
270 +RNAsubopt: & \texttt{..............(((((((((..............))))))))).} & 0.77 & 3 & 0.06\\
271 +Biokop : & \texttt{[[[[[[........(((((((((]]]]]]........))))))))).} & 1.00 & 1 & 4.7\\
272 +RNA-MoIP : & \texttt{..............(((((((((..............))))))))).} & 0.77 & 3 & 0.06+3.3\\
273 +Direct P.M.-A : & \texttt{..............(((((((((..............))))))))).} & 0.77 & 3 & 0.8\\
274 +Direct P.M.-B : & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} & 0.97 & 7 & 0.7\\
275 +BayesPairing-A: & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} & 0.97 & 2 & 71+1.0\\
276 +BayesPairing-B: & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} & 0.97 & 2 & 71+0.6\\
277 +BayesPairing-C: & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} & 0.97 & 2 & 71+0.6\\
278 +BayesPairing-D: & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} & 0.97 & 2 & 71+0.6\\
279 +JAR3D-A : & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} & 0.97 & 2 & 0.06+1.3+0.8\\
280 +JAR3D-B : & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} & 0.97 & 2 & 0.06+1.3+0.6\\
281 +JAR3D-C : & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} & 0.97 & 2 & 0.06+1.3+0.6\\
282 +JAR3D-D : & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} & 0.97 & 2 & 0.06+1.3+0.6\\
283 +BGSU-BPairing-A: & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} & 0.97 & 2 & 57.7+0.5\\
284 +BGSU-BPairing-B: & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} & 0.97 & 2 & 57.7+0.5\\
285 +BGSU-BPairing-C: & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} & 0.97 & 2 & 57.7+0.5\\
286 +BGSU-BPairing-D: & \texttt{[[[[[[........((((((((.]]]]]].........)))))))).} & 0.97 & 2 & 57.7+0.5\\
287 +\end{tabular}}
288 +
289 +
290 +\paragraph{Notes} ~
291 +
292 +The methods which support pseudoknots are able to predict it correctly.
117 \end{document} 293 \end{document}
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -315,7 +315,7 @@ def launch_BayesPairing(module_type, seq_, header_, basename): ...@@ -315,7 +315,7 @@ def launch_BayesPairing(module_type, seq_, header_, basename):
315 rna.close() 315 rna.close()
316 316
317 def launch_RNAMoIP_worker(x): 317 def launch_RNAMoIP_worker(x):
318 - RNAMoIP = "../RNAMoIP/RNAMoIP.py" 318 + RNAMoIP = biorseoDir + "/../RNAMoIP/Src/RNAMoIP.py"
319 logfile = open("log_of_the_run.sh", 'a') 319 logfile = open("log_of_the_run.sh", 'a')
320 logfile.write(" ".join(["gurobi.sh", RNAMoIP, "-s", '"' +x[1]+'"', "-ss", '"'+x[0].strip()+'"', "-d", descfolder])) 320 logfile.write(" ".join(["gurobi.sh", RNAMoIP, "-s", '"' +x[1]+'"', "-ss", '"'+x[0].strip()+'"', "-d", descfolder]))
321 logfile.write("\n") 321 logfile.write("\n")
...@@ -369,14 +369,6 @@ def launch_RNAMoIP(seq_, header_, basename): ...@@ -369,14 +369,6 @@ def launch_RNAMoIP(seq_, header_, basename):
369 rna.write(p+'\t'+str(n)+'\t'+str(s)+'\n') 369 rna.write(p+'\t'+str(n)+'\t'+str(s)+'\n')
370 rna.close() 370 rna.close()
371 371
372 -def launch_pKiss(seq_, header_, basename):
373 - json = "{\"pkiss_input_rna_sequences\":\">%s\r\n%s\",\"paramset\":{\"pkiss_parameter_absoluteDeviation\":\"0.5\",\"pkiss_parameter_maxKnotSize\":\"3.0\",\"pkiss_parameter_windowSize\":\"1.0\",\"pkiss_parameter_param\":\"rna_andronescu2007\"}}" %(header_, seq_)
374 - cmd = "curl -X POST -d @[[%s]] http://bibiserv2.cebitec.uni-bielefeld.de:80/rest/pkiss/pkiss_function_subopt/request -H \"Content-Type: application/json\"" % json
375 - logfile = open("log_of_the_run.sh", 'a')
376 - logfile.write(cmd+"\n")
377 - logfile.close()
378 - print(cmd)
379 -
380 def mattews_corr_coeff(tp, tn, fp, fn): 372 def mattews_corr_coeff(tp, tn, fp, fn):
381 if (tp+fp == 0): 373 if (tp+fp == 0):
382 print("We have an issue : no positives detected ! (linear structure)") 374 print("We have an issue : no positives detected ! (linear structure)")
......