Louis BECQUEY

cleaned scripts/ and figures/ folders

1 +What are these RNA data files?
2 +===============================
3 +
4 +## Raw (big) databases
5 +* RNA-Strand 2.0 (secondary_structures_database.dbn) : this file is a dataset supposed to be identical to RNA-Strand 2.0 (actually, the file has been present on IBISC machines for years and nobody remembers how it was built). The former RNA-Strand website (http://rnasoft.ca/strand) is no longer online.
6 +* bpRNA-1m_90 : this huge database gathers the data from other databases (CRW, PDB, Rfam, RNP, SPR, SRP, ...) and supersedes RNA-Strand (minus the structures that are only in NDB, sadly). Sequences have been prefiltered to have no more than 90% identity. Source : http://bprna.cgrb.oregonstate.edu/
7 +* Pseudobase(++) : A database of biologically validated pseudoknots, dating from the time when discovering a pseudoknot was something unusual. Pseudobase remains famous for its pseudoknot classification scheme. I scraped it myself to build the file. Source : https://www.ekevanbatenburg.nl/PKBASE/PKB.HTML
8 +
9 +
10 +## Filtered databases
11 +* verified_secondary_structures.dbn : The subset of RNA-Strand that was experimentally validated (basically, the ones for which a 3D structure was available, so the ones from NDB and PDB).
12 +* The _short.dbn ones : Same as their parent files, but filtered using the filter.py script.
13 +* pseudoknots.dbn : Audrey Legendre's scrape of Pseudobase, which, for an unknown reason, does not contain all the available data, but does include nice descriptions of what the RNAs are.
14 +
15 +
16 +## Small test databases
17 +* RNA-MoIP dataset : The cherry-picked cases presented in Reinharz et al. 2012 to show RNA-MoIP's performance.
18 +* applications.dbn : My cherry-picked cases presented in Becquey et al. 2020 to show Biorseo's performance.
19 +* example.dbn : an example database with only one RNA, for testing purposes
20 +* nothing.dbn : an example database with no RNAs, for testing purposes
21 +
22 +
23 +Enjoy benchmarking RNA structure prediction tools.
...\ No newline at end of file ...\ No newline at end of file

99.5 KB | W: | H:

156 KB | W: | H:

  • 2-up
  • Swipe
  • Onion skin

32.4 KB | W: | H:

30.4 KB | W: | H:

  • 2-up
  • Swipe
  • Onion skin
...@@ -158,7 +158,6 @@ def is_canonical_nts(seq): ...@@ -158,7 +158,6 @@ def is_canonical_nts(seq):
158 return False 158 return False
159 return True 159 return True
160 160
161 -
162 def is_canonical_bps(struct): 161 def is_canonical_bps(struct):
163 if "()" in struct: 162 if "()" in struct:
164 return False 163 return False
...@@ -207,7 +206,6 @@ def load_from_dbn(file, header_style=3): ...@@ -207,7 +206,6 @@ def load_from_dbn(file, header_style=3):
207 db.close() 206 db.close()
208 return container, pkcounter 207 return container, pkcounter
209 208
210 -
211 def parse_biokop(folder, basename, ext=".biok"): 209 def parse_biokop(folder, basename, ext=".biok"):
212 solutions = [] 210 solutions = []
213 err = 0 211 err = 0
...@@ -248,7 +246,6 @@ def parse_biokop(folder, basename, ext=".biok"): ...@@ -248,7 +246,6 @@ def parse_biokop(folder, basename, ext=".biok"):
248 err = 1 246 err = 1
249 return None, err 247 return None, err
250 248
251 -
252 def parse_biorseo(folder, basename, ext): 249 def parse_biorseo(folder, basename, ext):
253 solutions = [] 250 solutions = []
254 err = 0 251 err = 0
...@@ -272,21 +269,14 @@ def parse_biorseo(folder, basename, ext): ...@@ -272,21 +269,14 @@ def parse_biorseo(folder, basename, ext):
272 err = 1 269 err = 1
273 return None, err 270 return None, err
274 271
275 -
276 def prettify_biorseo(code): 272 def prettify_biorseo(code):
277 name = "" 273 name = ""
278 - if "bgsu" in code: 274 + if "json" in code:
279 - name += "RNA 3D Motif Atlas + " 275 + name += "JSON motifs + "
280 elif "rin" in code: 276 elif "rin" in code:
281 name += "CaRNAval + " 277 name += "CaRNAval + "
282 else: 278 else:
283 name += "Rna3Dmotifs + " 279 name += "Rna3Dmotifs + "
284 - if "raw" in code:
285 - name += "Direct P.M."
286 - if "byp" in code:
287 - name += "BPairing"
288 - if "jar3d" in code:
289 - name += "Jar3d"
290 # name += " + $f_{1" + code[-1] + "}$" 280 # name += " + $f_{1" + code[-1] + "}$"
291 return name 281 return name
292 282
...@@ -342,14 +332,9 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf ...@@ -342,14 +332,9 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf
342 if __name__ == "__main__": 332 if __name__ == "__main__":
343 try: 333 try:
344 opts, args = getopt.getopt( sys.argv[1:], "", 334 opts, args = getopt.getopt( sys.argv[1:], "",
345 - [ "biorseo_desc_byp_A", "biorseo_desc_byp_B", 335 + [ "biorseo_desc_A", "biorseo_desc_B",
346 - "biorseo_desc_byp_C", "biorseo_desc_byp_D", 336 + "biorseo_rin_A", "biorseo_rin_B",
347 - "biorseo_bgsu_byp_A", "biorseo_bgsu_byp_B", 337 + "biorseo_json_A", "biorseo_json_B",
348 - "biorseo_bgsu_byp_C", "biorseo_bgsu_byp_D",
349 - "biorseo_desc_raw_A", "biorseo_desc_raw_B",
350 - "biorseo_bgsu_jar3d_A", "biorseo_bgsu_jar3d_B",
351 - "biorseo_bgsu_jar3d_C", "biorseo_bgsu_jar3d_D",
352 - "biorseo_rin_raw_A", "biorseo_rin_raw_B",
353 "biokop", "folder=", "database=", "output=" 338 "biokop", "folder=", "database=", "output="
354 ]) 339 ])
355 except getopt.GetoptError as err: 340 except getopt.GetoptError as err:
...@@ -384,36 +369,19 @@ if __name__ == "__main__": ...@@ -384,36 +369,19 @@ if __name__ == "__main__":
384 369
385 if extension == "all": 370 if extension == "all":
386 parse = parse_biorseo 371 parse = parse_biorseo
387 - fig, ax = plt.subplots(4,5,figsize=(12,10), sharex=True, sharey=True) 372 + fig, ax = plt.subplots(2,3,figsize=(8,10), sharex=True, sharey=True)
388 ax = ax.flatten() 373 ax = ax.flatten()
389 - process_extension(ax, 0, ".biorseo_desc_raw_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") 374 + process_extension(ax, 0, ".biorseo_desc_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
390 - process_extension(ax, 1, ".biorseo_rin_raw_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") 375 + process_extension(ax, 1, ".biorseo_rin_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
391 - process_extension(ax, 2, ".biorseo_desc_byp_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") 376 + process_extension(ax, 2, ".biorseo_json_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
392 - process_extension(ax, 3, ".biorseo_bgsu_byp_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") 377 + ax[0].set_title(prettify_biorseo("biorseo_desc_A"), fontsize=10)
393 - process_extension(ax, 4, ".biorseo_bgsu_jar3d_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") 378 + ax[1].set_title(prettify_biorseo("biorseo_rin_A"), fontsize=10)
394 - ax[0].set_title(prettify_biorseo("biorseo_desc_raw_A"), fontsize=10) 379 + ax[2].set_title(prettify_biorseo("biorseo_json_A"), fontsize=10)
395 - ax[1].set_title(prettify_biorseo("biorseo_rin_raw_A"), fontsize=10) 380 +
396 - ax[2].set_title(prettify_biorseo("biorseo_desc_byp_A"), fontsize=10) 381 + process_extension(ax, 3, ".biorseo_desc_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
397 - ax[3].set_title(prettify_biorseo("biorseo_bgsu_byp_A"), fontsize=10) 382 + process_extension(ax, 4, ".biorseo_rin_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
398 - ax[4].set_title(prettify_biorseo("biorseo_bgsu_jar3d_A"), fontsize=10) 383 + process_extension(ax, 5, ".biorseo_json_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
399 - 384 +
400 - process_extension(ax, 5, ".biorseo_desc_raw_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
401 - process_extension(ax, 6, ".biorseo_rin_raw_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
402 - process_extension(ax, 7, ".biorseo_desc_byp_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
403 - process_extension(ax, 8, ".biorseo_bgsu_byp_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
404 - process_extension(ax, 9, ".biorseo_bgsu_jar3d_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
405 -
406 - process_extension(ax, 12, ".biorseo_desc_byp_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA")
407 - process_extension(ax, 13, ".biorseo_bgsu_byp_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA")
408 - process_extension(ax, 14, ".biorseo_bgsu_jar3d_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA")
409 - ax[10].axis("off")
410 - ax[11].axis("off")
411 -
412 - process_extension(ax, 17, ".biorseo_desc_byp_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA")
413 - process_extension(ax, 18, ".biorseo_bgsu_byp_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA")
414 - process_extension(ax, 19, ".biorseo_bgsu_jar3d_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA")
415 - ax[15].axis("off")
416 - ax[16].axis("off")
417 for a in ax: 385 for a in ax:
418 a.label_outer() 386 a.label_outer()
419 plt.subplots_adjust(bottom=0.05, top=0.95, left=0.07, right=0.98, hspace=0.1, wspace = 0.05) 387 plt.subplots_adjust(bottom=0.05, top=0.95, left=0.07, right=0.98, hspace=0.1, wspace = 0.05)
......
This diff is collapsed. Click to expand it.