Louis BECQUEY

cleaned scripts/ and figures/ folders

What are these RNA data files?
===============================
## Raw (big) databases
* RNA-Strand 2.0 (secondary_structures_database.dbn) : this file is a dataset supposed to be identical to RNA-Strand 2.0 (actually, the file has been present on IBISC machines for years and nobody remembers how it was built). The former RNA-Strand website is no longer online (http://rnasoft.ca/strand).
* bpRNA-1m_90 : this huge database gathers the data from other databases (CRW, PDB, Rfam, RNP, SPR, SRP, ...) and supersedes RNA-Strand (minus the structures that are only in NDB, sadly). Sequences have been prefiltered to have no more than 90% identity. Source : http://bprna.cgrb.oregonstate.edu/
* Pseudobase(++) : A database of biologically validated pseudoknots, from the time discovering a pseudoknot was something unusual. Pseudobase stays famous for its pseudoknot classification scheme. I scraped it myself to build the file. Source : https://www.ekevanbatenburg.nl/PKBASE/PKB.HTML
## Filtered databases
* verified_secondary_structures.dbn : The subset of RNA-Strand that was experimentally validated (basically, the ones for which a 3D structure was available, so the ones from NDB and PDB).
* The _short.dbn ones : Same as their parent files, but filtered using the filter.py script.
* pseudoknots.dbn : Audrey Legendre's scrape of Pseudobase, which, for an unknown reason, does not contain all the available data, but does include nice descriptions of what the RNAs are.
## Small test databases
* RNA-MoIP dataset : The cherry-picked cases presented in Reinharz et al. 2012 to show RNA-MoIP's performance.
* applications.dbn : My cherry-picked cases presented in Becquey et al. 2020 to show Biorseo's performance.
* example.dbn : an example database with only one RNA, for testing purposes
* nothing.dbn : an example database with no RNAs, for testing purposes
Enjoy benchmarking RNA structure prediction tools.
\ No newline at end of file

99.5 KB | W: | H:

156 KB | W: | H:

  • 2-up
  • Swipe
  • Onion skin

32.4 KB | W: | H:

30.4 KB | W: | H:

  • 2-up
  • Swipe
  • Onion skin
......@@ -158,7 +158,6 @@ def is_canonical_nts(seq):
return False
return True
def is_canonical_bps(struct):
if "()" in struct:
return False
......@@ -207,7 +206,6 @@ def load_from_dbn(file, header_style=3):
db.close()
return container, pkcounter
def parse_biokop(folder, basename, ext=".biok"):
solutions = []
err = 0
......@@ -248,7 +246,6 @@ def parse_biokop(folder, basename, ext=".biok"):
err = 1
return None, err
def parse_biorseo(folder, basename, ext):
solutions = []
err = 0
......@@ -272,21 +269,14 @@ def parse_biorseo(folder, basename, ext):
err = 1
return None, err
def prettify_biorseo(code):
    """Return a human-readable label for a Biorseo result extension code.

    The label is built from substrings found in *code*:

    * motif source (exactly one is chosen, first match wins):
      ``"bgsu"`` -> "RNA 3D Motif Atlas", ``"json"`` -> "JSON motifs",
      ``"rin"`` -> "CaRNAval", anything else -> "Rna3Dmotifs";
    * pattern-matching step (appended when present): ``"raw"`` ->
      "Direct P.M.", ``"byp"`` -> "BPairing", ``"jar3d"`` -> "Jar3d".

    The source label always ends with " + ", so codes that name no
    pattern-matching step (e.g. ``"biorseo_json_A"``) yield a label with a
    trailing " + ".
    """
    # Motif source: a single mutually-exclusive chain, so that a "bgsu" code
    # is not additionally labelled with the fallback "Rna3Dmotifs" source.
    if "bgsu" in code:
        name = "RNA 3D Motif Atlas + "
    elif "json" in code:
        name = "JSON motifs + "
    elif "rin" in code:
        name = "CaRNAval + "
    else:
        name = "Rna3Dmotifs + "

    # Pattern-matching step: independent checks, as in the original.
    if "raw" in code:
        name += "Direct P.M."
    if "byp" in code:
        name += "BPairing"
    if "jar3d" in code:
        name += "Jar3d"
    # name += " + $f_{1" + code[-1] + "}$"
    return name
......@@ -342,14 +332,9 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf
if __name__ == "__main__":
try:
opts, args = getopt.getopt( sys.argv[1:], "",
[ "biorseo_desc_byp_A", "biorseo_desc_byp_B",
"biorseo_desc_byp_C", "biorseo_desc_byp_D",
"biorseo_bgsu_byp_A", "biorseo_bgsu_byp_B",
"biorseo_bgsu_byp_C", "biorseo_bgsu_byp_D",
"biorseo_desc_raw_A", "biorseo_desc_raw_B",
"biorseo_bgsu_jar3d_A", "biorseo_bgsu_jar3d_B",
"biorseo_bgsu_jar3d_C", "biorseo_bgsu_jar3d_D",
"biorseo_rin_raw_A", "biorseo_rin_raw_B",
[ "biorseo_desc_A", "biorseo_desc_B",
"biorseo_rin_A", "biorseo_rin_B",
"biorseo_json_A", "biorseo_json_B",
"biokop", "folder=", "database=", "output="
])
except getopt.GetoptError as err:
......@@ -384,36 +369,19 @@ if __name__ == "__main__":
if extension == "all":
parse = parse_biorseo
fig, ax = plt.subplots(4,5,figsize=(12,10), sharex=True, sharey=True)
fig, ax = plt.subplots(2,3,figsize=(8,10), sharex=True, sharey=True)
ax = ax.flatten()
process_extension(ax, 0, ".biorseo_desc_raw_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
process_extension(ax, 1, ".biorseo_rin_raw_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
process_extension(ax, 2, ".biorseo_desc_byp_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
process_extension(ax, 3, ".biorseo_bgsu_byp_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
process_extension(ax, 4, ".biorseo_bgsu_jar3d_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
ax[0].set_title(prettify_biorseo("biorseo_desc_raw_A"), fontsize=10)
ax[1].set_title(prettify_biorseo("biorseo_rin_raw_A"), fontsize=10)
ax[2].set_title(prettify_biorseo("biorseo_desc_byp_A"), fontsize=10)
ax[3].set_title(prettify_biorseo("biorseo_bgsu_byp_A"), fontsize=10)
ax[4].set_title(prettify_biorseo("biorseo_bgsu_jar3d_A"), fontsize=10)
process_extension(ax, 5, ".biorseo_desc_raw_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 6, ".biorseo_rin_raw_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 7, ".biorseo_desc_byp_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 8, ".biorseo_bgsu_byp_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 9, ".biorseo_bgsu_jar3d_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 12, ".biorseo_desc_byp_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA")
process_extension(ax, 13, ".biorseo_bgsu_byp_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA")
process_extension(ax, 14, ".biorseo_bgsu_jar3d_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA")
ax[10].axis("off")
ax[11].axis("off")
process_extension(ax, 17, ".biorseo_desc_byp_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA")
process_extension(ax, 18, ".biorseo_bgsu_byp_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA")
process_extension(ax, 19, ".biorseo_bgsu_jar3d_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA")
ax[15].axis("off")
ax[16].axis("off")
process_extension(ax, 0, ".biorseo_desc_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
process_extension(ax, 1, ".biorseo_rin_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
process_extension(ax, 2, ".biorseo_json_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
ax[0].set_title(prettify_biorseo("biorseo_desc_A"), fontsize=10)
ax[1].set_title(prettify_biorseo("biorseo_rin_A"), fontsize=10)
ax[2].set_title(prettify_biorseo("biorseo_json_A"), fontsize=10)
process_extension(ax, 3, ".biorseo_desc_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 4, ".biorseo_rin_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 5, ".biorseo_json_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
for a in ax:
a.label_outer()
plt.subplots_adjust(bottom=0.05, top=0.95, left=0.07, right=0.98, hspace=0.1, wspace = 0.05)
......
This diff is collapsed. Click to expand it.