Showing
11 changed files
with
40 additions
and
570 deletions
data/sec_structs/Readme.md
0 → 100644
1 | +What are this RNA data files ? | ||
2 | +=============================== | ||
3 | + | ||
4 | +## Raw (big) databases | ||
5 | +* RNA-Strand 2.0 (secondary_structures_database.dbn) : this file is a dataset supposed to be identical to RNA-Strand 2.0 (actually the file is present on IBISC machines for years now and nobody remembers how it was built). The former RNA Strand website is not online anymore (http://rnasoft.ca/strand). | ||
6 | +* bpRNA-1m_90 : this huge database gathers the data from other databases (CRW, PDB, Rfam, RNP, SPR, SRP, ...) and superseeds RNA-Strand (minus the structures that are only in NDB, sadly). Sequences have been prefiltered to have no more than 90% identity. Source : http://bprna.cgrb.oregonstate.edu/ | ||
7 | +* Pseudobase(++) : A database of biologically validated pseudoknots, from the time discovering a pseudoknot was something unusual. Pseudobase stays famous for its pseudoknot classification scheme. I scraped it myself to build the file. Source : https://www.ekevanbatenburg.nl/PKBASE/PKB.HTML | ||
8 | + | ||
9 | + | ||
10 | +## Filtered databases | ||
11 | +* verified_secondary_structures.dbn : The subset of RNA-Strand that was experimentally validated (basically, the ones for which a 3D structure was available, so the ones from NDB and PDB). | ||
12 | +* The _short.dbn ones : Same as its parent, but filtered using the filter.py script. | ||
13 | +* pseudoknots.dbn : Audrey Legendre's scrap of Pseudobase, which, for an unknow reason, does not contain all the available data, but nice descriptions of what the RNAs are. | ||
14 | + | ||
15 | + | ||
16 | +## Small test databases | ||
17 | +* RNA-MoIP dataset : The cherry-picked cases presented in Reinhartz et al. 2012 to show RNA-MoIP's performance. | ||
18 | +* applications.dbn : My cherry-picked cases presented in Becquey et al. 2020 to show Biorseo's performance. | ||
19 | +* example.dbn : an example database with only one RNA, for testing purposes | ||
20 | +* nothing.dbn : an example database with no RNAs, for testing purposes | ||
21 | + | ||
22 | + | ||
23 | +Enjoy benchmarking RNA structure prediction tools. | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
figures/number_of_solutions.png
0 → 100644
11.9 KB
37.2 KB
38.2 KB
36.1 KB
37 KB
File moved
... | @@ -158,7 +158,6 @@ def is_canonical_nts(seq): | ... | @@ -158,7 +158,6 @@ def is_canonical_nts(seq): |
158 | return False | 158 | return False |
159 | return True | 159 | return True |
160 | 160 | ||
161 | - | ||
162 | def is_canonical_bps(struct): | 161 | def is_canonical_bps(struct): |
163 | if "()" in struct: | 162 | if "()" in struct: |
164 | return False | 163 | return False |
... | @@ -207,7 +206,6 @@ def load_from_dbn(file, header_style=3): | ... | @@ -207,7 +206,6 @@ def load_from_dbn(file, header_style=3): |
207 | db.close() | 206 | db.close() |
208 | return container, pkcounter | 207 | return container, pkcounter |
209 | 208 | ||
210 | - | ||
211 | def parse_biokop(folder, basename, ext=".biok"): | 209 | def parse_biokop(folder, basename, ext=".biok"): |
212 | solutions = [] | 210 | solutions = [] |
213 | err = 0 | 211 | err = 0 |
... | @@ -248,7 +246,6 @@ def parse_biokop(folder, basename, ext=".biok"): | ... | @@ -248,7 +246,6 @@ def parse_biokop(folder, basename, ext=".biok"): |
248 | err = 1 | 246 | err = 1 |
249 | return None, err | 247 | return None, err |
250 | 248 | ||
251 | - | ||
252 | def parse_biorseo(folder, basename, ext): | 249 | def parse_biorseo(folder, basename, ext): |
253 | solutions = [] | 250 | solutions = [] |
254 | err = 0 | 251 | err = 0 |
... | @@ -272,21 +269,14 @@ def parse_biorseo(folder, basename, ext): | ... | @@ -272,21 +269,14 @@ def parse_biorseo(folder, basename, ext): |
272 | err = 1 | 269 | err = 1 |
273 | return None, err | 270 | return None, err |
274 | 271 | ||
275 | - | ||
276 | def prettify_biorseo(code): | 272 | def prettify_biorseo(code): |
277 | name = "" | 273 | name = "" |
278 | - if "bgsu" in code: | 274 | + if "json" in code: |
279 | - name += "RNA 3D Motif Atlas + " | 275 | + name += "JSON motifs + " |
280 | elif "rin" in code: | 276 | elif "rin" in code: |
281 | name += "CaRNAval + " | 277 | name += "CaRNAval + " |
282 | else: | 278 | else: |
283 | name += "Rna3Dmotifs + " | 279 | name += "Rna3Dmotifs + " |
284 | - if "raw" in code: | ||
285 | - name += "Direct P.M." | ||
286 | - if "byp" in code: | ||
287 | - name += "BPairing" | ||
288 | - if "jar3d" in code: | ||
289 | - name += "Jar3d" | ||
290 | # name += " + $f_{1" + code[-1] + "}$" | 280 | # name += " + $f_{1" + code[-1] + "}$" |
291 | return name | 281 | return name |
292 | 282 | ||
... | @@ -342,14 +332,9 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf | ... | @@ -342,14 +332,9 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf |
342 | if __name__ == "__main__": | 332 | if __name__ == "__main__": |
343 | try: | 333 | try: |
344 | opts, args = getopt.getopt( sys.argv[1:], "", | 334 | opts, args = getopt.getopt( sys.argv[1:], "", |
345 | - [ "biorseo_desc_byp_A", "biorseo_desc_byp_B", | 335 | + [ "biorseo_desc_A", "biorseo_desc_B", |
346 | - "biorseo_desc_byp_C", "biorseo_desc_byp_D", | 336 | + "biorseo_rin_A", "biorseo_rin_B", |
347 | - "biorseo_bgsu_byp_A", "biorseo_bgsu_byp_B", | 337 | + "biorseo_json_A", "biorseo_json_B", |
348 | - "biorseo_bgsu_byp_C", "biorseo_bgsu_byp_D", | ||
349 | - "biorseo_desc_raw_A", "biorseo_desc_raw_B", | ||
350 | - "biorseo_bgsu_jar3d_A", "biorseo_bgsu_jar3d_B", | ||
351 | - "biorseo_bgsu_jar3d_C", "biorseo_bgsu_jar3d_D", | ||
352 | - "biorseo_rin_raw_A", "biorseo_rin_raw_B", | ||
353 | "biokop", "folder=", "database=", "output=" | 338 | "biokop", "folder=", "database=", "output=" |
354 | ]) | 339 | ]) |
355 | except getopt.GetoptError as err: | 340 | except getopt.GetoptError as err: |
... | @@ -384,36 +369,19 @@ if __name__ == "__main__": | ... | @@ -384,36 +369,19 @@ if __name__ == "__main__": |
384 | 369 | ||
385 | if extension == "all": | 370 | if extension == "all": |
386 | parse = parse_biorseo | 371 | parse = parse_biorseo |
387 | - fig, ax = plt.subplots(4,5,figsize=(12,10), sharex=True, sharey=True) | 372 | + fig, ax = plt.subplots(2,3,figsize=(8,10), sharex=True, sharey=True) |
388 | ax = ax.flatten() | 373 | ax = ax.flatten() |
389 | - process_extension(ax, 0, ".biorseo_desc_raw_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") | 374 | + process_extension(ax, 0, ".biorseo_desc_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") |
390 | - process_extension(ax, 1, ".biorseo_rin_raw_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") | 375 | + process_extension(ax, 1, ".biorseo_rin_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") |
391 | - process_extension(ax, 2, ".biorseo_desc_byp_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") | 376 | + process_extension(ax, 2, ".biorseo_json_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") |
392 | - process_extension(ax, 3, ".biorseo_bgsu_byp_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") | 377 | + ax[0].set_title(prettify_biorseo("biorseo_desc_A"), fontsize=10) |
393 | - process_extension(ax, 4, ".biorseo_bgsu_jar3d_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") | 378 | + ax[1].set_title(prettify_biorseo("biorseo_rin_A"), fontsize=10) |
394 | - ax[0].set_title(prettify_biorseo("biorseo_desc_raw_A"), fontsize=10) | 379 | + ax[2].set_title(prettify_biorseo("biorseo_json_A"), fontsize=10) |
395 | - ax[1].set_title(prettify_biorseo("biorseo_rin_raw_A"), fontsize=10) | 380 | + |
396 | - ax[2].set_title(prettify_biorseo("biorseo_desc_byp_A"), fontsize=10) | 381 | + process_extension(ax, 3, ".biorseo_desc_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA") |
397 | - ax[3].set_title(prettify_biorseo("biorseo_bgsu_byp_A"), fontsize=10) | 382 | + process_extension(ax, 4, ".biorseo_rin_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA") |
398 | - ax[4].set_title(prettify_biorseo("biorseo_bgsu_jar3d_A"), fontsize=10) | 383 | + process_extension(ax, 5, ".biorseo_json_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA") |
399 | - | 384 | + |
400 | - process_extension(ax, 5, ".biorseo_desc_raw_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA") | ||
401 | - process_extension(ax, 6, ".biorseo_rin_raw_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA") | ||
402 | - process_extension(ax, 7, ".biorseo_desc_byp_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA") | ||
403 | - process_extension(ax, 8, ".biorseo_bgsu_byp_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA") | ||
404 | - process_extension(ax, 9, ".biorseo_bgsu_jar3d_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA") | ||
405 | - | ||
406 | - process_extension(ax, 12, ".biorseo_desc_byp_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA") | ||
407 | - process_extension(ax, 13, ".biorseo_bgsu_byp_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA") | ||
408 | - process_extension(ax, 14, ".biorseo_bgsu_jar3d_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA") | ||
409 | - ax[10].axis("off") | ||
410 | - ax[11].axis("off") | ||
411 | - | ||
412 | - process_extension(ax, 17, ".biorseo_desc_byp_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA") | ||
413 | - process_extension(ax, 18, ".biorseo_bgsu_byp_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA") | ||
414 | - process_extension(ax, 19, ".biorseo_bgsu_jar3d_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA") | ||
415 | - ax[15].axis("off") | ||
416 | - ax[16].axis("off") | ||
417 | for a in ax: | 385 | for a in ax: |
418 | a.label_outer() | 386 | a.label_outer() |
419 | plt.subplots_adjust(bottom=0.05, top=0.95, left=0.07, right=0.98, hspace=0.1, wspace = 0.05) | 387 | plt.subplots_adjust(bottom=0.05, top=0.95, left=0.07, right=0.98, hspace=0.1, wspace = 0.05) | ... | ... |
scripts/pareto_visualizer_json.py
deleted
100644 → 0
1 | -#!/usr/bin/python3 | ||
2 | -# Created by Louis Becquey, louis.becquey@univ-evry.fr, Oct 2019 | ||
3 | -# This script processes files containing RNA structures obtained from bi-objective | ||
4 | -# optimization programs, and a dot-bracket database of reference structures, to plot | ||
5 | -# where are the best solutions in the Pareto set. | ||
6 | -# | ||
7 | -# The result files should follow this kind of format: | ||
8 | -# for Biokop: (option --biokop) | ||
9 | -# Structure Free energy score Expected accuracy score | ||
10 | -# (((...(((...)))))) <tab> obj1_value <tab> obj2_value | ||
11 | -# (((............))) <tab> obj1_value <tab> obj2_value | ||
12 | -# ((((((...)))...))) <tab> obj1_value <tab> obj2_value | ||
13 | -# ... | ||
14 | -# | ||
15 | -# for BiORSEO: (options --biorseo_**stuff**) | ||
16 | -# >Header of the sequence | ||
17 | -# GGCACAGAGUUAUGUGCC | ||
18 | -# (((...(((...)))))) + Motif1 + Motif2 <tab> obj1_value <tab> obj2_value | ||
19 | -# (((............))) <tab> obj1_value <tab> obj2_value | ||
20 | -# ((((((...)))...))) + Motif1 <tab> obj1_value <tab> obj2_value | ||
21 | -# | ||
22 | -# typical Biokop usage: | ||
23 | -# python3 pareto_visualizer.py --biokop --folder path/to/your/results/folder --database path/to/the/database_file.dbn | ||
24 | -# typical Biorseo usage: | ||
25 | -# python3 pareto_visualizer_json.py --folder path/to/your/results/folder (pmE et pmF) --database path/to/the/database_file.dbn (nom, sequence, structure) | ||
26 | -# | ||
27 | - | ||
28 | -from math import sqrt | ||
29 | -import numpy as np | ||
30 | -import matplotlib.pyplot as plt | ||
31 | -from matplotlib import cm | ||
32 | -import scipy.stats as st | ||
33 | -import sys | ||
34 | -import os | ||
35 | -import subprocess | ||
36 | -import getopt | ||
37 | - | ||
38 | -class SecStruct: | ||
39 | - def __init__(self, name, dot_bracket, contacts, obj1_value, obj2_value): | ||
40 | - self.name = name | ||
41 | - self.dbn = dot_bracket | ||
42 | - self.ctc = contacts | ||
43 | - self.objectives = [ obj1_value, obj2_value ] | ||
44 | - self.basepair_list = self.get_basepairs() | ||
45 | - self.length = len(dot_bracket) | ||
46 | - | ||
47 | - def get_basepairs(self): | ||
48 | - parenthesis = [] | ||
49 | - brackets = [] | ||
50 | - braces = [] | ||
51 | - rafters = [] | ||
52 | - basepairs = [] | ||
53 | - As = [] | ||
54 | - Bs = [] | ||
55 | - for i, c in enumerate(self.dbn): | ||
56 | - if c == '(': | ||
57 | - parenthesis.append(i) | ||
58 | - if c == '[': | ||
59 | - brackets.append(i) | ||
60 | - if c == '{': | ||
61 | - braces.append(i) | ||
62 | - if c == '<': | ||
63 | - rafters.append(i) | ||
64 | - if c == 'A': | ||
65 | - As.append(i) | ||
66 | - if c == 'B': | ||
67 | - Bs.append(i) | ||
68 | - if c == '.': | ||
69 | - continue | ||
70 | - if c == ')': | ||
71 | - basepairs.append((i, parenthesis.pop())) | ||
72 | - if c == ']': | ||
73 | - basepairs.append((i, brackets.pop())) | ||
74 | - if c == '}': | ||
75 | - basepairs.append((i, braces.pop())) | ||
76 | - if c == '>': | ||
77 | - basepairs.append((i, rafters.pop())) | ||
78 | - if c == 'a': | ||
79 | - basepairs.append((i, As.pop())) | ||
80 | - if c == 'b': | ||
81 | - basepairs.append((i, Bs.pop())) | ||
82 | - return basepairs | ||
83 | - | ||
84 | - def get_MCC_with(self, reference_structure): | ||
85 | - # Get true and false positives and negatives | ||
86 | - tp = 0 | ||
87 | - fp = 0 | ||
88 | - tn = 0 | ||
89 | - fn = 0 | ||
90 | - for bp in reference_structure.basepair_list: | ||
91 | - if bp in self.basepair_list: | ||
92 | - tp += 1 | ||
93 | - else: | ||
94 | - fn += 1 | ||
95 | - for bp in self.basepair_list: | ||
96 | - if bp not in reference_structure.basepair_list: | ||
97 | - fp += 1 | ||
98 | - tn = reference_structure.length * (reference_structure.length - 1) * 0.5 - fp - fn - tp | ||
99 | - | ||
100 | - # Compute MCC | ||
101 | - if (tp+fp == 0): | ||
102 | - print("We have an issue : no positives detected ! (linear structure)") | ||
103 | - return (tp*tn-fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)) | ||
104 | - | ||
105 | - def get_MCC_ctc_with(self, reference_structure): | ||
106 | - # Get true and false positives and negatives | ||
107 | - tp = 0 | ||
108 | - fp = 0 | ||
109 | - tn = 0 | ||
110 | - fn = 0 | ||
111 | - prediction = self.ctc | ||
112 | - true_ctc = reference_structure.ctc | ||
113 | - for i in range(len(true_ctc)): | ||
114 | - if true_ctc[i] == '*' and prediction[i] == '*': | ||
115 | - tp += 1 | ||
116 | - elif true_ctc[i] == '.' and prediction[i] == '.': | ||
117 | - tn += 1 | ||
118 | - elif true_ctc[i] == '.' and prediction[i] == '*': | ||
119 | - fp += 1 | ||
120 | - elif true_ctc[i] == '*' and prediction[i] == '.': | ||
121 | - fn += 1 | ||
122 | - # print(str(tp) + " " + str(tn) + " " + str(fp) + " " + str(fn) + "\n") | ||
123 | - | ||
124 | - result = (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) | ||
125 | - # Compute MCC | ||
126 | - if ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) == 0): | ||
127 | - print("warning: division by zero!") | ||
128 | - return None | ||
129 | - elif (tp + fp == 0): | ||
130 | - print("We have an issue : no positives detected ! (linear structure)") | ||
131 | - return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) | ||
132 | - | ||
133 | -class Pareto: | ||
134 | - def __init__(self, list_of_structs, reference): | ||
135 | - self.predictions = list_of_structs | ||
136 | - self.true_structure = reference | ||
137 | - self.n_pred = len(list_of_structs) | ||
138 | - self.max_obj1 = max([s.objectives[0] for s in self.predictions ]) | ||
139 | - self.max_obj2 = max([s.objectives[1] for s in self.predictions ]) | ||
140 | - self.index_of_best = self.find_best_solution() | ||
141 | - self.index_of_best_ctc = self.find_best_solution_ctc() | ||
142 | - | ||
143 | - def find_best_solution(self): | ||
144 | - # returns the index of the solution of the Pareto set which is the closest | ||
145 | - # to the real 2D structure (the one with the max MCC) | ||
146 | - max_i = -1 | ||
147 | - max_mcc = -1 | ||
148 | - for i,s in enumerate(self.predictions): | ||
149 | - mcc = s.get_MCC_with(self.true_structure) | ||
150 | - if mcc > max_mcc: | ||
151 | - max_mcc = mcc | ||
152 | - max_i = i | ||
153 | - print("\n" + "max mcc str: " + str(max_mcc)) | ||
154 | - return max_i | ||
155 | - | ||
156 | - def find_best_solution_ctc(self): | ||
157 | - # returns the index of the solution of the Pareto set which is the closest | ||
158 | - # to the real contacts area (the one with the max MCC) | ||
159 | - max_i = -1 | ||
160 | - max_mcc = -1 | ||
161 | - for i,s in enumerate(self.predictions): | ||
162 | - mcc = s.get_MCC_ctc_with(self.true_structure) | ||
163 | - if mcc is None: | ||
164 | - continue | ||
165 | - elif mcc > max_mcc: | ||
166 | - max_mcc = mcc | ||
167 | - max_i = i | ||
168 | - return max_i | ||
169 | - | ||
170 | - def get_normalized_coords(self): | ||
171 | - # retrieves the objective values of the best solution and normalizes them | ||
172 | - coords = self.predictions[self.index_of_best].objectives | ||
173 | - if self.max_obj1: # avoid divide by zero if all solutions are 0 | ||
174 | - x = coords[0]/self.max_obj1 | ||
175 | - else: | ||
176 | - x = 0.5 | ||
177 | - if self.max_obj2: # avoid divide by zero if all solutions are 0 | ||
178 | - y = coords[1]/self.max_obj2 | ||
179 | - else: | ||
180 | - y = 0.5 | ||
181 | - return ( x,y ) | ||
182 | - | ||
183 | - def get_normalized_coords_ctc(self): | ||
184 | - CRED = '\033[91m' | ||
185 | - CEND = '\033[0m' | ||
186 | - CGREEN = '\33[32m' | ||
187 | - CBLUE = '\33[34m' | ||
188 | - # retrieves the objective values of the best solution and normalizes them | ||
189 | - coords = self.predictions[self.index_of_best_ctc].objectives | ||
190 | - if self.max_obj1: # avoid divide by zero if all solutions are 0 | ||
191 | - x = coords[0]/self.max_obj1 | ||
192 | - else: | ||
193 | - x = 0.5 | ||
194 | - """if(x < 0.5): | ||
195 | - print("\n" + CRED + self.predictions[self.index_of_best_ctc].name + CEND) | ||
196 | - print(CRED + self.predictions[self.index_of_best_ctc].ctc + CEND) | ||
197 | - print("count: " + str(self.predictions[self.index_of_best_ctc].ctc.count("*"))) | ||
198 | - print(CRED + self.true_structure.ctc + CEND) | ||
199 | - print("count: " + str(self.true_structure.ctc.count("*")) + "\n") | ||
200 | - | ||
201 | - elif(x >= 0.5 and type(self.predictions[self.index_of_best_ctc].ctc)) is str: | ||
202 | - print("\n" + CGREEN + self.predictions[self.index_of_best_ctc].name + CEND) | ||
203 | - print(CGREEN + self.predictions[self.index_of_best_ctc].ctc + CEND) | ||
204 | - print("count: " + str(self.predictions[self.index_of_best_ctc].ctc.count("*"))) | ||
205 | - print(CGREEN + self.true_structure.ctc + CEND) | ||
206 | - print("count: " + str(self.true_structure.ctc.count("*")) + "\n")""" | ||
207 | - | ||
208 | - if self.max_obj2: # avoid divide by zero if all solutions are 0 | ||
209 | - y = coords[1]/self.max_obj2 | ||
210 | - else: | ||
211 | - y = 0.5 | ||
212 | - return ( x,y ) | ||
213 | - | ||
214 | -class RNA: | ||
215 | - def __init__(self, filename, header, seq, struct, contacts): | ||
216 | - self.seq_ = seq | ||
217 | - self.header_ = header | ||
218 | - self.struct_ = struct | ||
219 | - self.contacts_ = contacts | ||
220 | - self.basename_ = filename | ||
221 | - | ||
222 | - | ||
223 | -ignored_nt_dict = {} | ||
224 | -def is_canonical_nts(seq): | ||
225 | - for c in seq[:-1]: | ||
226 | - if c not in "ACGU": | ||
227 | - if c in ignored_nt_dict.keys(): | ||
228 | - ignored_nt_dict[c] += 1 | ||
229 | - else: | ||
230 | - ignored_nt_dict[c] = 1 | ||
231 | - return False | ||
232 | - return True | ||
233 | - | ||
234 | -def is_canonical_bps(struct): | ||
235 | - if "()" in struct: | ||
236 | - return False | ||
237 | - if "(.)" in struct: | ||
238 | - return False | ||
239 | - if "(..)" in struct: | ||
240 | - return False | ||
241 | - if "[]" in struct: | ||
242 | - return False | ||
243 | - if "[.]" in struct: | ||
244 | - return False | ||
245 | - if "[..]" in struct: | ||
246 | - return False | ||
247 | - return True | ||
248 | - | ||
249 | -def load_from_dbn(file, header_style=1): | ||
250 | - container = [] | ||
251 | - counter = 0 | ||
252 | - db = open(file, "r") | ||
253 | - c = 0 | ||
254 | - header = "" | ||
255 | - seq = "" | ||
256 | - struct = "" | ||
257 | - while True: | ||
258 | - l = db.readline() | ||
259 | - if l == "": | ||
260 | - break | ||
261 | - c += 1 | ||
262 | - c = c % 4 | ||
263 | - if c == 1: | ||
264 | - header = l[:-1] | ||
265 | - if c == 2: | ||
266 | - seq = l[:-1].upper() | ||
267 | - if c == 3: | ||
268 | - struct = l[:-1] | ||
269 | - n = len(seq) | ||
270 | - if c == 0: | ||
271 | - contacts = l[:-1] | ||
272 | - if is_canonical_nts(seq) and is_canonical_bps(struct): | ||
273 | - if header_style == 1: container.append(RNA(header.replace('/', '_').split('(')[-1][:-1], header, seq, struct, contacts)) | ||
274 | - if header_style == 2: container.append(RNA(header.replace('/', '_').split('[')[-1][:-41], header, seq, struct, contacts)) | ||
275 | - if '[' in struct: counter += 1 | ||
276 | - db.close() | ||
277 | - return container, counter | ||
278 | - | ||
279 | -def parse_biokop(folder, basename, ext=".biok"): | ||
280 | - solutions = [] | ||
281 | - if os.path.isfile(os.path.join(folder, basename + ext)): | ||
282 | - rna = open(os.path.join(folder, basename + ext), "r") | ||
283 | - lines = rna.readlines() | ||
284 | - rna.close() | ||
285 | - different_2ds = [] | ||
286 | - for s in lines[1:]: | ||
287 | - if s == '\n': | ||
288 | - continue | ||
289 | - splitted = s.split('\t') | ||
290 | - db2d = splitted[0] | ||
291 | - if db2d not in different_2ds: | ||
292 | - different_2ds.append(db2d) | ||
293 | - # here is a negative sign because Biokop actually minimizes -MEA instead | ||
294 | - # of maximizing MEA : we switch back to MEA | ||
295 | - solutions.append(SecStruct(basename, db2d, -float(splitted[1]), -float(splitted[2][:-1]))) | ||
296 | - | ||
297 | - # check the range of MEA in this pareto set | ||
298 | - min_mea = solutions[0].objectives[1] | ||
299 | - max_mea = min_mea | ||
300 | - for s in solutions: | ||
301 | - mea = s.objectives[1] | ||
302 | - if mea < min_mea: | ||
303 | - min_mea = mea | ||
304 | - if mea > max_mea: | ||
305 | - max_mea = mea | ||
306 | - | ||
307 | - # normalize so the minimum MEA of the set is 0 | ||
308 | - for i in range(len(solutions)): | ||
309 | - solutions[i].objectives[1] -= min_mea | ||
310 | - | ||
311 | - if len(different_2ds) > 1: | ||
312 | - return solutions | ||
313 | - else: | ||
314 | - print("[%s] \033[36mWARNING: ignoring this RNA, only one 2D solution is found.\033[0m" % (basename)) | ||
315 | - else: | ||
316 | - print("[%s] \033[36mWARNING: file not found !\033[0m" % (basename)) | ||
317 | - | ||
318 | -def parse_biorseo(folder, basename, ext): | ||
319 | - solutions = [] | ||
320 | - print(basename + ext) | ||
321 | - if os.path.isfile(os.path.join(folder, basename + ext)): | ||
322 | - rna = open(os.path.join(folder, basename + ext), "r") | ||
323 | - lines = rna.readlines() | ||
324 | - rna.close() | ||
325 | - different_2ds = [] | ||
326 | - contacts = [] | ||
327 | - str2d = [] | ||
328 | - count = 0; | ||
329 | - for s in lines[2:]: | ||
330 | - count = count + 1 | ||
331 | - if s == '\n': | ||
332 | - continue | ||
333 | - splitted = s.split('\t') | ||
334 | - if(count % 2 == 1): | ||
335 | - obj1 = float(splitted[1]) | ||
336 | - obj2 = float(splitted[2][:-1]) | ||
337 | - db2d = splitted[0].split(' ')[0] | ||
338 | - if db2d not in different_2ds: | ||
339 | - if(s.find('(') != -1): | ||
340 | - different_2ds.append(db2d) | ||
341 | - if(s.find('*') != -1): | ||
342 | - contacts = db2d | ||
343 | - solutions.append(SecStruct(basename, str2d, contacts, obj1, obj2)) | ||
344 | - elif(s.find('(') != -1): | ||
345 | - str2d = db2d | ||
346 | - if len(different_2ds) > 1: | ||
347 | - return solutions | ||
348 | - else: | ||
349 | - print("[%s] \033[36mWARNING: ignoring this RNA, only one 2D or contacts solution is found.\033[0m" % (basename)) | ||
350 | - else: | ||
351 | - print("[%s] \033[36mWARNING: file not found !\033[0m" % (basename)) | ||
352 | - return None | ||
353 | - | ||
354 | -def prettify_biorseo(code): | ||
355 | - name = "" | ||
356 | - if "bgsu" in code: | ||
357 | - name += "RNA 3D Motif Atlas + " | ||
358 | - elif "json" in code: | ||
359 | - name += "Motifs d'Isaure + Direct P.M" | ||
360 | - else: | ||
361 | - name += "Rna3Dmotifs + " | ||
362 | - if "raw" in code: | ||
363 | - name += "Direct P.M." | ||
364 | - if "byp" in code: | ||
365 | - name += "BPairing" | ||
366 | - if "jar3d" in code: | ||
367 | - name += "Jar3d" | ||
368 | - # name += " + $f_{1" + code[-1] + "}$" | ||
369 | - return name | ||
370 | - | ||
371 | -# Parse options | ||
372 | -try: | ||
373 | - opts, args = getopt.getopt( sys.argv[1:], "", | ||
374 | - [ "json_pmE", | ||
375 | - "json_pmF", | ||
376 | - "folder=", | ||
377 | - "database=", | ||
378 | - "output=" | ||
379 | - ]) | ||
380 | -except getopt.GetoptError as err: | ||
381 | - print(err) | ||
382 | - sys.exit(2) | ||
383 | - | ||
384 | -results_folder = "." | ||
385 | -extension = "all" | ||
386 | -outputf = "" | ||
387 | -for opt, arg in opts: | ||
388 | - if opt == "--biokop": | ||
389 | - extension = ".biok" | ||
390 | - parse = parse_biokop | ||
391 | - elif opt == "--folder": | ||
392 | - results_folder = arg | ||
393 | - elif opt == "--database": | ||
394 | - database = arg | ||
395 | - elif opt == "--output": | ||
396 | - outputf = arg | ||
397 | - else: | ||
398 | - extension = '.' + opt[2:] | ||
399 | - parse = parse_biorseo | ||
400 | - | ||
401 | -RNAcontainer, _ = load_from_dbn(database) | ||
402 | - | ||
403 | -if results_folder[-1] != '/': | ||
404 | - results_folder = results_folder + '/' | ||
405 | -if outputf == "": | ||
406 | - outputf = results_folder | ||
407 | -if outputf[-1] != '/': | ||
408 | - outputf = outputf + '/' | ||
409 | - | ||
410 | -def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution performs\nwell on obj1", ylabel="Best solution performs\n well on obj2"): | ||
411 | - points = [] | ||
412 | - sizes = [] | ||
413 | - for rna in RNAcontainer: | ||
414 | - # Extracting the predictions from the results file | ||
415 | - solutions = parse(results_folder, rna.basename_, ext) | ||
416 | - reference = SecStruct(rna.basename_, rna.struct_, rna.contacts_, float("inf"), float("inf")) | ||
417 | - if solutions is None: | ||
418 | - continue | ||
419 | - pset = Pareto(solutions, reference) | ||
420 | - points.append(pset.get_normalized_coords()) | ||
421 | - sizes.append(pset.n_pred) | ||
422 | - print("[%s] Loaded %d solutions in a Pareto set, max(obj1)=%f, max(obj2)=%f" % (rna.basename_, pset.n_pred, pset.max_obj1, pset.max_obj2)) | ||
423 | - print("Loaded %d points on %d." % (len(points), len(RNAcontainer))) | ||
424 | - | ||
425 | - x = np.array([ p[0] for p in points ]) | ||
426 | - y = np.array([ p[1] for p in points ]) | ||
427 | - xmin, xmax = 0, 1 | ||
428 | - ymin, ymax = 0, 1 | ||
429 | - xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] | ||
430 | - positions = np.vstack([xx.ravel(), yy.ravel()]) | ||
431 | - values = np.vstack([x, y]) | ||
432 | - kernel = st.gaussian_kde(values) | ||
433 | - f = np.reshape(kernel(positions).T, xx.shape) | ||
434 | - ax[pos].axhline(y=0, alpha=0.2, color='black') | ||
435 | - ax[pos].axhline(y=1, alpha=0.2, color='black') | ||
436 | - ax[pos].axvline(x=0, alpha=0.2, color='black') | ||
437 | - ax[pos].axvline(x=1, alpha=0.2, color='black') | ||
438 | - ax[pos].contourf(xx, yy, f, cmap=cm.Blues, alpha=0.5) | ||
439 | - ax[pos].scatter(x, y, s=25, alpha=0.1) | ||
440 | - ax[pos].set_xlim((-0.1,1.1)) | ||
441 | - ax[pos].set_ylim((-0.1,1.1)) | ||
442 | - ax[pos].set_title(prettify_biorseo(ext[1:]), fontsize=10) | ||
443 | - ax[pos].annotate("("+str(len(points))+'/'+str(len(RNAcontainer))+" RNAs)", (0.08, 0.15)) | ||
444 | - ax[pos].set_xlabel(xlabel) | ||
445 | - ax[pos].set_ylabel(ylabel) | ||
446 | - | ||
447 | - if nsolutions: | ||
448 | - ax[pos+1].hist(sizes, bins=range(0, max(sizes)+1, 2), histtype='bar') | ||
449 | - ax[pos+1].set_xlim((0,max(sizes)+2)) | ||
450 | - ax[pos+1].set_xticks(range(0, max(sizes), 10)) | ||
451 | - ax[pos+1].set_xticklabels(range(0, max(sizes), 10), rotation=90) | ||
452 | - ax[pos+1].set_xlabel("# solutions") | ||
453 | - ax[pos+1].set_ylabel("# RNAs") | ||
454 | - | ||
455 | -def process_extension_ctc(ax, pos, ext, nsolutions=False, xlabel="Best solution performs\nwell on obj1", ylabel="Best solution performs\n well on obj2"): | ||
456 | - points = [] | ||
457 | - sizes = [] | ||
458 | - for rna in RNAcontainer: | ||
459 | - # Extracting the predictions from the results file | ||
460 | - solutions = parse(results_folder, rna.basename_, ext) | ||
461 | - reference = SecStruct(rna.basename_, rna.struct_, rna.contacts_, float("inf"), float("inf")) | ||
462 | - if solutions is None: | ||
463 | - continue | ||
464 | - pset = Pareto(solutions, reference) | ||
465 | - points.append(pset.get_normalized_coords_ctc()) | ||
466 | - sizes.append(pset.n_pred) | ||
467 | - print("[%s] Loaded %d solutions in a Pareto set, max(obj1)=%f, max(obj2)=%f" % (rna.basename_, pset.n_pred, pset.max_obj1, pset.max_obj2)) | ||
468 | - print("Loaded %d points on %d." % (len(points), len(RNAcontainer))) | ||
469 | - | ||
470 | - x = np.array([ p[0] for p in points ]) | ||
471 | - y = np.array([ p[1] for p in points ]) | ||
472 | - xmin, xmax = 0, 1 | ||
473 | - ymin, ymax = 0, 1 | ||
474 | - xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] | ||
475 | - positions = np.vstack([xx.ravel(), yy.ravel()]) | ||
476 | - values = np.vstack([x, y]) | ||
477 | - kernel = st.gaussian_kde(values) | ||
478 | - f = np.reshape(kernel(positions).T, xx.shape) | ||
479 | - ax[pos].axhline(y=0, alpha=0.2, color='black') | ||
480 | - ax[pos].axhline(y=1, alpha=0.2, color='black') | ||
481 | - ax[pos].axvline(x=0, alpha=0.2, color='black') | ||
482 | - ax[pos].axvline(x=1, alpha=0.2, color='black') | ||
483 | - ax[pos].contourf(xx, yy, f, cmap=cm.Blues, alpha=0.5) | ||
484 | - ax[pos].scatter(x, y, s=25, alpha=0.1) | ||
485 | - ax[pos].set_xlim((-0.1,1.1)) | ||
486 | - ax[pos].set_ylim((-0.1,1.1)) | ||
487 | - ax[pos].set_title(prettify_biorseo(ext[1:]), fontsize=10) | ||
488 | - ax[pos].annotate("("+str(len(points))+'/'+str(len(RNAcontainer))+" RNAs)", (0.08,0.15)) | ||
489 | - ax[pos].set_xlabel(xlabel) | ||
490 | - ax[pos].set_ylabel(ylabel) | ||
491 | - | ||
492 | - if nsolutions: | ||
493 | - ax[pos+1].hist(sizes, bins=range(0, max(sizes)+1, 2), histtype='bar') | ||
494 | - ax[pos+1].set_xlim((0,max(sizes)+2)) | ||
495 | - ax[pos+1].set_xticks(range(0, max(sizes), 10)) | ||
496 | - ax[pos+1].set_xticklabels(range(0, max(sizes), 10), rotation=90) | ||
497 | - ax[pos+1].set_xlabel("# solutions") | ||
498 | - ax[pos+1].set_ylabel("# RNAs") | ||
499 | - | ||
500 | - | ||
501 | -if extension == "all": | ||
502 | - parse = parse_biorseo | ||
503 | - fig, ax = plt.subplots(1, 2, figsize=(10, 5), sharey=True) | ||
504 | - ax = ax.flatten() | ||
505 | - process_extension(ax, 0, ".json_pmF_MEA", xlabel="Normalized $f_{1E}$", ylabel="Normalized MEA") | ||
506 | - print("--------------------------------------------------------------------------------------------") | ||
507 | - process_extension_ctc(ax, 1, ".json_pmF_MEA", xlabel="Normalized $f_{1E}$", ylabel="Normalized MEA") | ||
508 | - print("--------------------------------------------------------------------------------------------") | ||
509 | - | ||
510 | - for a in ax: | ||
511 | - a.label_outer() | ||
512 | - plt.subplots_adjust(bottom=0.2, top=0.9, left=0.07, right=0.98, hspace=0.05, wspace=0.05) | ||
513 | - plt.savefig("pareto_visualizer_json_MEA_functionF.png") | ||
514 | -else: | ||
515 | - fig, ax = plt.subplots(2,1, figsize=(6,5)) | ||
516 | - plt.subplots_adjust(bottom=0.12, top=0.9, left=0.15, right=0.9, hspace=0.4) | ||
517 | - if extension == ".biok": | ||
518 | - process_extension(ax, 0, extension, nsolutions=True, xlabel="Normalized MFE", ylabel="Normalized MFE") | ||
519 | - else: | ||
520 | - process_extension(ax, 0, extension, nsolutions=False) | ||
521 | - plt.savefig("pareto_visualizer_ext.png") | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment