Louis BECQUEY

cleaned scripts/ and figures/ folders

1 +What are this RNA data files ?
2 +===============================
3 +
4 +## Raw (big) databases
5 +* RNA-Strand 2.0 (secondary_structures_database.dbn) : this file is a dataset supposed to be identical to RNA-Strand 2.0 (actually the file is present on IBISC machines for years now and nobody remembers how it was built). The former RNA Strand website is not online anymore (http://rnasoft.ca/strand).
6 +* bpRNA-1m_90 : this huge database gathers the data from other databases (CRW, PDB, Rfam, RNP, SPR, SRP, ...) and superseeds RNA-Strand (minus the structures that are only in NDB, sadly). Sequences have been prefiltered to have no more than 90% identity. Source : http://bprna.cgrb.oregonstate.edu/
7 +* Pseudobase(++) : A database of biologically validated pseudoknots, from the time discovering a pseudoknot was something unusual. Pseudobase stays famous for its pseudoknot classification scheme. I scraped it myself to build the file. Source : https://www.ekevanbatenburg.nl/PKBASE/PKB.HTML
8 +
9 +
10 +## Filtered databases
11 +* verified_secondary_structures.dbn : The subset of RNA-Strand that was experimentally validated (basically, the ones for which a 3D structure was available, so the ones from NDB and PDB).
12 +* The _short.dbn ones : Same as its parent, but filtered using the filter.py script.
13 +* pseudoknots.dbn : Audrey Legendre's scrap of Pseudobase, which, for an unknow reason, does not contain all the available data, but nice descriptions of what the RNAs are.
14 +
15 +
16 +## Small test databases
17 +* RNA-MoIP dataset : The cherry-picked cases presented in Reinhartz et al. 2012 to show RNA-MoIP's performance.
18 +* applications.dbn : My cherry-picked cases presented in Becquey et al. 2020 to show Biorseo's performance.
19 +* example.dbn : an example database with only one RNA, for testing purposes
20 +* nothing.dbn : an example database with no RNAs, for testing purposes
21 +
22 +
23 +Enjoy benchmarking RNA structure prediction tools.
...\ No newline at end of file ...\ No newline at end of file

99.5 KB | W: | H:

156 KB | W: | H:

  • 2-up
  • Swipe
  • Onion skin

32.4 KB | W: | H:

30.4 KB | W: | H:

  • 2-up
  • Swipe
  • Onion skin
...@@ -158,7 +158,6 @@ def is_canonical_nts(seq): ...@@ -158,7 +158,6 @@ def is_canonical_nts(seq):
158 return False 158 return False
159 return True 159 return True
160 160
161 -
162 def is_canonical_bps(struct): 161 def is_canonical_bps(struct):
163 if "()" in struct: 162 if "()" in struct:
164 return False 163 return False
...@@ -207,7 +206,6 @@ def load_from_dbn(file, header_style=3): ...@@ -207,7 +206,6 @@ def load_from_dbn(file, header_style=3):
207 db.close() 206 db.close()
208 return container, pkcounter 207 return container, pkcounter
209 208
210 -
211 def parse_biokop(folder, basename, ext=".biok"): 209 def parse_biokop(folder, basename, ext=".biok"):
212 solutions = [] 210 solutions = []
213 err = 0 211 err = 0
...@@ -248,7 +246,6 @@ def parse_biokop(folder, basename, ext=".biok"): ...@@ -248,7 +246,6 @@ def parse_biokop(folder, basename, ext=".biok"):
248 err = 1 246 err = 1
249 return None, err 247 return None, err
250 248
251 -
252 def parse_biorseo(folder, basename, ext): 249 def parse_biorseo(folder, basename, ext):
253 solutions = [] 250 solutions = []
254 err = 0 251 err = 0
...@@ -272,21 +269,14 @@ def parse_biorseo(folder, basename, ext): ...@@ -272,21 +269,14 @@ def parse_biorseo(folder, basename, ext):
272 err = 1 269 err = 1
273 return None, err 270 return None, err
274 271
275 -
276 def prettify_biorseo(code): 272 def prettify_biorseo(code):
277 name = "" 273 name = ""
278 - if "bgsu" in code: 274 + if "json" in code:
279 - name += "RNA 3D Motif Atlas + " 275 + name += "JSON motifs + "
280 elif "rin" in code: 276 elif "rin" in code:
281 name += "CaRNAval + " 277 name += "CaRNAval + "
282 else: 278 else:
283 name += "Rna3Dmotifs + " 279 name += "Rna3Dmotifs + "
284 - if "raw" in code:
285 - name += "Direct P.M."
286 - if "byp" in code:
287 - name += "BPairing"
288 - if "jar3d" in code:
289 - name += "Jar3d"
290 # name += " + $f_{1" + code[-1] + "}$" 280 # name += " + $f_{1" + code[-1] + "}$"
291 return name 281 return name
292 282
...@@ -342,14 +332,9 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf ...@@ -342,14 +332,9 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf
342 if __name__ == "__main__": 332 if __name__ == "__main__":
343 try: 333 try:
344 opts, args = getopt.getopt( sys.argv[1:], "", 334 opts, args = getopt.getopt( sys.argv[1:], "",
345 - [ "biorseo_desc_byp_A", "biorseo_desc_byp_B", 335 + [ "biorseo_desc_A", "biorseo_desc_B",
346 - "biorseo_desc_byp_C", "biorseo_desc_byp_D", 336 + "biorseo_rin_A", "biorseo_rin_B",
347 - "biorseo_bgsu_byp_A", "biorseo_bgsu_byp_B", 337 + "biorseo_json_A", "biorseo_json_B",
348 - "biorseo_bgsu_byp_C", "biorseo_bgsu_byp_D",
349 - "biorseo_desc_raw_A", "biorseo_desc_raw_B",
350 - "biorseo_bgsu_jar3d_A", "biorseo_bgsu_jar3d_B",
351 - "biorseo_bgsu_jar3d_C", "biorseo_bgsu_jar3d_D",
352 - "biorseo_rin_raw_A", "biorseo_rin_raw_B",
353 "biokop", "folder=", "database=", "output=" 338 "biokop", "folder=", "database=", "output="
354 ]) 339 ])
355 except getopt.GetoptError as err: 340 except getopt.GetoptError as err:
...@@ -384,36 +369,19 @@ if __name__ == "__main__": ...@@ -384,36 +369,19 @@ if __name__ == "__main__":
384 369
385 if extension == "all": 370 if extension == "all":
386 parse = parse_biorseo 371 parse = parse_biorseo
387 - fig, ax = plt.subplots(4,5,figsize=(12,10), sharex=True, sharey=True) 372 + fig, ax = plt.subplots(2,3,figsize=(8,10), sharex=True, sharey=True)
388 ax = ax.flatten() 373 ax = ax.flatten()
389 - process_extension(ax, 0, ".biorseo_desc_raw_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") 374 + process_extension(ax, 0, ".biorseo_desc_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
390 - process_extension(ax, 1, ".biorseo_rin_raw_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") 375 + process_extension(ax, 1, ".biorseo_rin_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
391 - process_extension(ax, 2, ".biorseo_desc_byp_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") 376 + process_extension(ax, 2, ".biorseo_json_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
392 - process_extension(ax, 3, ".biorseo_bgsu_byp_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") 377 + ax[0].set_title(prettify_biorseo("biorseo_desc_A"), fontsize=10)
393 - process_extension(ax, 4, ".biorseo_bgsu_jar3d_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA") 378 + ax[1].set_title(prettify_biorseo("biorseo_rin_A"), fontsize=10)
394 - ax[0].set_title(prettify_biorseo("biorseo_desc_raw_A"), fontsize=10) 379 + ax[2].set_title(prettify_biorseo("biorseo_json_A"), fontsize=10)
395 - ax[1].set_title(prettify_biorseo("biorseo_rin_raw_A"), fontsize=10) 380 +
396 - ax[2].set_title(prettify_biorseo("biorseo_desc_byp_A"), fontsize=10) 381 + process_extension(ax, 3, ".biorseo_desc_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
397 - ax[3].set_title(prettify_biorseo("biorseo_bgsu_byp_A"), fontsize=10) 382 + process_extension(ax, 4, ".biorseo_rin_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
398 - ax[4].set_title(prettify_biorseo("biorseo_bgsu_jar3d_A"), fontsize=10) 383 + process_extension(ax, 5, ".biorseo_json_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
399 - 384 +
400 - process_extension(ax, 5, ".biorseo_desc_raw_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
401 - process_extension(ax, 6, ".biorseo_rin_raw_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
402 - process_extension(ax, 7, ".biorseo_desc_byp_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
403 - process_extension(ax, 8, ".biorseo_bgsu_byp_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
404 - process_extension(ax, 9, ".biorseo_bgsu_jar3d_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
405 -
406 - process_extension(ax, 12, ".biorseo_desc_byp_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA")
407 - process_extension(ax, 13, ".biorseo_bgsu_byp_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA")
408 - process_extension(ax, 14, ".biorseo_bgsu_jar3d_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA")
409 - ax[10].axis("off")
410 - ax[11].axis("off")
411 -
412 - process_extension(ax, 17, ".biorseo_desc_byp_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA")
413 - process_extension(ax, 18, ".biorseo_bgsu_byp_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA")
414 - process_extension(ax, 19, ".biorseo_bgsu_jar3d_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA")
415 - ax[15].axis("off")
416 - ax[16].axis("off")
417 for a in ax: 385 for a in ax:
418 a.label_outer() 386 a.label_outer()
419 plt.subplots_adjust(bottom=0.05, top=0.95, left=0.07, right=0.98, hspace=0.1, wspace = 0.05) 387 plt.subplots_adjust(bottom=0.05, top=0.95, left=0.07, right=0.98, hspace=0.1, wspace = 0.05)
......
1 -#!/usr/bin/python3
2 -# Created by Louis Becquey, louis.becquey@univ-evry.fr, Oct 2019
3 -# This script processes files containing RNA structures obtained from bi-objective
4 -# optimization programs, and a dot-bracket database of reference structures, to plot
5 -# where are the best solutions in the Pareto set.
6 -#
7 -# The result files should follow this kind of format:
8 -# for Biokop: (option --biokop)
9 -# Structure Free energy score Expected accuracy score
10 -# (((...(((...)))))) <tab> obj1_value <tab> obj2_value
11 -# (((............))) <tab> obj1_value <tab> obj2_value
12 -# ((((((...)))...))) <tab> obj1_value <tab> obj2_value
13 -# ...
14 -#
15 -# for BiORSEO: (options --biorseo_**stuff**)
16 -# >Header of the sequence
17 -# GGCACAGAGUUAUGUGCC
18 -# (((...(((...)))))) + Motif1 + Motif2 <tab> obj1_value <tab> obj2_value
19 -# (((............))) <tab> obj1_value <tab> obj2_value
20 -# ((((((...)))...))) + Motif1 <tab> obj1_value <tab> obj2_value
21 -#
22 -# typical Biokop usage:
23 -# python3 pareto_visualizer.py --biokop --folder path/to/your/results/folder --database path/to/the/database_file.dbn
24 -# typical Biorseo usage:
25 -# python3 pareto_visualizer_json.py --folder path/to/your/results/folder (pmE et pmF) --database path/to/the/database_file.dbn (nom, sequence, structure)
26 -#
27 -
28 -from math import sqrt
29 -import numpy as np
30 -import matplotlib.pyplot as plt
31 -from matplotlib import cm
32 -import scipy.stats as st
33 -import sys
34 -import os
35 -import subprocess
36 -import getopt
37 -
38 -class SecStruct:
39 - def __init__(self, name, dot_bracket, contacts, obj1_value, obj2_value):
40 - self.name = name
41 - self.dbn = dot_bracket
42 - self.ctc = contacts
43 - self.objectives = [ obj1_value, obj2_value ]
44 - self.basepair_list = self.get_basepairs()
45 - self.length = len(dot_bracket)
46 -
47 - def get_basepairs(self):
48 - parenthesis = []
49 - brackets = []
50 - braces = []
51 - rafters = []
52 - basepairs = []
53 - As = []
54 - Bs = []
55 - for i, c in enumerate(self.dbn):
56 - if c == '(':
57 - parenthesis.append(i)
58 - if c == '[':
59 - brackets.append(i)
60 - if c == '{':
61 - braces.append(i)
62 - if c == '<':
63 - rafters.append(i)
64 - if c == 'A':
65 - As.append(i)
66 - if c == 'B':
67 - Bs.append(i)
68 - if c == '.':
69 - continue
70 - if c == ')':
71 - basepairs.append((i, parenthesis.pop()))
72 - if c == ']':
73 - basepairs.append((i, brackets.pop()))
74 - if c == '}':
75 - basepairs.append((i, braces.pop()))
76 - if c == '>':
77 - basepairs.append((i, rafters.pop()))
78 - if c == 'a':
79 - basepairs.append((i, As.pop()))
80 - if c == 'b':
81 - basepairs.append((i, Bs.pop()))
82 - return basepairs
83 -
84 - def get_MCC_with(self, reference_structure):
85 - # Get true and false positives and negatives
86 - tp = 0
87 - fp = 0
88 - tn = 0
89 - fn = 0
90 - for bp in reference_structure.basepair_list:
91 - if bp in self.basepair_list:
92 - tp += 1
93 - else:
94 - fn += 1
95 - for bp in self.basepair_list:
96 - if bp not in reference_structure.basepair_list:
97 - fp += 1
98 - tn = reference_structure.length * (reference_structure.length - 1) * 0.5 - fp - fn - tp
99 -
100 - # Compute MCC
101 - if (tp+fp == 0):
102 - print("We have an issue : no positives detected ! (linear structure)")
103 - return (tp*tn-fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
104 -
105 - def get_MCC_ctc_with(self, reference_structure):
106 - # Get true and false positives and negatives
107 - tp = 0
108 - fp = 0
109 - tn = 0
110 - fn = 0
111 - prediction = self.ctc
112 - true_ctc = reference_structure.ctc
113 - for i in range(len(true_ctc)):
114 - if true_ctc[i] == '*' and prediction[i] == '*':
115 - tp += 1
116 - elif true_ctc[i] == '.' and prediction[i] == '.':
117 - tn += 1
118 - elif true_ctc[i] == '.' and prediction[i] == '*':
119 - fp += 1
120 - elif true_ctc[i] == '*' and prediction[i] == '.':
121 - fn += 1
122 - # print(str(tp) + " " + str(tn) + " " + str(fp) + " " + str(fn) + "\n")
123 -
124 - result = (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
125 - # Compute MCC
126 - if ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) == 0):
127 - print("warning: division by zero!")
128 - return None
129 - elif (tp + fp == 0):
130 - print("We have an issue : no positives detected ! (linear structure)")
131 - return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
132 -
133 -class Pareto:
134 - def __init__(self, list_of_structs, reference):
135 - self.predictions = list_of_structs
136 - self.true_structure = reference
137 - self.n_pred = len(list_of_structs)
138 - self.max_obj1 = max([s.objectives[0] for s in self.predictions ])
139 - self.max_obj2 = max([s.objectives[1] for s in self.predictions ])
140 - self.index_of_best = self.find_best_solution()
141 - self.index_of_best_ctc = self.find_best_solution_ctc()
142 -
143 - def find_best_solution(self):
144 - # returns the index of the solution of the Pareto set which is the closest
145 - # to the real 2D structure (the one with the max MCC)
146 - max_i = -1
147 - max_mcc = -1
148 - for i,s in enumerate(self.predictions):
149 - mcc = s.get_MCC_with(self.true_structure)
150 - if mcc > max_mcc:
151 - max_mcc = mcc
152 - max_i = i
153 - print("\n" + "max mcc str: " + str(max_mcc))
154 - return max_i
155 -
156 - def find_best_solution_ctc(self):
157 - # returns the index of the solution of the Pareto set which is the closest
158 - # to the real contacts area (the one with the max MCC)
159 - max_i = -1
160 - max_mcc = -1
161 - for i,s in enumerate(self.predictions):
162 - mcc = s.get_MCC_ctc_with(self.true_structure)
163 - if mcc is None:
164 - continue
165 - elif mcc > max_mcc:
166 - max_mcc = mcc
167 - max_i = i
168 - return max_i
169 -
170 - def get_normalized_coords(self):
171 - # retrieves the objective values of the best solution and normalizes them
172 - coords = self.predictions[self.index_of_best].objectives
173 - if self.max_obj1: # avoid divide by zero if all solutions are 0
174 - x = coords[0]/self.max_obj1
175 - else:
176 - x = 0.5
177 - if self.max_obj2: # avoid divide by zero if all solutions are 0
178 - y = coords[1]/self.max_obj2
179 - else:
180 - y = 0.5
181 - return ( x,y )
182 -
183 - def get_normalized_coords_ctc(self):
184 - CRED = '\033[91m'
185 - CEND = '\033[0m'
186 - CGREEN = '\33[32m'
187 - CBLUE = '\33[34m'
188 - # retrieves the objective values of the best solution and normalizes them
189 - coords = self.predictions[self.index_of_best_ctc].objectives
190 - if self.max_obj1: # avoid divide by zero if all solutions are 0
191 - x = coords[0]/self.max_obj1
192 - else:
193 - x = 0.5
194 - """if(x < 0.5):
195 - print("\n" + CRED + self.predictions[self.index_of_best_ctc].name + CEND)
196 - print(CRED + self.predictions[self.index_of_best_ctc].ctc + CEND)
197 - print("count: " + str(self.predictions[self.index_of_best_ctc].ctc.count("*")))
198 - print(CRED + self.true_structure.ctc + CEND)
199 - print("count: " + str(self.true_structure.ctc.count("*")) + "\n")
200 -
201 - elif(x >= 0.5 and type(self.predictions[self.index_of_best_ctc].ctc)) is str:
202 - print("\n" + CGREEN + self.predictions[self.index_of_best_ctc].name + CEND)
203 - print(CGREEN + self.predictions[self.index_of_best_ctc].ctc + CEND)
204 - print("count: " + str(self.predictions[self.index_of_best_ctc].ctc.count("*")))
205 - print(CGREEN + self.true_structure.ctc + CEND)
206 - print("count: " + str(self.true_structure.ctc.count("*")) + "\n")"""
207 -
208 - if self.max_obj2: # avoid divide by zero if all solutions are 0
209 - y = coords[1]/self.max_obj2
210 - else:
211 - y = 0.5
212 - return ( x,y )
213 -
214 -class RNA:
215 - def __init__(self, filename, header, seq, struct, contacts):
216 - self.seq_ = seq
217 - self.header_ = header
218 - self.struct_ = struct
219 - self.contacts_ = contacts
220 - self.basename_ = filename
221 -
222 -
223 -ignored_nt_dict = {}
224 -def is_canonical_nts(seq):
225 - for c in seq[:-1]:
226 - if c not in "ACGU":
227 - if c in ignored_nt_dict.keys():
228 - ignored_nt_dict[c] += 1
229 - else:
230 - ignored_nt_dict[c] = 1
231 - return False
232 - return True
233 -
234 -def is_canonical_bps(struct):
235 - if "()" in struct:
236 - return False
237 - if "(.)" in struct:
238 - return False
239 - if "(..)" in struct:
240 - return False
241 - if "[]" in struct:
242 - return False
243 - if "[.]" in struct:
244 - return False
245 - if "[..]" in struct:
246 - return False
247 - return True
248 -
249 -def load_from_dbn(file, header_style=1):
250 - container = []
251 - counter = 0
252 - db = open(file, "r")
253 - c = 0
254 - header = ""
255 - seq = ""
256 - struct = ""
257 - while True:
258 - l = db.readline()
259 - if l == "":
260 - break
261 - c += 1
262 - c = c % 4
263 - if c == 1:
264 - header = l[:-1]
265 - if c == 2:
266 - seq = l[:-1].upper()
267 - if c == 3:
268 - struct = l[:-1]
269 - n = len(seq)
270 - if c == 0:
271 - contacts = l[:-1]
272 - if is_canonical_nts(seq) and is_canonical_bps(struct):
273 - if header_style == 1: container.append(RNA(header.replace('/', '_').split('(')[-1][:-1], header, seq, struct, contacts))
274 - if header_style == 2: container.append(RNA(header.replace('/', '_').split('[')[-1][:-41], header, seq, struct, contacts))
275 - if '[' in struct: counter += 1
276 - db.close()
277 - return container, counter
278 -
279 -def parse_biokop(folder, basename, ext=".biok"):
280 - solutions = []
281 - if os.path.isfile(os.path.join(folder, basename + ext)):
282 - rna = open(os.path.join(folder, basename + ext), "r")
283 - lines = rna.readlines()
284 - rna.close()
285 - different_2ds = []
286 - for s in lines[1:]:
287 - if s == '\n':
288 - continue
289 - splitted = s.split('\t')
290 - db2d = splitted[0]
291 - if db2d not in different_2ds:
292 - different_2ds.append(db2d)
293 - # here is a negative sign because Biokop actually minimizes -MEA instead
294 - # of maximizing MEA : we switch back to MEA
295 - solutions.append(SecStruct(basename, db2d, -float(splitted[1]), -float(splitted[2][:-1])))
296 -
297 - # check the range of MEA in this pareto set
298 - min_mea = solutions[0].objectives[1]
299 - max_mea = min_mea
300 - for s in solutions:
301 - mea = s.objectives[1]
302 - if mea < min_mea:
303 - min_mea = mea
304 - if mea > max_mea:
305 - max_mea = mea
306 -
307 - # normalize so the minimum MEA of the set is 0
308 - for i in range(len(solutions)):
309 - solutions[i].objectives[1] -= min_mea
310 -
311 - if len(different_2ds) > 1:
312 - return solutions
313 - else:
314 - print("[%s] \033[36mWARNING: ignoring this RNA, only one 2D solution is found.\033[0m" % (basename))
315 - else:
316 - print("[%s] \033[36mWARNING: file not found !\033[0m" % (basename))
317 -
318 -def parse_biorseo(folder, basename, ext):
319 - solutions = []
320 - print(basename + ext)
321 - if os.path.isfile(os.path.join(folder, basename + ext)):
322 - rna = open(os.path.join(folder, basename + ext), "r")
323 - lines = rna.readlines()
324 - rna.close()
325 - different_2ds = []
326 - contacts = []
327 - str2d = []
328 - count = 0;
329 - for s in lines[2:]:
330 - count = count + 1
331 - if s == '\n':
332 - continue
333 - splitted = s.split('\t')
334 - if(count % 2 == 1):
335 - obj1 = float(splitted[1])
336 - obj2 = float(splitted[2][:-1])
337 - db2d = splitted[0].split(' ')[0]
338 - if db2d not in different_2ds:
339 - if(s.find('(') != -1):
340 - different_2ds.append(db2d)
341 - if(s.find('*') != -1):
342 - contacts = db2d
343 - solutions.append(SecStruct(basename, str2d, contacts, obj1, obj2))
344 - elif(s.find('(') != -1):
345 - str2d = db2d
346 - if len(different_2ds) > 1:
347 - return solutions
348 - else:
349 - print("[%s] \033[36mWARNING: ignoring this RNA, only one 2D or contacts solution is found.\033[0m" % (basename))
350 - else:
351 - print("[%s] \033[36mWARNING: file not found !\033[0m" % (basename))
352 - return None
353 -
354 -def prettify_biorseo(code):
355 - name = ""
356 - if "bgsu" in code:
357 - name += "RNA 3D Motif Atlas + "
358 - elif "json" in code:
359 - name += "Motifs d'Isaure + Direct P.M"
360 - else:
361 - name += "Rna3Dmotifs + "
362 - if "raw" in code:
363 - name += "Direct P.M."
364 - if "byp" in code:
365 - name += "BPairing"
366 - if "jar3d" in code:
367 - name += "Jar3d"
368 - # name += " + $f_{1" + code[-1] + "}$"
369 - return name
370 -
371 -# Parse options
372 -try:
373 - opts, args = getopt.getopt( sys.argv[1:], "",
374 - [ "json_pmE",
375 - "json_pmF",
376 - "folder=",
377 - "database=",
378 - "output="
379 - ])
380 -except getopt.GetoptError as err:
381 - print(err)
382 - sys.exit(2)
383 -
384 -results_folder = "."
385 -extension = "all"
386 -outputf = ""
387 -for opt, arg in opts:
388 - if opt == "--biokop":
389 - extension = ".biok"
390 - parse = parse_biokop
391 - elif opt == "--folder":
392 - results_folder = arg
393 - elif opt == "--database":
394 - database = arg
395 - elif opt == "--output":
396 - outputf = arg
397 - else:
398 - extension = '.' + opt[2:]
399 - parse = parse_biorseo
400 -
401 -RNAcontainer, _ = load_from_dbn(database)
402 -
403 -if results_folder[-1] != '/':
404 - results_folder = results_folder + '/'
405 -if outputf == "":
406 - outputf = results_folder
407 -if outputf[-1] != '/':
408 - outputf = outputf + '/'
409 -
410 -def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution performs\nwell on obj1", ylabel="Best solution performs\n well on obj2"):
411 - points = []
412 - sizes = []
413 - for rna in RNAcontainer:
414 - # Extracting the predictions from the results file
415 - solutions = parse(results_folder, rna.basename_, ext)
416 - reference = SecStruct(rna.basename_, rna.struct_, rna.contacts_, float("inf"), float("inf"))
417 - if solutions is None:
418 - continue
419 - pset = Pareto(solutions, reference)
420 - points.append(pset.get_normalized_coords())
421 - sizes.append(pset.n_pred)
422 - print("[%s] Loaded %d solutions in a Pareto set, max(obj1)=%f, max(obj2)=%f" % (rna.basename_, pset.n_pred, pset.max_obj1, pset.max_obj2))
423 - print("Loaded %d points on %d." % (len(points), len(RNAcontainer)))
424 -
425 - x = np.array([ p[0] for p in points ])
426 - y = np.array([ p[1] for p in points ])
427 - xmin, xmax = 0, 1
428 - ymin, ymax = 0, 1
429 - xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
430 - positions = np.vstack([xx.ravel(), yy.ravel()])
431 - values = np.vstack([x, y])
432 - kernel = st.gaussian_kde(values)
433 - f = np.reshape(kernel(positions).T, xx.shape)
434 - ax[pos].axhline(y=0, alpha=0.2, color='black')
435 - ax[pos].axhline(y=1, alpha=0.2, color='black')
436 - ax[pos].axvline(x=0, alpha=0.2, color='black')
437 - ax[pos].axvline(x=1, alpha=0.2, color='black')
438 - ax[pos].contourf(xx, yy, f, cmap=cm.Blues, alpha=0.5)
439 - ax[pos].scatter(x, y, s=25, alpha=0.1)
440 - ax[pos].set_xlim((-0.1,1.1))
441 - ax[pos].set_ylim((-0.1,1.1))
442 - ax[pos].set_title(prettify_biorseo(ext[1:]), fontsize=10)
443 - ax[pos].annotate("("+str(len(points))+'/'+str(len(RNAcontainer))+" RNAs)", (0.08, 0.15))
444 - ax[pos].set_xlabel(xlabel)
445 - ax[pos].set_ylabel(ylabel)
446 -
447 - if nsolutions:
448 - ax[pos+1].hist(sizes, bins=range(0, max(sizes)+1, 2), histtype='bar')
449 - ax[pos+1].set_xlim((0,max(sizes)+2))
450 - ax[pos+1].set_xticks(range(0, max(sizes), 10))
451 - ax[pos+1].set_xticklabels(range(0, max(sizes), 10), rotation=90)
452 - ax[pos+1].set_xlabel("# solutions")
453 - ax[pos+1].set_ylabel("# RNAs")
454 -
455 -def process_extension_ctc(ax, pos, ext, nsolutions=False, xlabel="Best solution performs\nwell on obj1", ylabel="Best solution performs\n well on obj2"):
456 - points = []
457 - sizes = []
458 - for rna in RNAcontainer:
459 - # Extracting the predictions from the results file
460 - solutions = parse(results_folder, rna.basename_, ext)
461 - reference = SecStruct(rna.basename_, rna.struct_, rna.contacts_, float("inf"), float("inf"))
462 - if solutions is None:
463 - continue
464 - pset = Pareto(solutions, reference)
465 - points.append(pset.get_normalized_coords_ctc())
466 - sizes.append(pset.n_pred)
467 - print("[%s] Loaded %d solutions in a Pareto set, max(obj1)=%f, max(obj2)=%f" % (rna.basename_, pset.n_pred, pset.max_obj1, pset.max_obj2))
468 - print("Loaded %d points on %d." % (len(points), len(RNAcontainer)))
469 -
470 - x = np.array([ p[0] for p in points ])
471 - y = np.array([ p[1] for p in points ])
472 - xmin, xmax = 0, 1
473 - ymin, ymax = 0, 1
474 - xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
475 - positions = np.vstack([xx.ravel(), yy.ravel()])
476 - values = np.vstack([x, y])
477 - kernel = st.gaussian_kde(values)
478 - f = np.reshape(kernel(positions).T, xx.shape)
479 - ax[pos].axhline(y=0, alpha=0.2, color='black')
480 - ax[pos].axhline(y=1, alpha=0.2, color='black')
481 - ax[pos].axvline(x=0, alpha=0.2, color='black')
482 - ax[pos].axvline(x=1, alpha=0.2, color='black')
483 - ax[pos].contourf(xx, yy, f, cmap=cm.Blues, alpha=0.5)
484 - ax[pos].scatter(x, y, s=25, alpha=0.1)
485 - ax[pos].set_xlim((-0.1,1.1))
486 - ax[pos].set_ylim((-0.1,1.1))
487 - ax[pos].set_title(prettify_biorseo(ext[1:]), fontsize=10)
488 - ax[pos].annotate("("+str(len(points))+'/'+str(len(RNAcontainer))+" RNAs)", (0.08,0.15))
489 - ax[pos].set_xlabel(xlabel)
490 - ax[pos].set_ylabel(ylabel)
491 -
492 - if nsolutions:
493 - ax[pos+1].hist(sizes, bins=range(0, max(sizes)+1, 2), histtype='bar')
494 - ax[pos+1].set_xlim((0,max(sizes)+2))
495 - ax[pos+1].set_xticks(range(0, max(sizes), 10))
496 - ax[pos+1].set_xticklabels(range(0, max(sizes), 10), rotation=90)
497 - ax[pos+1].set_xlabel("# solutions")
498 - ax[pos+1].set_ylabel("# RNAs")
499 -
500 -
501 -if extension == "all":
502 - parse = parse_biorseo
503 - fig, ax = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
504 - ax = ax.flatten()
505 - process_extension(ax, 0, ".json_pmF_MEA", xlabel="Normalized $f_{1E}$", ylabel="Normalized MEA")
506 - print("--------------------------------------------------------------------------------------------")
507 - process_extension_ctc(ax, 1, ".json_pmF_MEA", xlabel="Normalized $f_{1E}$", ylabel="Normalized MEA")
508 - print("--------------------------------------------------------------------------------------------")
509 -
510 - for a in ax:
511 - a.label_outer()
512 - plt.subplots_adjust(bottom=0.2, top=0.9, left=0.07, right=0.98, hspace=0.05, wspace=0.05)
513 - plt.savefig("pareto_visualizer_json_MEA_functionF.png")
514 -else:
515 - fig, ax = plt.subplots(2,1, figsize=(6,5))
516 - plt.subplots_adjust(bottom=0.12, top=0.9, left=0.15, right=0.9, hspace=0.4)
517 - if extension == ".biok":
518 - process_extension(ax, 0, extension, nsolutions=True, xlabel="Normalized MFE", ylabel="Normalized MFE")
519 - else:
520 - process_extension(ax, 0, extension, nsolutions=False)
521 - plt.savefig("pareto_visualizer_ext.png")
...\ No newline at end of file ...\ No newline at end of file