Louis BECQUEY

cleaned scripts/ and figures/ folders

What are these RNA data files?
===============================
## Raw (big) databases
* RNA-Strand 2.0 (secondary_structures_database.dbn) : this file is a dataset supposed to be identical to RNA-Strand 2.0 (actually the file is present on IBISC machines for years now and nobody remembers how it was built). The former RNA Strand website is not online anymore (http://rnasoft.ca/strand).
* bpRNA-1m_90 : this huge database gathers the data from other databases (CRW, PDB, Rfam, RNP, SPR, SRP, ...) and supersedes RNA-Strand (minus the structures that are only in NDB, sadly). Sequences have been prefiltered to have no more than 90% identity. Source : http://bprna.cgrb.oregonstate.edu/
* Pseudobase(++) : A database of biologically validated pseudoknots, from the time discovering a pseudoknot was something unusual. Pseudobase stays famous for its pseudoknot classification scheme. I scraped it myself to build the file. Source : https://www.ekevanbatenburg.nl/PKBASE/PKB.HTML
## Filtered databases
* verified_secondary_structures.dbn : The subset of RNA-Strand that was experimentally validated (basically, the ones for which a 3D structure was available, so the ones from NDB and PDB).
* The _short.dbn ones : Same as their parent file, but filtered using the filter.py script.
* pseudoknots.dbn : Audrey Legendre's scrape of Pseudobase, which, for an unknown reason, does not contain all the available data, but has nice descriptions of what the RNAs are.
## Small test databases
* RNA-MoIP dataset : The cherry-picked cases presented in Reinhartz et al. 2012 to show RNA-MoIP's performance.
* applications.dbn : My cherry-picked cases presented in Becquey et al. 2020 to show Biorseo's performance.
* example.dbn : an example database with only one RNA, for testing purposes
* nothing.dbn : an example database with no RNAs, for testing purposes
Enjoy benchmarking RNA structure prediction tools.
\ No newline at end of file

99.5 KB | W: | H:

156 KB | W: | H:

  • 2-up
  • Swipe
  • Onion skin

32.4 KB | W: | H:

30.4 KB | W: | H:

  • 2-up
  • Swipe
  • Onion skin
......@@ -158,7 +158,6 @@ def is_canonical_nts(seq):
return False
return True
def is_canonical_bps(struct):
if "()" in struct:
return False
......@@ -207,7 +206,6 @@ def load_from_dbn(file, header_style=3):
db.close()
return container, pkcounter
def parse_biokop(folder, basename, ext=".biok"):
solutions = []
err = 0
......@@ -248,7 +246,6 @@ def parse_biokop(folder, basename, ext=".biok"):
err = 1
return None, err
def parse_biorseo(folder, basename, ext):
solutions = []
err = 0
......@@ -272,21 +269,14 @@ def parse_biorseo(folder, basename, ext):
err = 1
return None, err
def prettify_biorseo(code):
name = ""
if "bgsu" in code:
name += "RNA 3D Motif Atlas + "
if "json" in code:
name += "JSON motifs + "
elif "rin" in code:
name += "CaRNAval + "
else:
name += "Rna3Dmotifs + "
if "raw" in code:
name += "Direct P.M."
if "byp" in code:
name += "BPairing"
if "jar3d" in code:
name += "Jar3d"
# name += " + $f_{1" + code[-1] + "}$"
return name
......@@ -342,14 +332,9 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf
if __name__ == "__main__":
try:
opts, args = getopt.getopt( sys.argv[1:], "",
[ "biorseo_desc_byp_A", "biorseo_desc_byp_B",
"biorseo_desc_byp_C", "biorseo_desc_byp_D",
"biorseo_bgsu_byp_A", "biorseo_bgsu_byp_B",
"biorseo_bgsu_byp_C", "biorseo_bgsu_byp_D",
"biorseo_desc_raw_A", "biorseo_desc_raw_B",
"biorseo_bgsu_jar3d_A", "biorseo_bgsu_jar3d_B",
"biorseo_bgsu_jar3d_C", "biorseo_bgsu_jar3d_D",
"biorseo_rin_raw_A", "biorseo_rin_raw_B",
[ "biorseo_desc_A", "biorseo_desc_B",
"biorseo_rin_A", "biorseo_rin_B",
"biorseo_json_A", "biorseo_json_B",
"biokop", "folder=", "database=", "output="
])
except getopt.GetoptError as err:
......@@ -384,36 +369,19 @@ if __name__ == "__main__":
if extension == "all":
parse = parse_biorseo
fig, ax = plt.subplots(4,5,figsize=(12,10), sharex=True, sharey=True)
fig, ax = plt.subplots(2,3,figsize=(8,10), sharex=True, sharey=True)
ax = ax.flatten()
process_extension(ax, 0, ".biorseo_desc_raw_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
process_extension(ax, 1, ".biorseo_rin_raw_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
process_extension(ax, 2, ".biorseo_desc_byp_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
process_extension(ax, 3, ".biorseo_bgsu_byp_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
process_extension(ax, 4, ".biorseo_bgsu_jar3d_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
ax[0].set_title(prettify_biorseo("biorseo_desc_raw_A"), fontsize=10)
ax[1].set_title(prettify_biorseo("biorseo_rin_raw_A"), fontsize=10)
ax[2].set_title(prettify_biorseo("biorseo_desc_byp_A"), fontsize=10)
ax[3].set_title(prettify_biorseo("biorseo_bgsu_byp_A"), fontsize=10)
ax[4].set_title(prettify_biorseo("biorseo_bgsu_jar3d_A"), fontsize=10)
process_extension(ax, 5, ".biorseo_desc_raw_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 6, ".biorseo_rin_raw_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 7, ".biorseo_desc_byp_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 8, ".biorseo_bgsu_byp_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 9, ".biorseo_bgsu_jar3d_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 12, ".biorseo_desc_byp_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA")
process_extension(ax, 13, ".biorseo_bgsu_byp_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA")
process_extension(ax, 14, ".biorseo_bgsu_jar3d_C", ylabel="Normalized $f_{1C}$", xlabel="Normalized MEA")
ax[10].axis("off")
ax[11].axis("off")
process_extension(ax, 17, ".biorseo_desc_byp_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA")
process_extension(ax, 18, ".biorseo_bgsu_byp_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA")
process_extension(ax, 19, ".biorseo_bgsu_jar3d_D", ylabel="Normalized $f_{1D}$", xlabel="Normalized MEA")
ax[15].axis("off")
ax[16].axis("off")
process_extension(ax, 0, ".biorseo_desc_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
process_extension(ax, 1, ".biorseo_rin_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
process_extension(ax, 2, ".biorseo_json_A", ylabel="Normalized $f_{1A}$", xlabel="Normalized MEA")
ax[0].set_title(prettify_biorseo("biorseo_desc_A"), fontsize=10)
ax[1].set_title(prettify_biorseo("biorseo_rin_A"), fontsize=10)
ax[2].set_title(prettify_biorseo("biorseo_json_A"), fontsize=10)
process_extension(ax, 3, ".biorseo_desc_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 4, ".biorseo_rin_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
process_extension(ax, 5, ".biorseo_json_B", ylabel="Normalized $f_{1B}$", xlabel="Normalized MEA")
for a in ax:
a.label_outer()
plt.subplots_adjust(bottom=0.05, top=0.95, left=0.07, right=0.98, hspace=0.1, wspace = 0.05)
......
#!/usr/bin/python3
# Created by Louis Becquey, louis.becquey@univ-evry.fr, Oct 2019
# This script processes files containing RNA structures obtained from bi-objective
# optimization programs, and a dot-bracket database of reference structures, to plot
# where the best solutions lie in the Pareto set.
#
# The result files should follow this kind of format:
# for Biokop: (option --biokop)
# Structure Free energy score Expected accuracy score
# (((...(((...)))))) <tab> obj1_value <tab> obj2_value
# (((............))) <tab> obj1_value <tab> obj2_value
# ((((((...)))...))) <tab> obj1_value <tab> obj2_value
# ...
#
# for BiORSEO: (options --biorseo_**stuff**)
# >Header of the sequence
# GGCACAGAGUUAUGUGCC
# (((...(((...)))))) + Motif1 + Motif2 <tab> obj1_value <tab> obj2_value
# (((............))) <tab> obj1_value <tab> obj2_value
# ((((((...)))...))) + Motif1 <tab> obj1_value <tab> obj2_value
#
# typical Biokop usage:
# python3 pareto_visualizer.py --biokop --folder path/to/your/results/folder --database path/to/the/database_file.dbn
# typical Biorseo usage:
# python3 pareto_visualizer_json.py --folder path/to/your/results/folder (pmE and pmF results) --database path/to/the/database_file.dbn (name, sequence, structure, contacts)
#
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import scipy.stats as st
import sys
import os
import subprocess
import getopt
class SecStruct:
    """A secondary structure with its contacts string and two objective values.

    Attributes:
        name: identifier of the RNA this structure belongs to.
        dbn: the dot-bracket string.
        ctc: the contacts string, compared character by character ('*' / '.')
            in get_MCC_ctc_with().
        objectives: two-element list [obj1_value, obj2_value].
        basepair_list: list of (closing_index, opening_index) tuples.
        length: len(dot_bracket).
    """

    # Opening symbol -> matching closing symbol, one independent stack per family.
    _PAIR_SYMBOLS = {'(': ')', '[': ']', '{': '}', '<': '>', 'A': 'a', 'B': 'b'}

    def __init__(self, name, dot_bracket, contacts, obj1_value, obj2_value):
        self.name = name
        self.dbn = dot_bracket
        self.ctc = contacts
        self.objectives = [obj1_value, obj2_value]
        self.basepair_list = self.get_basepairs()
        self.length = len(dot_bracket)

    def get_basepairs(self):
        """Return the base pairs of self.dbn as (closing_index, opening_index) tuples.

        Pairs are listed in the order their closing symbol is met. Any character
        that is neither an opening nor a closing symbol is ignored.

        Raises:
            IndexError: if a closing symbol has no matching opening symbol.
        """
        open_positions = {closing: [] for closing in self._PAIR_SYMBOLS.values()}
        basepairs = []
        for i, c in enumerate(self.dbn):
            if c in self._PAIR_SYMBOLS:
                open_positions[self._PAIR_SYMBOLS[c]].append(i)
            elif c in open_positions:
                basepairs.append((i, open_positions[c].pop()))
        return basepairs

    def get_MCC_with(self, reference_structure):
        """Return the Matthews correlation coefficient of the base pairs against a reference.

        True negatives are derived from the L*(L-1)/2 position pairs of the
        reference (L = reference length) minus the three other counts.

        Returns 0.0 when the coefficient is undefined (zero denominator), e.g.
        when this structure or the reference has no base pair at all. The
        previous code divided by zero in that situation.
        """
        predicted = set(self.basepair_list)
        expected = set(reference_structure.basepair_list)
        tp = len(expected & predicted)
        fn = len(expected - predicted)
        fp = len(predicted - expected)
        tn = reference_structure.length * (reference_structure.length - 1) * 0.5 - fp - fn - tp

        # Compute MCC
        if (tp + fp == 0):
            print("We have an issue : no positives detected ! (linear structure)")
        denominator = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
        if denominator <= 0:
            # Undefined coefficient: use the conventional value 0 instead of crashing.
            return 0.0
        return (tp * tn - fp * fn) / sqrt(denominator)

    def get_MCC_ctc_with(self, reference_structure):
        """Return the Matthews correlation coefficient of the contacts strings.

        Positions are compared one by one over the length of the reference
        contacts string; only the characters '*' and '.' are counted.

        Returns:
            The coefficient, or None (after printing a warning) when the
            denominator is zero.

        Raises:
            IndexError: if self.ctc is shorter than the reference contacts string.
        """
        tp = 0
        fp = 0
        tn = 0
        fn = 0
        prediction = self.ctc
        true_ctc = reference_structure.ctc
        for i, truth in enumerate(true_ctc):
            guess = prediction[i]
            if truth == '*' and guess == '*':
                tp += 1
            elif truth == '.' and guess == '.':
                tn += 1
            elif truth == '.' and guess == '*':
                fp += 1
            elif truth == '*' and guess == '.':
                fn += 1

        # The zero check must come BEFORE the division, otherwise it is never reached.
        denominator = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
        if denominator == 0:
            print("warning: division by zero!")
            return None
        return (tp * tn - fp * fn) / sqrt(denominator)
class Pareto:
    """A Pareto set of predicted structures compared against one reference.

    On construction, records the number of predictions, the maximum of each
    objective over the set, and the index of the prediction with the highest
    MCC against the reference, once on base pairs (index_of_best) and once on
    contacts (index_of_best_ctc).
    """

    def __init__(self, list_of_structs, reference):
        self.predictions = list_of_structs
        self.true_structure = reference
        self.n_pred = len(list_of_structs)
        self.max_obj1 = max(s.objectives[0] for s in self.predictions)
        self.max_obj2 = max(s.objectives[1] for s in self.predictions)
        self.index_of_best = self.find_best_solution()
        self.index_of_best_ctc = self.find_best_solution_ctc()

    def find_best_solution(self):
        """Return the index of the prediction with the highest base-pair MCC.

        Returns -1 when no prediction scores strictly above -1. The best score
        is printed once the whole set has been scanned.
        """
        best_index, best_score = -1, -1
        for index, candidate in enumerate(self.predictions):
            score = candidate.get_MCC_with(self.true_structure)
            if score > best_score:
                best_index, best_score = index, score
        print("\n" + "max mcc str: " + str(best_score))
        return best_index

    def find_best_solution_ctc(self):
        """Return the index of the prediction with the highest contacts MCC.

        Predictions whose contacts MCC is None are skipped. Returns -1 when no
        prediction scores strictly above -1.
        """
        best_index, best_score = -1, -1
        for index, candidate in enumerate(self.predictions):
            score = candidate.get_MCC_ctc_with(self.true_structure)
            if score is not None and score > best_score:
                best_index, best_score = index, score
        return best_index

    def _normalized_objectives(self, index):
        """Return the objectives of predictions[index], each divided by the set maximum."""
        coords = self.predictions[index].objectives
        # A falsy maximum (0) would divide by zero: fall back to the middle of the axis.
        x = coords[0] / self.max_obj1 if self.max_obj1 else 0.5
        y = coords[1] / self.max_obj2 if self.max_obj2 else 0.5
        return (x, y)

    def get_normalized_coords(self):
        """Return the normalized (obj1, obj2) of the best prediction on base pairs."""
        return self._normalized_objectives(self.index_of_best)

    def get_normalized_coords_ctc(self):
        """Return the normalized (obj1, obj2) of the best prediction on contacts."""
        return self._normalized_objectives(self.index_of_best_ctc)
class RNA:
    """Plain record holding one database entry.

    Stores the basename used to locate result files, the header line, the
    sequence, the reference structure and the reference contacts string.
    """

    def __init__(self, filename, header, seq, struct, contacts):
        self.basename_ = filename
        self.header_ = header
        self.seq_, self.struct_, self.contacts_ = seq, struct, contacts
# For each non-canonical character, the number of sequences rejected because of it
# (only the first offending character of a sequence is counted).
ignored_nt_dict = {}


def is_canonical_nts(seq):
    """Return True if seq contains only the nucleotides A, C, G and U.

    A single trailing newline is tolerated and ignored, so both raw lines and
    already-stripped sequences are accepted. Every remaining character is
    checked. The previous code skipped the last character unconditionally, so
    the final nucleotide of an already-stripped sequence was never validated.

    The first offending character found is tallied in ignored_nt_dict.
    """
    if seq.endswith("\n"):
        seq = seq[:-1]
    for c in seq:
        if c not in "ACGU":
            ignored_nt_dict[c] = ignored_nt_dict.get(c, 0) + 1
            return False
    return True
def is_canonical_bps(struct):
    """Return False if struct contains a hairpin enclosing fewer than three dots.

    The patterns rejected are "()", "(.)", "(..)" and their square-bracket
    counterparts; any other string is accepted.
    """
    too_short_hairpins = ("()", "(.)", "(..)", "[]", "[.]", "[..]")
    return not any(motif in struct for motif in too_short_hairpins)
def load_from_dbn(file, header_style=1):
    """Load the RNAs of a four-lines-per-record database file.

    Each record is made of a header line, a sequence line (upper-cased on
    load), a structure line and a contacts line. A record is kept only if its
    sequence passes is_canonical_nts() and its structure passes
    is_canonical_bps().

    Args:
        file: path of the database file.
        header_style: how the RNA basename is cut out of the header, after
            replacing '/' by '_'. 1 keeps what follows the last '(' minus the
            final character; 2 keeps what follows the last '[' minus the final
            41 characters. With any other value no RNA is stored.

    Returns:
        (container, counter): the list of RNA objects, and the number of
        accepted records whose structure contains a '['.
    """
    container = []
    counter = 0
    header = ""
    seq = ""
    struct = ""
    # The context manager closes the file even if a record fails to parse.
    with open(file, "r") as db:
        for line_number, raw in enumerate(db, start=1):
            # Strip only the newline, so a last line without one keeps its final character.
            line = raw.rstrip("\n")
            position = line_number % 4
            if position == 1:
                header = line
            elif position == 2:
                seq = line.upper()
            elif position == 3:
                struct = line
            else:
                # Fourth line of the record: the entry is complete.
                contacts = line
                if is_canonical_nts(seq) and is_canonical_bps(struct):
                    if header_style == 1:
                        container.append(RNA(header.replace('/', '_').split('(')[-1][:-1], header, seq, struct, contacts))
                    if header_style == 2:
                        container.append(RNA(header.replace('/', '_').split('[')[-1][:-41], header, seq, struct, contacts))
                    if '[' in struct:
                        counter += 1
    return container, counter
def parse_biokop(folder, basename, ext=".biok"):
    """Parse a Biokop result file into a list of SecStruct.

    The first line of the file is skipped, as are blank lines. Every other line
    is split on tabs into a structure and two objective values. Both values are
    negated (see the comment in the loop), and the second one is then shifted
    so that its minimum over the file is 0.

    Args:
        folder: directory containing the result file.
        basename: file name without extension, also used as the structure name.
        ext: file extension, including the leading dot.

    Returns:
        The list of solutions if the file holds more than one distinct
        structure; otherwise None, after printing a warning (also when the file
        does not exist).
    """
    path = os.path.join(folder, basename + ext)
    if not os.path.isfile(path):
        print("[%s] \033[36mWARNING: file not found !\033[0m" % (basename))
        return None

    with open(path, "r") as rna:
        lines = rna.readlines()

    solutions = []
    different_2ds = []
    for s in lines[1:]:
        if s == '\n':
            continue
        splitted = s.split('\t')
        db2d = splitted[0]
        if db2d not in different_2ds:
            different_2ds.append(db2d)
        # here is a negative sign because Biokop actually minimizes -MEA instead
        # of maximizing MEA : we switch back to MEA
        # SecStruct requires a contacts string, which these files do not carry.
        # NOTE(review): an all-'.' placeholder of the same length is assumed to
        # mean "no contact predicted" - confirm this is the wanted convention.
        # float() ignores the trailing newline by itself, so no slicing is needed
        # (slicing would eat a digit on a last line without newline).
        solutions.append(SecStruct(basename, db2d, '.' * len(db2d), -float(splitted[1]), -float(splitted[2])))

    if solutions:
        # normalize so the minimum MEA of the set is 0
        min_mea = min(s.objectives[1] for s in solutions)
        for s in solutions:
            s.objectives[1] -= min_mea

    if len(different_2ds) > 1:
        return solutions
    print("[%s] \033[36mWARNING: ignoring this RNA, only one 2D solution is found.\033[0m" % (basename))
    return None
# Parse one BiORSEO result file (folder/basename+ext) into a list of SecStruct.
# Returns the list of solutions when more than one distinct structure string was
# collected; in every other case a warning is printed and the result is None.
# NOTE(review): the indentation of this block was lost, so the exact nesting of
# the conditionals below must be confirmed against the original script.
def parse_biorseo(folder, basename, ext):
solutions = []
# Trace which result file is being looked up.
print(basename + ext)
if os.path.isfile(os.path.join(folder, basename + ext)):
rna = open(os.path.join(folder, basename + ext), "r")
lines = rna.readlines()
rna.close()
different_2ds = []
contacts = []
str2d = []
count = 0;
# The first two lines are skipped (presumably the header and the sequence).
for s in lines[2:]:
# count is advanced before the blank-line test, so blank lines are counted too.
count = count + 1
if s == '\n':
continue
splitted = s.split('\t')
# Objective values are read on odd-numbered lines.
if(count % 2 == 1):
obj1 = float(splitted[1])
# [:-1] drops the last character of the field (presumably the newline).
obj2 = float(splitted[2][:-1])
# Keep the first space-separated token of the first tab-separated field.
db2d = splitted[0].split(' ')[0]
if db2d not in different_2ds:
# Only strings containing '(' are recorded as distinct 2D structures.
if(s.find('(') != -1):
different_2ds.append(db2d)
# A line containing '*' is taken as a contacts string: it is stored together with
# the last remembered structure (str2d) and the last objective values read.
if(s.find('*') != -1):
contacts = db2d
solutions.append(SecStruct(basename, str2d, contacts, obj1, obj2))
# Otherwise a line containing '(' becomes the remembered structure.
elif(s.find('(') != -1):
str2d = db2d
if len(different_2ds) > 1:
return solutions
else:
print("[%s] \033[36mWARNING: ignoring this RNA, only one 2D or contacts solution is found.\033[0m" % (basename))
else:
print("[%s] \033[36mWARNING: file not found !\033[0m" % (basename))
return None
def prettify_biorseo(code):
    """Build a human-readable plot title from a result-file extension code.

    The title starts with a motif-source label chosen from the first match of
    "bgsu", then "json", else a default. A label is then appended for each of
    "raw", "byp" and "jar3d" found in code, in that order.
    """
    if "bgsu" in code:
        pieces = ["RNA 3D Motif Atlas + "]
    elif "json" in code:
        pieces = ["Motifs d'Isaure + Direct P.M"]
    else:
        pieces = ["Rna3Dmotifs + "]

    placement_labels = (("raw", "Direct P.M."), ("byp", "BPairing"), ("jar3d", "Jar3d"))
    pieces.extend(label for tag, label in placement_labels if tag in code)
    return "".join(pieces)
# Parse options
# NOTE(review): "biokop" is not registered in the long-option list below, so
# getopt rejects --biokop before the dedicated branch of the loop can run.
# Confirm whether Biokop support is still wanted in this script.
try:
    opts, args = getopt.getopt(sys.argv[1:], "",
                               ["json_pmE",
                                "json_pmF",
                                "folder=",
                                "database=",
                                "output="
                                ])
except getopt.GetoptError as err:
    print(err)
    sys.exit(2)

results_folder = "."
extension = "all"
outputf = ""
database = None
for opt, arg in opts:
    if opt == "--biokop":
        extension = ".biok"
        parse = parse_biokop
    elif opt == "--folder":
        results_folder = arg
    elif opt == "--database":
        database = arg
    elif opt == "--output":
        outputf = arg
    else:
        # Any other accepted flag names the result-file extension: --xyz -> ".xyz"
        extension = '.' + opt[2:]
        parse = parse_biorseo

# Without this check a missing --database ended in a NameError further down.
if database is None:
    print("option --database is required")
    sys.exit(2)

RNAcontainer, _ = load_from_dbn(database)

# Folder paths are used with a trailing slash from here on.
if results_folder[-1] != '/':
    results_folder = results_folder + '/'
# The output folder defaults to the results folder.
if outputf == "":
    outputf = results_folder
if outputf[-1] != '/':
    outputf = outputf + '/'
def _plot_best_solutions(ax, pos, ext, nsolutions, xlabel, ylabel, get_coords):
    """Shared implementation of process_extension() and process_extension_ctc().

    For every RNA of RNAcontainer whose result file can be parsed, builds the
    Pareto set and collects get_coords(pareto_set) together with the set size.
    The points are drawn on ax[pos] over a Gaussian kernel density estimate.
    When nsolutions is true, a histogram of the set sizes goes on ax[pos+1].
    """
    points = []
    sizes = []
    for rna in RNAcontainer:
        # Extracting the predictions from the results file
        solutions = parse(results_folder, rna.basename_, ext)
        if solutions is None:
            continue
        reference = SecStruct(rna.basename_, rna.struct_, rna.contacts_, float("inf"), float("inf"))
        pset = Pareto(solutions, reference)
        points.append(get_coords(pset))
        sizes.append(pset.n_pred)
        print("[%s] Loaded %d solutions in a Pareto set, max(obj1)=%f, max(obj2)=%f" % (rna.basename_, pset.n_pred, pset.max_obj1, pset.max_obj2))
    print("Loaded %d points on %d." % (len(points), len(RNAcontainer)))

    x = np.array([p[0] for p in points])
    y = np.array([p[1] for p in points])
    # Evaluate the density on a 100x100 grid covering the unit square.
    xx, yy = np.mgrid[0:1:100j, 0:1:100j]
    positions = np.vstack([xx.ravel(), yy.ravel()])
    try:
        kernel = st.gaussian_kde(np.vstack([x, y]))
        density = np.reshape(kernel(positions).T, xx.shape)
    except (ValueError, np.linalg.LinAlgError):
        # The estimate needs several points that are not all aligned; with too
        # few of them, keep the scatter plot and leave the density out.
        density = None
        print("[%s] \033[36mWARNING: not enough points to estimate a density.\033[0m" % (ext))

    ax[pos].axhline(y=0, alpha=0.2, color='black')
    ax[pos].axhline(y=1, alpha=0.2, color='black')
    ax[pos].axvline(x=0, alpha=0.2, color='black')
    ax[pos].axvline(x=1, alpha=0.2, color='black')
    if density is not None:
        ax[pos].contourf(xx, yy, density, cmap=cm.Blues, alpha=0.5)
    ax[pos].scatter(x, y, s=25, alpha=0.1)
    ax[pos].set_xlim((-0.1, 1.1))
    ax[pos].set_ylim((-0.1, 1.1))
    ax[pos].set_title(prettify_biorseo(ext[1:]), fontsize=10)
    ax[pos].annotate("(" + str(len(points)) + '/' + str(len(RNAcontainer)) + " RNAs)", (0.08, 0.15))
    ax[pos].set_xlabel(xlabel)
    ax[pos].set_ylabel(ylabel)

    # max() of an empty list raises: skip the histogram when nothing was loaded.
    if nsolutions and sizes:
        largest = max(sizes)
        ax[pos+1].hist(sizes, bins=range(0, largest + 1, 2), histtype='bar')
        ax[pos+1].set_xlim((0, largest + 2))
        ax[pos+1].set_xticks(range(0, largest, 10))
        ax[pos+1].set_xticklabels(range(0, largest, 10), rotation=90)
        ax[pos+1].set_xlabel("# solutions")
        ax[pos+1].set_ylabel("# RNAs")


def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution performs\nwell on obj1", ylabel="Best solution performs\n well on obj2"):
    """Plot, for one result-file extension, where the best base-pair solution lies.

    Args:
        ax: indexable collection of matplotlib axes.
        pos: index in ax of the axes receiving the plot.
        ext: extension of the result files, including the leading dot.
        nsolutions: when true, also draw the set-size histogram on ax[pos+1].
        xlabel, ylabel: axis labels of the plot.
    """
    _plot_best_solutions(ax, pos, ext, nsolutions, xlabel, ylabel,
                         lambda pset: pset.get_normalized_coords())


def process_extension_ctc(ax, pos, ext, nsolutions=False, xlabel="Best solution performs\nwell on obj1", ylabel="Best solution performs\n well on obj2"):
    """Same as process_extension(), for the best solution with respect to contacts."""
    _plot_best_solutions(ax, pos, ext, nsolutions, xlabel, ylabel,
                         lambda pset: pset.get_normalized_coords_ctc())
# Build and save the figure.
# NOTE(review): outputf is computed by the option parsing but is not used here;
# both figures are written to the current directory. Confirm whether --output
# is meant to control the destination.
if extension == "all":
    # No extension option given: compare the best solution on structure (left)
    # and on contacts (right) for the same result files.
    parse = parse_biorseo
    fig, ax = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
    ax = ax.flatten()
    process_extension(ax, 0, ".json_pmF_MEA", xlabel="Normalized $f_{1E}$", ylabel="Normalized MEA")
    print("--------------------------------------------------------------------------------------------")
    process_extension_ctc(ax, 1, ".json_pmF_MEA", xlabel="Normalized $f_{1E}$", ylabel="Normalized MEA")
    print("--------------------------------------------------------------------------------------------")
    for a in ax:
        a.label_outer()
    plt.subplots_adjust(bottom=0.2, top=0.9, left=0.07, right=0.98, hspace=0.05, wspace=0.05)
    plt.savefig("pareto_visualizer_json_MEA_functionF.png")
else:
    fig, ax = plt.subplots(2, 1, figsize=(6, 5))
    plt.subplots_adjust(bottom=0.12, top=0.9, left=0.15, right=0.9, hspace=0.4)
    if extension == ".biok":
        # The y coordinate is the second objective, which parse_biokop treats as
        # the MEA: the y label used to read "Normalized MFE" like the x label.
        process_extension(ax, 0, extension, nsolutions=True, xlabel="Normalized MFE", ylabel="Normalized MEA")
    else:
        process_extension(ax, 0, extension, nsolutions=False)
    plt.savefig("pareto_visualizer_ext.png")
\ No newline at end of file