Louis BECQUEY

Big cleanup for JSON format support

import time
import subprocess
import os
import os.path
from math import sqrt, ceil
import numpy as np
import matplotlib.pyplot as plt
log_path = "test.log"
log = open(log_path, 'a')
def run_test(cmd, log):
log.write(time.asctime(time.localtime(time.time())) + " : Run process \"" + cmd + "\"\n")
log.flush()
process = subprocess.Popen(cmd.split(' ') ,shell=False,stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
# Poll process.stdout to show stdout live
while process.poll() is None:
output = process.stdout.readline()
if output:
log.write(output.decode())
log.flush()
rc = process.poll()
#create the command line to run BiORSEO with the modules library of CaRNAval
def create_command_rin(path, name, function, estimator):
cmd = ("python3 " + path + "/biorseo.py -i " +
path + "/data/fasta/" +
name + ".fa " +
"-O results/ " +
"--carnaval " +
"--patternmatch " +
"--func " + function + " --" + estimator + " -v " +
" --biorseo-dir " + path + " " +
"--modules-path " + path + "/data/modules/RIN/Subfiles")
return cmd
#create the command line to run BiORSEO with the modules library of RNA3dMotifs Atlas
def create_command_bgsu(path, name, function, estimator):
cmd = ("python3 " + path + "/biorseo.py -i " +
path + "/data/fasta/" +
name + ".fa " +
"-O results/ " +
"--3dmotifatlas " +
"--jar3d " +
"--func " + function + " --" + estimator + " -v " +
"--jar3d-exec " + " /local/local/localopt/jar3d_2014-12-11.jar" +
" --biorseo-dir " + path + " " +
"--modules-path " + path + "/data/modules/BGSU")
return cmd
#create the command line to run BiORSEO with the modules library of RNA3dMotifs
def create_command_desc(path, name, function, estimator):
cmd = ("python3 " + path + "/biorseo.py -i " +
path + "/data/fasta/" +
name + ".fa " +
"-O results/ " +
"--rna3dmotifs " +
"--patternmatch " +
"--func " + function + " --" + estimator + " -v " +
" --biorseo-dir " + path + " " +
"--modules-path " + path + "/data/modules/DESC")
return cmd
#create the command line to run BiORSEO with the motifs library of Isaure in .json
def create_command_isaure(path, name, function, estimator):
cmd = ("python3 " + path + "/biorseo.py -i " +
path + "/data/fasta/" +
name + ".fa " +
"-O results/ " +
"--contacts " +
"--patternmatch " +
"--func " + function + " --" + estimator +
" --biorseo-dir " + path + " " +
"--modules-path " + path + "/data/modules/ISAURE/bibliotheque_a_lire")
return cmd
#execute the command line correspondin to the information put in the argument.
def execute_command(path, function, estimator, true_ctc, true_str, list_ctc, list_str, modules):
if (modules == 'desc'):
cmd = create_command_desc(path, name, function, estimator)
elif (modules == 'bgsu'):
cmd = create_command_bgsu(path, name, function, estimator)
elif (modules == 'rin'):
cmd = create_command_rin(path, name, function, estimator)
elif (modules == 'isaure'):
cmd = create_command_isaure(path, name, function, estimator)
os.system(cmd)
"""file_path = "results/test_" + name + ".json_pm" + function + "_" + estimator
if os.path.isfile(file_path):
tab = write_mcc_in_file(name, true_ctc, true_str, estimator, function)
list_ctc.append(tab[0])
list_str.append(tab[1])"""
#Retrieves the list of structures predicted by Biorseo for each sequence in the benchmark.txt file
def get_list_str_by_seq(name, estimator, function, list_str, true_str, modules):
if modules == 'bgsu':
extension = ".jar3d"
elif modules == 'desc':
extension = ".desc_pm"
elif modules == 'rin':
extension = ".rin_pm"
elif modules == 'json':
extension = ".json_pm"
file_path = "results/test_" + name + extension + function + "_" + estimator
if os.path.isfile(file_path):
path_benchmark = "data/modules/ISAURE/benchmark.txt"
max_mcc = get_mcc_structs_max(path_benchmark, name, estimator, function, extension, modules, true_str)
list_str.append(max_mcc)
# ================== Code from Louis Becquey Benchark.py ==============================
def dbn_to_basepairs(structure):
parenthesis = []
brackets = []
braces = []
rafters = []
basepairs = []
As = []
Bs = []
for i, c in enumerate(structure):
if c == '(':
parenthesis.append(i)
if c == '[':
brackets.append(i)
if c == '{':
braces.append(i)
if c == '<':
rafters.append(i)
if c == 'A':
As.append(i)
if c == 'B':
Bs.append(i)
if c == '.':
continue
if c == ')':
basepairs.append((i, parenthesis.pop()))
if c == ']':
basepairs.append((i, brackets.pop()))
if c == '}':
basepairs.append((i, braces.pop()))
if c == '>':
basepairs.append((i, rafters.pop()))
if c == 'a':
basepairs.append((i, As.pop()))
if c == 'b':
basepairs.append((i, Bs.pop()))
return basepairs
def compare_two_contacts(true_ctc, prediction):
tp = 0
fp = 0
tn = 0
fn = 0
for i in range(len(true_ctc)):
if true_ctc[i] == '*' and prediction[i] == '*':
tp += 1
elif true_ctc[i] == '.' and prediction[i] == '.':
tn += 1
elif true_ctc[i] == '.' and prediction[i] == '*':
fp += 1
elif true_ctc[i] == '*' and prediction[i] == '.':
fn += 1
return [tp, tn, fp, fn]
def compare_two_structures(true2d, prediction):
true_basepairs = dbn_to_basepairs(true2d)
pred_basepairs = dbn_to_basepairs(prediction)
tp = 0
fp = 0
tn = 0
fn = 0
for bp in true_basepairs:
if bp in pred_basepairs:
tp += 1
else:
fn += 1
for bp in pred_basepairs:
if bp not in true_basepairs:
fp += 1
tn = len(true2d) * (len(true2d) - 1) * 0.5 - fp - fn - tp
return [tp, tn, fp, fn]
def mattews_corr_coeff(tp, tn, fp, fn):
if ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) == 0):
#print("warning: division by zero! no contact in the prediction")
#print("tp: " + str(tp) + " fp: " + str(fp) + " tn: " + str(tn) + " fn: " + str(fn))
return -1
elif (tp + fp == 0):
print("We have an issue : no positives detected ! (linear structure)")
return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
def f1_score(tp, tn, fp, fn):
return 2 * tp / (2 * tp + fp + fn)
def specificity(tp, tn, fp, fn):
return tn / (tn + fp)
# ================== Code from Louis Becquey Benchark.py ==============================
#Get the best MCC value for all prediction of the results file of the sequence in argument
def get_mcc_structs_max(path_benchmark, sequence_id, estimator, function, extension, modules, true_structure):
read_prd = open("results/test_" + sequence_id + extension + function + "_" + estimator, "r")
write = open("results/test_" + sequence_id + ".mcc_" + function + "_" + estimator + "_" + modules, "w")
max_mcc_str = -1;
title_exp = ">test_" + sequence_id + ": "
write.write(title_exp)
structure_exp = true_structure
write.write("structure 2d attendue:\n" + structure_exp + "\n")
title_prd = read_prd.readline()
structure_prd = read_prd.readline()
sequence_prd = structure_prd
while structure_prd:
structure_prd = read_prd.readline()
if (len(structure_prd) != 0):
write.write("\nstructure 2d predite:\n" + structure_prd[:len(sequence_prd)] + "\n")
mcc_tab = compare_two_structures(structure_exp, structure_prd[:len(sequence_prd)])
mcc_str = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
if (max_mcc_str < mcc_str):
max_mcc_str = mcc_str
write.write("mcc: " + str(mcc_str) + "\n")
contacts_prd = read_prd.readline()
write.write("max mcc 2D:" + str(max_mcc_str))
read_prd.close()
write.close()
return max_mcc_str
#Create a file that store the information concerning the MCC value obtains for each prediction of the
#sequence in input
def write_mcc_in_file(sequence_id, true_contacts, true_structure, estimator, function):
read_prd = open("results/test_" + sequence_id + ".json_pm" + function + "_" + estimator, "r")
write = open("results/test_" + sequence_id + ".mcc_" + function + "_" + estimator, "w")
max_mcc_str = -1;
max_mcc_ctc = -1;
title_exp = ">test_" + sequence_id + ": "
write.write(title_exp)
contacts_exp = true_contacts
structure_exp = true_structure
write.write("structure 2d attendue:\n" + structure_exp + "\n")
write.write("contacts attendus:\n" + contacts_exp + "\n" + len(structure_exp) * "-")
title_prd = read_prd.readline()
structure_prd = read_prd.readline()
sequence_prd = structure_prd
while structure_prd:
structure_prd = read_prd.readline()
if (len(structure_prd) != 0):
write.write("\nstructure 2d predite:\n" + structure_prd[:len(sequence_prd)] + "\n")
mcc_tab = compare_two_structures(structure_exp, structure_prd[:len(sequence_prd)])
mcc_str = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
if (max_mcc_str < mcc_str):
max_mcc_str = mcc_str
write.write("mcc: " + str(mcc_str) + "\n")
contacts_prd = read_prd.readline()
write.write("\ncontacts predits:\n" + contacts_prd)
if (len(contacts_prd) == len(contacts_exp)):
mcc_tab = compare_two_contacts(contacts_exp, contacts_prd)
mcc_ctc = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
if (max_mcc_ctc < mcc_ctc):
max_mcc_ctc = mcc_ctc
write.write("mcc: " + str(mcc_ctc) + "\n\n")
else:
write.write("mcc: no expected contacts sequence or not same length between expected and predicted\n\n")
write.write("max mcc 2D:" + str(max_mcc_str))
write.write("max mcc ctc:" + str(max_mcc_ctc))
read_prd.close()
write.close()
return [max_mcc_ctc, max_mcc_str]
def set_axis_style(ax, labels):
ax.xaxis.set_tick_params(direction='out')
ax.xaxis.set_ticks_position('bottom')
ax.set_xticks(np.arange(1, len(labels) + 1))
ax.set_xticklabels(labels)
ax.set_xlim(0.25, len(labels) + 0.75)
ax.set_xlabel('Sample name')
#Create one violin plot to see the distribution of the best MCC (of the base pairing)
def visualization_best_mcc_str(list_struct2d, estimator, function, modules, color, lines_color):
print(estimator + " + " + function + ": ")
np_struct2d = np.array(list_struct2d)
data_to_plot = np_struct2d
median_2d = np.median(np_struct2d)
print("mediane 2D: " + str(median_2d) + "\n")
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
labels = ['structure 2D']
ax.set_xticks(np.arange(1, len(labels) + 1))
ax.set_xticklabels(labels)
ax.set_xlabel(function)
ax.set_ylabel('MCC')
violins = ax.violinplot(data_to_plot, showmedians=True)
for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
vp = violins[partname]
vp.set_edgecolor(lines_color)
vp.set_linewidth(1)
for v in violins['bodies']:
v.set_facecolor(color)
plt.savefig('visualisation_28_09' + estimator + '_' + function + '_' + modules + '.png', bbox_inches='tight')
#Create 4 violin plot to see the distribution of the best MCC (of the base pairing)
def visualization_best_mcc_str_4_figures(list_struct2d, color, lines_color):
np_struct2d_1 = np.array(list_struct2d[0])
np_struct2d_2 = np.array(list_struct2d[1])
np_struct2d_3 = np.array(list_struct2d[2])
np_struct2d_4 = np.array(list_struct2d[3])
data_to_plot = [np_struct2d_1, np_struct2d_2, np_struct2d_3, np_struct2d_4]
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = fig.add_axes([0, 0, 1, 1])
labels = ['MEA + E', 'MEA + F', 'MFE + E', 'MFE + F']
ax.set_xticks(np.arange(1, len(labels) + 1))
ax.set_xticklabels(labels)
#ax.set_xlim(0.25, len(labels) + 0.75)
ax.set_xlabel("motifs d'Isaure")
ax.set_ylabel('MCC (en fonction des appariements)')
violins = ax.violinplot(data_to_plot, showmedians=True)
for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
vp = violins[partname]
vp.set_edgecolor(lines_color)
vp.set_linewidth(1)
for v in violins['bodies']:
v.set_facecolor(color)
plt.savefig('visualisation_28_09_Isaure_E_F.png', dpi=200, bbox_inches='tight')
#Create 2 violin plot to see the distribution of the best MCC (one for the MCC of
# the base pairing and one for the contacts)
def visualization_best_mcc(list_struct2d, list_contacts, estimator, function, modules, color, lines_color):
print(estimator + " + " + function + ": ")
np_struct2d = np.array(list_struct2d)
np_contacts = np.array(list_contacts)
data_to_plot = [np_struct2d, np_contacts]
median_2d = np.median(np_struct2d)
median_ctc = np.median(np_contacts)
print("mediane 2D: " + str(median_2d) + "\n")
print("mediane ctc: " + str(median_ctc) + "\n")
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
labels = ['structure 2D', 'contacts']
ax.set_xticks(np.arange(1, len(labels) + 1))
ax.set_xticklabels(labels)
ax.set_xlabel(function)
ax.set_ylabel('MCC')
violins = ax.violinplot(data_to_plot, showmedians=True)
for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
vp = violins[partname]
vp.set_edgecolor(lines_color)
vp.set_linewidth(1)
for v in violins['bodies']:
v.set_facecolor(color)
plt.savefig('visualisation_16_06_' + estimator + '_' + function + '_' + modules + '.png', bbox_inches='tight')
#Return the list of names of all the sequence in the benchmark.txt, the list of structures and the list
# of contacts predicted in the result file of each sequence.
# This function is only use for the result in .json_pm format files
def get_list_structs_contacts_all(path_benchmark, estimator, function):
myfile = open(path_benchmark, "r")
list_name = []
complete_list_struct2d = []
complete_list_contacts = []
name = myfile.readline()
contacts = myfile.readline()
seq = myfile.readline()
structure2d = myfile.readline()
count = 0
while seq:
name = name[6:].strip()
count = count + 1
file_path = "results/test_" + name + ".json_pm" + function + "_" + estimator
if os.path.isfile(file_path):
file_result = open(file_path, "r")
list_struct2d = []
list_contacts = []
list_name.append(name)
title_prd = file_result.readline()
structure_prd = file_result.readline()
sequence = structure_prd
while structure_prd:
structure_prd = file_result.readline()
if (len(structure_prd) != 0):
mcc_tab = compare_two_structures(structure2d, structure_prd[:len(sequence)])
mcc_str = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
list_struct2d.append(mcc_str)
contacts_prd = file_result.readline()
if (len(contacts_prd) == len(contacts)):
mcc_tab = compare_two_contacts(contacts, contacts_prd)
mcc_ctc = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
list_contacts.append(mcc_ctc)
complete_list_struct2d.append(list_struct2d)
complete_list_contacts.append(list_contacts)
name = myfile.readline()
contacts = myfile.readline()
seq = myfile.readline()
structure2d = myfile.readline()
return [list_name, complete_list_struct2d, complete_list_contacts]
myfile.close()
#Return the list of names of all the sequence in the benchmark.txt, the list of structures
# predicted in the result file of each sequence.
def get_list_structs_all(path_benchmark, estimator, function, modules):
if modules == 'bgsu':
extension = ".jar3d"
elif modules == 'desc':
extension = ".desc_pm"
elif modules == 'rin':
extension = ".rin_pm"
elif modules == 'json':
extension = ".json_pm"
myfile = open(path_benchmark, "r")
list_name = []
complete_list_struct2d = []
complete_list_contacts = []
name = myfile.readline()
contacts = myfile.readline()
seq = myfile.readline()
structure2d = myfile.readline()
count = 0
while seq:
name = name[6:].strip()
count = count + 1
file_path = "results/test_" + name + extension + function + "_" + estimator
if os.path.isfile(file_path):
file_result = open(file_path, "r")
list_struct2d = []
list_name.append(name)
title_prd = file_result.readline()
structure_prd = file_result.readline()
sequence = structure_prd
while structure_prd:
structure_prd = file_result.readline()
if (len(structure_prd) != 0):
mcc_tab = compare_two_structures(structure2d, structure_prd[:len(sequence)])
mcc_str = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
list_struct2d.append(mcc_str)
contacts_prd = file_result.readline()
complete_list_struct2d.append(list_struct2d)
name = myfile.readline()
contacts = myfile.readline()
seq = myfile.readline()
structure2d = myfile.readline()
return [list_name, complete_list_struct2d]
myfile.close()
#Return the list in argument in two lists, each list containing half of the list in argument
def get_half(list):
first_half = []
second_half = []
if (len(list) % 2 == 0):
middle = len(list) / 2
else:
middle = len(list) / 2 + 0.5
for i in range (int(middle)):
first_half.append(list[i])
for i in range (int(middle)):
if i + int(middle) < len(list):
second_half.append(list[i + int(middle)])
return [first_half, second_half]
#Create a boxplot with all the MCC (for the base pairing) obtains with all the prediction for each sequence,
# divide in two figures.
def visualization_all_mcc_str(path_benchmark, estimator, function, modules):
list_name = get_list_structs_all(path_benchmark, estimator, function, modules)[0]
tab_struct2d = get_list_structs_all(path_benchmark, estimator, function, modules)[1]
min = 20
max = 0
max_i = 0
min_i = 0
for i in range(len(tab_struct2d)):
if (len(tab_struct2d[i]) > max):
max = len(tab_struct2d[i])
max_i = i
if (len(tab_struct2d[i]) < min):
min = len(tab_struct2d[i])
min_i = i
print("max: " + list_name[max_i] + " " + str(max) + " min: " + list_name[min_i] + " " + str(min) + "\n")
np_struct2d = np.array(tab_struct2d)
size = len(tab_struct2d)
list_median_str = []
for i in range(size):
list_median_str.append(np.median(np_struct2d[i]))
all_str = []
for i in range(size):
for j in range(len(np_struct2d[i])):
all_str.append(np_struct2d[i][j])
"""print("mediane struct" + estimator + " + " + function + " : " + str(np.median(all_str)))
print("ecart struct" + estimator + " + " + function + " : " + str(np.std(all_str)) + "\n")"""
data = [x for _, x in sorted(zip(list_median_str, tab_struct2d))]
boxName = [x for _, x in sorted(zip(list_median_str, list_name))]
if (len(data) % 2 == 0):
absciss = len(data) / 2
else:
absciss = len(data) / 2 + 0.5
divide_tab_name = get_half(boxName)
divide_tab_data = get_half(data)
plt.figure(figsize=(15,4),dpi=200)
plt.xticks(rotation=90)
plt.boxplot(divide_tab_data[0], medianprops=dict(color='black'))
for i in range(int(absciss)):
y =data[i]
x = np.random.normal(1 + i, 0.04, size=len(y))
plt.scatter(x, y)
plt.xticks(np.arange(1, absciss + 1), divide_tab_name[0])
plt.xlabel('nom de la séquence')
plt.ylabel('MCC (appariements)')
plt.savefig('visualisation_all_mcc_' + estimator + "_" + function + "_" + modules + '.png', bbox_inches='tight')
plt.figure(figsize=(15, 4), dpi=200)
plt.xticks(rotation=90)
plt.boxplot(divide_tab_data[1], medianprops=dict(color='black'))
for i in range(len(data)):
if i + int(absciss) < len(data):
y = data[i + int(absciss)]
x = np.random.normal(1 + i, 0.04, size=len(y))
plt.scatter(x, y)
plt.xticks(np.arange(1, absciss + 1), divide_tab_name[1])
plt.xlabel('nom de la séquence')
plt.ylabel('MCC')
plt.savefig('visualisation_all_mcc_' + estimator + "_" + function + "_" + modules + '_2.png', bbox_inches='tight')
# Create a boxplot with all the MCC (for the base pairing and the contacts) obtains with all
# the prediction for each sequence, divide in two figures.
def visualization_all_mcc(path_benchmark, estimator, function, modules):
list_name = get_list_structs_contacts_all(path_benchmark, estimator, function)[0]
tab_struct2d = get_list_structs_contacts_all(path_benchmark, estimator, function)[1]
tab_contacts = get_list_structs_contacts_all(path_benchmark, estimator, function)[2]
min = 20
max = 0
max_i = 0
min_i = 0
for i in range(len(tab_struct2d)):
if (len(tab_struct2d[i]) > max):
max = len(tab_struct2d[i])
max_i = i
if (len(tab_struct2d[i]) < min):
min = len(tab_struct2d[i])
min_i = i
print("max: " + list_name[max_i] + " " + str(max) + " min: " + list_name[min_i] + " " + str(min) + "\n")
np_struct2d = np.array(tab_struct2d)
size = len(tab_struct2d)
list_median_str = []
for i in range(size):
list_median_str.append(np.median(np_struct2d[i]))
all_str = []
for i in range(size):
for j in range(len(np_struct2d[i])):
all_str.append(np_struct2d[i][j])
"""print("mediane struct" + estimator + " + " + function + " : " + str(np.median(all_str)))
print("ecart struct" + estimator + " + " + function + " : " + str(np.std(all_str)) + "\n")"""
data = [x for _, x in sorted(zip(list_median_str, tab_struct2d))]
boxName = [x for _, x in sorted(zip(list_median_str, list_name))]
if (len(data) % 2 == 0):
absciss = len(data) / 2
else:
absciss = len(data) / 2 + 0.5
divide_tab_name = get_half(boxName)
divide_tab_data = get_half(data)
plt.figure(figsize=(15,4),dpi=200)
plt.xticks(rotation=90)
plt.boxplot(divide_tab_data[0], medianprops=dict(color='black'))
for i in range(int(absciss)):
y =data[i]
x = np.random.normal(1 + i, 0.04, size=len(y))
plt.scatter(x, y)
plt.xticks(np.arange(1, absciss + 1), divide_tab_name[0])
plt.xlabel('nom de la séquence')
plt.ylabel('MCC (appariements)')
plt.savefig('visualisation_128arn_structure2d_' + estimator + "_" + function + "_" + modules + '.png', bbox_inches='tight')
plt.figure(figsize=(15, 4), dpi=200)
plt.xticks(rotation=90)
plt.boxplot(divide_tab_data[1], medianprops=dict(color='black'))
for i in range(len(data)):
if i + int(absciss) < len(data):
y = data[i + int(absciss)]
x = np.random.normal(1 + i, 0.04, size=len(y))
plt.scatter(x, y)
plt.xticks(np.arange(1, absciss + 1), divide_tab_name[1])
plt.xlabel('nom de la séquence')
plt.ylabel('MCC')
plt.savefig('visualisation_128arn_structure2d_' + estimator + "_" + function + "_" + modules + '_2.png', bbox_inches='tight')
np_contacts = np.array(tab_contacts)
size = len(tab_contacts)
list_median_ctc = []
for i in range(size):
list_median_ctc.append(np.median(np_contacts[i]))
all_ctc = []
for i in range(size):
for j in range(len(np_contacts[i])):
all_ctc.append(np_contacts[i][j])
"""print("mediane ctc" + estimator + " + " + function + " : " + str(np.median(all_ctc)))
print("ecart ctc" + estimator + " + " + function + " : " + str(np.std(all_ctc)) + "\n")"""
data = [x for _, x in sorted(zip(list_median_ctc, tab_contacts))]
boxName = [x for _, x in sorted(zip(list_median_ctc, list_name))]
if (len(data) % 2 == 0) :
absciss = len(data)/2
else :
absciss = len(data)/2 + 0.5
divide_tab_name = get_half(boxName)
divide_tab_data = get_half(data)
plt.figure(figsize=(15, 4), dpi=200)
plt.xticks(rotation=90)
plt.boxplot(divide_tab_data[0], medianprops=dict(color='black'))
for i in range(int(absciss)):
y = data[i]
x = np.random.normal(1 + i, 0.04, size=len(y))
plt.scatter(x, y)
plt.xticks(np.arange(1, absciss + 1), divide_tab_name[0])
plt.xlabel('nom de la séquence')
plt.ylabel('MCC (contacts)')
plt.savefig('visualisation_128arn_contacts_' + estimator + "_" + function + '.png', bbox_inches='tight')
plt.figure(figsize=(15, 4), dpi=200)
plt.xticks(rotation=90)
plt.boxplot(divide_tab_data[1], medianprops=dict(color='black'))
for i in range(len(data)):
if i + int(absciss) < len(data) :
y = data[i + int(absciss)]
x = np.random.normal(1 + i, 0.04, size=len(y))
plt.scatter(x, y)
plt.xticks(np.arange(1, absciss + 1), divide_tab_name[1])
plt.xlabel('nom de la séquence')
plt.ylabel('MCC')
plt.savefig('visualisation_128arn_contacts_' + estimator + "_" + function + '_2.png', bbox_inches='tight')
# Most of those commands lines and this script work only for the benchmark_16-07-2021.json
# file provides by Isaure Chauvot de Beauchene. This script is means to automize the execution
# of BiORSEO for each sequence and the creation of figures.
# after compiling create_file.cpp in 'scripts' directory (compiling once is enough) :
# clang++ create_files.cpp -o create
#cmd = ("scripts/create")
# after compiling create_file.cpp in 'scripts' directory (compiling once is enough) :
# clang++ add_delimiter.cpp -o addDelimiter
#cmd0 = ("cppsrc/Scripts/addDelimiter")
# after compiling count_pattern.cpp in 'scripts' directory (compiling once is enough) :
# clang++ count_pattern.cpp -o countPattern
#cmd1 = ("cppsrc/Scripts/countPattern")
myfile = open("data/modules/ISAURE/benchmark.txt", "r")
name = myfile.readline()
contacts = myfile.readline()
seq = myfile.readline()
structure2d = myfile.readline()
list_struct2d = [[],[],[],[]]
# source path to the directory (for my computer)
path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo"
path2 = "/local/local/BiorseoNath"
while seq:
name = name[6:].strip()
print(name)
# after compiling delete_same_pdb.cpp in 'scripts' directory (compiling once is enough) :
# clang++ delete_same_pdb.cpp -o deletePdb
#cmd2 = ("cppsrc/Scripts/deletePdb " + name)
#os.system(cmd2)
"""get_list_str_by_seq(name, 'MEA', 'E', list_struct2d[0], structure2d, 'json')
get_list_str_by_seq(name, 'MEA', 'F', list_struct2d[1], structure2d, 'json')
get_list_str_by_seq(name, 'MFE', 'E', list_struct2d[2], structure2d, 'json')
get_list_str_by_seq(name, 'MFE', 'F', list_struct2d[3], structure2d, 'json')"""
name = myfile.readline()
contacts = myfile.readline()
seq = myfile.readline()
structure2d = myfile.readline()
#visualization_best_mcc_str_4_figures(list_struct2d, 'red', '#900C3F')
"""visualization_best_mcc(list_struct2d_A_MFE, list_contacts_A_MFE, 'MFE', 'A', 'rin', 'red', '#900C3F')
visualization_best_mcc(list_struct2d_B_MFE, list_contacts_B_MFE, 'MFE', 'B', 'rin', 'blue', '#0900FF')
visualization_best_mcc(list_struct2d_A_MEA, list_contacts_A_MEA, 'MEA', 'A', 'rin', 'red', '#900C3F')
visualization_best_mcc(list_struct2d_B_MEA, list_contacts_B_MEA, 'MEA', 'B', 'rin', 'blue', '#0900FF')"""
myfile.close()
path_benchmark = "data/modules/ISAURE/benchmark.txt"
visualization_all_mcc_str(path_benchmark, 'MEA', 'C', 'bgsu')
visualization_all_mcc_str(path_benchmark, 'MEA', 'D', 'bgsu')
visualization_all_mcc_str(path_benchmark, 'MFE', 'C', 'bgsu')
visualization_all_mcc_str(path_benchmark, 'MFE', 'D', 'bgsu')
\ No newline at end of file
......@@ -40,12 +40,9 @@ re: remove clean all
.PHONY: clean
clean:
$(rm) $(OBJECTS)
$(rm) doc/supplementary_material.bbl doc/supplementary_material.blg doc/supplementary_material.synctex.gz doc/supplementary_material.log doc/supplementary_material.aux
$(rm) doc/main_bioinformatics.bbl doc/main_bioinformatics.blg doc/main_bioinformatics.synctex.gz doc/main_bioinformatics.log doc/main_bioinformatics.aux doc/OUP_First_SBk_Bot_8401-eps-converted-to.pdf
@echo -e "\033[00;32mCleanup completed.\033[00m"
.PHONY: remove
remove:
@$(rm) $(BINDIR)/$(TARGET)
@$(rm) doc/main_bioinformatics.pdf doc/supplementary_material.pdf
@echo -e "\033[00;32mExecutable and docs removed!\033[00m"
......
......@@ -59,7 +59,7 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
{
if (!exists(source_path))
{
cerr << "!!! Hmh, i can't find that folder: " << source_path << endl;
cerr << "ERR: Hmh, i can't find this: " << source_path << endl;
exit(EXIT_FAILURE);
}
......@@ -112,7 +112,7 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
if (verbose_) cout << "\t> Looking for insertion sites..." << endl;
if (source == "jar3dcsv" or source == "bayespaircsv")
if (source == "csvfile")
{
std::ifstream motifs;
string line;
......@@ -182,7 +182,7 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
accepted++;
if (is_desc_insertible(it.path().string(), rna_.get_seq()))
{
args_of_parallel_func args(it.path(), posInsertionSites_access);
file_and_mutex args(it.path(), posInsertionSites_access);
inserted++;
pool.push(bind(&MOIP::allowed_motifs_from_desc, this, args)); // & is necessary to get the pointer to a member function
}
......@@ -201,7 +201,6 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
{
mutex posInsertionSites_access;
Pool pool;
size_t inserted = 0;
size_t accepted = 0;
size_t errors = 0;
int num_threads = thread::hardware_concurrency() - 1;
......@@ -231,8 +230,7 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
continue;
}
accepted++;
args_of_parallel_func args(it.path(), posInsertionSites_access);
inserted++;
file_and_mutex args(it.path(), posInsertionSites_access);
pool.push(bind(&MOIP::allowed_motifs_from_rin, this, args)); // & is necessary to get the pointer to a member function
}
pool.done();
......@@ -241,58 +239,61 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
thread_pool.at(i).join();
if (verbose){
cout << "\t> " << inserted << " candidate RINs on " << accepted + errors << " (" << errors << " ignored motifs), " << endl;
cout << "\t " << insertion_sites_.size() << " insertion sites kept after applying probability threshold of " << theta << endl;
cout << "\t> " << insertion_sites_.size() << " candidate RINs on " << accepted + errors << " (" << errors << " ignored motifs), after applying probability threshold of " << theta << endl;
}
}
else if (source == "jsonfolder")
{
mutex posInsertionSites_access;
Pool pool;
size_t inserted = 0;
size_t accepted = 0;
size_t errors = 0;
mutex posInsertionSites_access;
Pool pool;
size_t accepted = 0;
size_t errors = 0;
int num_threads = thread::hardware_concurrency() - 1;
vector<thread> thread_pool;
std::ifstream motif;
// Read every JSON files
vector<pair<uint,char>> errors_id;
for (auto it : recursive_directory_range(source_path))
{
errors_id = Motif::is_valid_JSON(it.path().string());
if (!(errors_id.empty())) // Returns error if JSON file is incorrect
{
for(uint j = 0; j < errors_id.size(); j++)
// prepare a pool of threads to process motifs in parallel
for (int i = 0; i < num_threads; i++)
thread_pool.push_back(thread(&Pool::infinite_loop_func, &pool));
// Read every JSON entry and add it to the queue (iff valid)
motif = std::ifstream(source_path);
json js = json::parse(motif);
char error;
for(auto it = js.begin(); it != js.end(); ++it) {
if ((error = Motif::is_valid_JSON(it))) // Returns error if JSON entry is incorrect
{
if (verbose)
{
if(verbose) {
cerr << "\t> Ignoring JSON " << errors_id[j].first;
uint error = errors_id[j].second;
switch (error)
{
case 'l': cerr << ", too short to be considered."; break;
case 'x': cerr << ", sequence and secondary structure are of different size."; break;
case 'd' : cerr << ", missing header."; break;
case 'e' : cerr << ", sequence is empty."; break;
case 'f' : cerr << ", 2D is empty."; break;
case 'n' : cerr << ", brackets are not balanced."; break;
case 'k' : cerr << ", a component is too small and got removed."; break;
case 'a' : cerr << ", the number of components is different between contacts and sequence"; break;
case 'b' : cerr << ", the number of nucleotides is different between contacts and sequence"; break;
default: cerr << ", unknown reason";
}
cerr << endl;
cerr << "\t> Ignoring motif " << it.key(); // field key, i.e. motif identifier
switch (error)
{
case 'l': cerr << ", too short to be considered."; break;
case 'x': cerr << ", sequence and secondary structure are of different size."; break;
case 'e' : cerr << ", sequence is empty."; break;
case 'f' : cerr << ", 2D is empty."; break;
case 'r' : cerr << ", RNA sequence not recognized, only use ACGTU."; break;
case 'n' : cerr << ", brackets are not balanced."; break;
default: cerr << ", unknown reason";
}
cerr << endl;
}
errors++;
continue;
}
accepted++;
args_of_parallel_func args(it.path(), posInsertionSites_access);
inserted++;
allowed_motifs_from_json(args, errors_id);
}
if (verbose_){
cout << "\t " << insertion_sites_.size() << " insertion sites kept after applying probability threshold of " << theta << endl;
// add a new job to the pool, to run allowed_motifs_from_json(args)
motif_and_mutex args(it, posInsertionSites_access);
pool.push(bind(&MOIP::allowed_motifs_from_json, this, args)); // & is necessary to get the pointer to a member function
}
pool.done(); // we won't add new jobs
// Wait for jobs to complete
for (unsigned int i = 0; i < thread_pool.size(); i++)
thread_pool.at(i).join();
if (verbose_) cout << "\t> " << insertion_sites_.size() << " candidate motifs on " << accepted + errors << " (" << errors << " ignored motifs), after applying probability threshold of " << theta << endl;
}
else
{
......@@ -343,7 +344,6 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
obj1 = IloExpr(env_);
for (uint i = 0; i < insertion_sites_.size(); i++) {
IloNum sum_k = 0;
IloNum kx2_wx_ax = 0;
switch (obj_function_nbr_) {
case 'A':
......@@ -369,22 +369,9 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
obj1 += IloNum(insertion_sites_[i].comp.size() * insertion_sites_[i].score_ / log2(sum_k)) *
insertion_dv_[index_of_first_components[i]];
break;
case 'E':
// Fonction f1E
for (const Component& c : insertion_sites_[i].comp) sum_k += c.k;
obj1 += IloNum(sum_k * insertion_sites_[i].contact_ * insertion_sites_[i].tx_occurrences_) * insertion_dv_[index_of_first_components[i]] ;
break;
case 'F':
// Fonction f1F
for (const Component& c : insertion_sites_[i].comp) sum_k += c.k;
kx2_wx_ax += IloNum((sum_k * sum_k * insertion_sites_[i].tx_occurrences_) + insertion_sites_[i].contact_);
obj1 += insertion_dv_[index_of_first_components[i]] * IloNum(kx2_wx_ax);
break;
}
}
//Stacking energy parameter matrix
double energy[7][7] = {
{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
......@@ -396,8 +383,8 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
{0.0, 0.6, 1.4, 1.5, 0.5, 1.0, 0.3}
};
obj2 = IloExpr(env_);
switch (obj_function2_nbr_) {
obj2 = IloExpr(env_);
switch (obj_function2_nbr_) {
case 'a':
// Define the MFE (Minimum Free Energy):
for (size_t u = 0; u < rna_.get_RNA_length() - 6; u++) {
......@@ -409,7 +396,7 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
}
}
}
break;
break;
case 'b':
// Define the expected accuracy objective function:
//MEA:
......@@ -418,7 +405,7 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool
if (allowed_basepair(u, v)) obj2 += (IloNum(rna_.get_pij(u, v)) * y(u, v));
}
}
break;
break;
}
}
......@@ -909,7 +896,7 @@ bool MOIP::allowed_basepair(size_t u, size_t v) const
return true;
}
void MOIP::allowed_motifs_from_desc(args_of_parallel_func arg_struct)
void MOIP::allowed_motifs_from_desc(file_and_mutex arg_struct)
{
/*
Searches where to place some DESC module in the RNA
......@@ -1085,7 +1072,7 @@ void MOIP::allowed_motifs_from_desc(args_of_parallel_func arg_struct)
}
}
void MOIP::allowed_motifs_from_rin(args_of_parallel_func arg_struct)
void MOIP::allowed_motifs_from_rin(file_and_mutex arg_struct)
{
// Searches where to place some RINs in the RNA
......@@ -1125,6 +1112,7 @@ void MOIP::allowed_motifs_from_rin(args_of_parallel_func arg_struct)
for (vector<Component>& v : vresults)
{
// Now create an actual Motif with the class
Motif temp_motif = Motif(v, rinfile, carnaval_id, false);
bool unprobable = false;
......@@ -1142,114 +1130,71 @@ void MOIP::allowed_motifs_from_rin(args_of_parallel_func arg_struct)
}
}
void MOIP::allowed_motifs_from_json(args_of_parallel_func arg_struct, vector<pair<uint, char>> errors_id)
void MOIP::allowed_motifs_from_json(motif_and_mutex arg_struct)
{
// Searches where to place some JSON motifs in the RNA
path jsonfile = arg_struct.motif_file;
mutex& posInsertionSites_access = arg_struct.posInsertionSites_mutex;
std::ifstream motif;
string filepath = jsonfile.string();
vector<vector<Component>> vresults, r_vresults;
vector<string> component_sequences;
vector<string> component_contacts;
vector<string> pdbs;
string contacts, field, seq, struct2d;
string contacts_id;
string line, filenumber;
string rna = rna_.get_seq();
string reversed_rna = rna_.get_seq();
size_t nb_contacts = 0;
double tx_occurrences = 0;
mutex& posInsertionSites_access = arg_struct.posInsertionSites_mutex;
json_elem it = arg_struct.motif;
vector<vector<Component>> vresults;
vector<string> component_sequences;
string subseq, seq, fullseq, struct2d;
size_t end;
// Retrieve sequenc eand structure
seq = it.value()["sequence"];
fullseq = string(seq);
struct2d = it.value()["struct2d"];
transform(seq.begin(), seq.end(), seq.begin(), ::toupper);
// Now split the sequence into components
while(seq.find("&") != string::npos) {
end = seq.find("&");
subseq = seq.substr(0, end);
seq = seq.substr(end + 1);
component_sequences.push_back(subseq); // new component sequence
}
if (!seq.empty()) {
component_sequences.push_back(seq);
}
std::reverse(reversed_rna.begin(), reversed_rna.end());
// Place the components by pattern-matching
vresults = find_next_ones_in(rna_.get_seq(), 0, component_sequences);
motif = std::ifstream(filepath);
json js = json::parse(motif);
// Check if the results are possible (likeliness)
for (vector<Component>& v : vresults)
{
// Now create proper Motifs with Motif class
Motif temp_motif = Motif(v, it.key(), struct2d);
string keys[5] = {"contacts", "occurences", "pdb", "sequence", "struct2d"};
uint it_errors = 0;
uint comp;
uint occ = 0;
for(auto it = js.begin(); it != js.end(); ++it) {
contacts_id = it.key();
comp = stoi(contacts_id);
// Check for known errors to ignore corresponding motifs
if (comp == errors_id[it_errors].first) {
while (comp == errors_id[it_errors].first) {
it_errors ++;
}
continue;
}
for(auto it2 = js[contacts_id].begin(); it2 != js[contacts_id].end(); ++it2) {
field = it2.key();
if (!field.compare(keys[0])) // This is the contacts field
{
contacts = it2.value();
nb_contacts = count_contacts(contacts);
component_contacts = find_components(contacts, "&");
}
else if (!field.compare(keys[1])) // This is the occurences field
{
occ = it2.value();
tx_occurrences = (double)occ; // (double)max_occ;
if (verbose_) cout << "\t> Considering motif JSON " << temp_motif.pos_string() << "\t"
<< fullseq << ", " << struct2d << " ";
}
else if (!field.compare(keys[2])) // This is the pdb field
{
vector<string> tab = it2.value();
pdbs = tab;
}
else if (!field.compare(keys[3])) // This is the sequence field
{
seq = check_motif_sequence(it2.value());
component_sequences = find_components(seq, "&");
}
else if (!field.compare(keys[4])) // This is the struct2D field
{
struct2d = it2.value();
}
// Check if the motif can be inserted, checking the basepairs probabilities and theta
bool unprobable = false;
if (!temp_motif.links_.size()) {
if (verbose_) cout << "discarded, no constraints on the secondary structure, it is a useless motif." << endl;
continue;
}
vresults = json_find_next_ones_in(rna, 0, component_sequences);
for (vector<Component>& v : vresults)
if (verbose_) cout << " + links at positions ";
for (const Link& l : temp_motif.links_)
{
if (verbose_) cout << "\t> Considering motif JSON " << contacts_id << "\t" << seq << ", " << struct2d << " ";
Motif temp_motif = Motif(v, contacts_id, nb_contacts, tx_occurrences);
temp_motif.links_ = build_motif_pairs(struct2d, v);
temp_motif.pos_contacts = find_contacts(component_contacts, v);
// Check if the motif can be inserted, checking the basepairs probabilities and theta
bool unprobable = false;
if (!temp_motif.links_.size()) {
if (verbose_) cout << "discarded, no constraints on the secondary structure, it is a useless motif." << endl;
continue;
}
if (verbose_) cout << "at position ";
for (const Link& l : temp_motif.links_)
{
if (verbose_) cout << l.nts.first << ',' << l.nts.second << ' ';
if (!allowed_basepair(l.nts.first, l.nts.second)) {
if (verbose_) cout << "(unlikely) ";
unprobable = true;
}
}
if (unprobable) {
if (verbose_) cout << ", discarded because of unlikely or impossible basepairs" << endl;
continue;
if (verbose_) cout << l.nts.first << ',' << l.nts.second << ' ';
if (!allowed_basepair(l.nts.first, l.nts.second)) {
if (verbose_) cout << "(unlikely) ";
unprobable = true;
}
if (verbose_) cout << endl;
// Add it to the results vector
unique_lock<mutex> lock(posInsertionSites_access);
insertion_sites_.push_back(temp_motif);
lock.unlock();
}
component_sequences.clear();
}
if (unprobable) {
if (verbose_) cout << ", discarded because of unlikely or impossible basepairs" << endl;
continue;
}
if (verbose_) cout << endl;
// Add it to the results vector
unique_lock<mutex> lock(posInsertionSites_access);
insertion_sites_.push_back(temp_motif);
lock.unlock();
}
}
\ No newline at end of file
......
......@@ -12,12 +12,19 @@
using std::vector;
typedef struct args_ {
typedef struct argsf_ {
path motif_file;
std::mutex& posInsertionSites_mutex;
args_(path motif_file_, mutex& mutex_) : motif_file(motif_file_), posInsertionSites_mutex(mutex_) {}
} args_of_parallel_func;
argsf_(path motif_file_, mutex& mutex_)
: motif_file(motif_file_), posInsertionSites_mutex(mutex_) {}
} file_and_mutex;
typedef struct argsm_ {
json_elem motif;
std::mutex& posInsertionSites_mutex;
argsm_(json_elem& motif_, mutex& mutex_)
: motif(motif_), posInsertionSites_mutex(mutex_) {}
} motif_and_mutex;
class MOIP
{
......@@ -56,9 +63,9 @@ class MOIP
bool exists_vertical_outdated_labels(const SecondaryStructure& s) const;
bool exists_horizontal_outdated_labels(const SecondaryStructure& s) const;
void allowed_motifs_from_desc(args_of_parallel_func arg_struct);
void allowed_motifs_from_rin(args_of_parallel_func arg_struct);
void allowed_motifs_from_json(args_of_parallel_func arg_struct, vector<pair<uint, char>> errors_id);
void allowed_motifs_from_desc(file_and_mutex arg_struct);
void allowed_motifs_from_rin(file_and_mutex arg_struct);
void allowed_motifs_from_json(motif_and_mutex arg_struct);
bool verbose_; // Should we print things ?
......@@ -79,8 +86,7 @@ class MOIP
vector<vector<size_t>> index_of_Cxip_; // Stores the indexes of the Cxip in insertion_dv_
vector<size_t> index_of_first_components; // Stores the indexes of Cx1p in insertion_dv_
vector<vector<size_t>> index_of_yuv_; // Stores the indexes of the y^u_v in basepair_dv_
vector<vector<size_t>> index_of_xij_; //Stores the indexes of the xij variables (BioKop) in stacks_dv_
vector<vector<size_t>> index_of_xij_; //Stores the indexes of the xij variables (BioKop) in stacks_dv_
};
inline uint MOIP::get_n_solutions(void) const { return pareto_.size(); }
......
......@@ -5,71 +5,36 @@
#include <regex>
#include <sstream>
#include <thread>
#include <json.hpp>
using namespace boost::filesystem;
using namespace std;
using json = nlohmann::json;
Motif::Motif(void) {}
uint Motif::delay = 1;
Motif::Motif(const vector<Component>& v, string PDB) : comp(v), PDBID(PDB)
{
is_model_ = false;
reversed_ = false;
source_ = RNA3DMOTIF;
}
Motif::Motif(const vector<Component>& v, string id, size_t nb_contacts, double tx_occurrences) : comp(v)
{
contacts_id = id;
is_model_ = false;
reversed_ = false;
source_ = CONTACTS;
contact_ = nb_contacts;
tx_occurrences_ = tx_occurrences;
}
Motif::Motif(void) {}
Motif::Motif(string csv_line)
{
vector<string> tokens;
split(tokens, csv_line, boost::is_any_of(","));
if (csv_line.find(string("True")) != std::string::npos or csv_line.find(string("False")) != std::string::npos) // This has been created by jar3d
id_ = tokens[0];
score_ = stoi(tokens[1]);
source_ = CSV;
reversed_ = false; // default value
if (csv_line.find(string("True")) != std::string::npos or csv_line.find(string("False")) != std::string::npos)
{
atlas_id = tokens[0];
// This has been created by jar3d
score_ = stoi(tokens[2]);
comp.push_back(Component(make_pair<int, int>(stoi(tokens[3]), stoi(tokens[4]))));
if (tokens[5] != "-") comp.push_back(Component(make_pair<int, int>(stoi(tokens[5]), stoi(tokens[6]))));
reversed_ = (tokens[1] == "True");
is_model_ = true;
PDBID = "";
source_ = RNAMOTIFATLAS;
}
else // this has been created by BayesPairing
{
score_ = stoi(tokens[1]);
// identify source:
if (tokens[0].find(string("rna3dmotif")) == std::string::npos)
{
is_model_ = true;
PDBID = "";
source_ = RNAMOTIFATLAS;
atlas_id = tokens[0];
}
else
{
is_model_ = false;
PDBID = tokens[0];
source_ = RNA3DMOTIF;
atlas_id = "";
}
uint i = 2;
//while (i < tokens.size())
while (i < tokens.size()-1)
{
if (stoi(tokens[i]) < stoi(tokens[i + 1]))
......@@ -81,12 +46,71 @@ Motif::Motif(string csv_line)
}
}
Motif::Motif(const vector<Component>& v, string name) : comp(v), id_(name)
{
source_ = RNA3DMOTIF;
reversed_ = false;
}
Motif::Motif(const vector<Component>& v, string name, string& struc) : comp(v), id_(name)
{
source_ = JSON;
reversed_ = false;
stack<uint> rbrackets;
stack<uint> sbrackets;
stack<uint> cbrackets;
stack<uint> chevrons;
uint count = 0;
uint debut = v[count].pos.first;
uint gap = 0;
for (uint i = 0; i < struc.size(); i++) {
if (struc[i] == '(') {
rbrackets.push(i + debut + gap - count);
} else if (struc[i] == ')') {
Link l;
l.nts.first = rbrackets.top();
l.nts.second = i + debut + gap - count;
links_.push_back(l);
rbrackets.pop();
} else if (struc[i] == '[') {
sbrackets.push(i + debut + gap - count);
} else if (struc[i] == ']') {
Link l;
l.nts.first = sbrackets.top();
l.nts.second = i + debut + gap - count;
links_.push_back(l);
sbrackets.pop();
} else if (struc[i] == '{') {
cbrackets.push(i + debut + gap - count);
} else if (struc[i] == '}') {
Link l;
l.nts.first = cbrackets.top();
l.nts.second = i + debut + gap - count;
links_.push_back(l);
cbrackets.pop();
} else if (struc[i] == '<') {
chevrons.push(i + debut + gap - count);
} else if (struc[i] == '>') {
Link l;
l.nts.first = chevrons.top();
l.nts.second = i + debut + gap - count;
links_.push_back(l);
chevrons.pop();
} else if (struc[i] == '&') {
count ++;
gap += v[count].pos.first - v[count - 1].pos.second - 1;
}
}
}
Motif::Motif(const vector<Component>& v, path rinfile, uint id, bool reversed) : comp(v), reversed_(reversed)
{
// Loads a motif from the RIN file of Carnaval
carnaval_id = to_string(id);
source_ = CARNAVAL;
is_model_ = false;
id_ = to_string(id);
source_ = CARNAVAL;
std::ifstream file(rinfile);
......@@ -158,7 +182,7 @@ Motif::Motif(const vector<Component>& v, path rinfile, uint id, bool reversed) :
string Motif::pos_string(void) const
{
stringstream s;
s << atlas_id << " ( ";
s << id_ << " ( ";
for (auto c : comp) s << c.pos.first << '-' << c.pos.second << ' ';
s << ')';
return s.str();
......@@ -185,10 +209,9 @@ string Motif::sec_struct(void) const
string Motif::get_identifier(void) const
{
switch (source_) {
case RNAMOTIFATLAS: return atlas_id; break;
case CARNAVAL: return string("RIN") + carnaval_id; break;
case CONTACTS: return string("JSON") + contacts_id; break;
default: return PDBID;
case CARNAVAL: return string("RIN") + id_; break;
case JSON: return string("JSON") + id_; break;
default: return id_;
}
}
......@@ -265,167 +288,52 @@ char Motif::is_valid_RIN(const string& rinfile)
return (char) 0;
}
vector<pair<uint,char>> Motif::is_valid_JSON(const string& jsonfile)
char Motif::is_valid_JSON(const json_elem& i)
{
// returns 0 if no errors
std::ifstream motif;
motif = std::ifstream(jsonfile);
json js = json::parse(motif);
vector<pair<uint,char>> errors_id;
string seq = i.value()["sequence"];
string ss = i.value()["struct2d"];
if (ss.size() != seq.size()) return 'x';
if (!check_motif_sequence(seq)) return 'r';
if (!check_motif_ss(ss)) return 'n';
if (seq.empty()) return 'e';
if (seq.size() - count(seq.begin(),seq.end(), '&') < 4) return 'l';
// Iterate on components to check their length
string subseq;
vector<string> components;
uint fin = 0;
std::string keys[6] = {"contacts", "occurences", "pdb", "pfam", "sequence", "struct2d"};
// Iterating over Motifs
for (auto i = js.begin(); i != js.end(); ++i) {
int j = 0;
string id = i.key();
size_t size;
string complete_seq;
string contacts;
// Iterating over json keys
for (auto it = js[id].begin(); it != js[id].end(); ++it) {
// it.key() contains the key, and it.value() the field's value.
string curr_key = it.key();
if (curr_key.compare(keys[j]))
{
//std::cout << "error header : keys[" << j << "]: " << keys[j] << " vs curr_key: " << curr_key << endl;
errors_id.push_back(make_pair(stoi(id), 'd'));
//return 'd';
}
else if(!curr_key.compare(keys[5])) // This is the secondary structure field
{
string ss = it.value();
if (ss.empty()) {
errors_id.push_back(make_pair(stoi(id), 'f'));
break;
} else if (!checkSecondaryStructure(ss)) {
errors_id.push_back(make_pair(stoi(id), 'n'));
break;
} else if (ss.size() != complete_seq.size()) {
errors_id.push_back(make_pair(stoi(id), 'x'));
break;
}
}
else if(!curr_key.compare(keys[0])) // This is the contacts field
{
contacts = it.value();
}
else if (!curr_key.compare(keys[4])) // This is the sequence field
{
string seq = it.value();
size = count_nucleotide(seq);
complete_seq = seq;
if (seq.empty()) {
errors_id.push_back(make_pair(stoi(id), 'e'));
break;
}
if (size < 4) {
errors_id.push_back(make_pair(stoi(id), 'l'));
break;
}
size_t count_contact = count_delimiter(contacts);
size_t count_seq = count_delimiter(seq);
if (count_contact != count_seq) {
errors_id.push_back(make_pair(stoi(id), 'a'));
break;
}
if (contacts.size() != seq.size()) {
errors_id.push_back(make_pair(stoi(id), 'b'));
break;
}
// Iterate on components to check their length
string subseq;
while((seq.find('&') != string::npos)) {
fin = seq.find('&');
subseq = seq.substr(0, fin);
seq = seq.substr(fin + 1);
if (subseq.size() >= 2) {
components.push_back(subseq);
} else {
//errors_id.push_back(make_pair(stoi(id), 'k'));
}
}
if (seq.size() >= 2) { // Last component after the last &
components.push_back(seq);
} else {
//errors_id.push_back(make_pair(stoi(id), 'k'));
}
size_t n = 0;
for (uint ii = 0; ii < components.size(); ii++) {
n += components[ii].size();
}
if(n <= 3) {
errors_id.push_back(make_pair(stoi(id), 'l'));
}
}
j++;
}
uint end, n=0;
while((seq.find('&') != string::npos)) {
end = seq.find('&');
subseq = seq.substr(0, end);
seq = seq.substr(end + 1);
if (subseq.size() >= 2) components.push_back(subseq);
}
return errors_id;
}
if (seq.size() >= 2) // Last component after the last &
components.push_back(seq);
for (auto comp : components)
n += comp.size();
if(n <= 3) return 'l';
//count the number of nucleotide in the motif sequence
size_t count_nucleotide(string& seq) {
size_t count = 0;
for(uint i = 0; i < seq.size(); i++) {
char c = seq.at(i);
if (c != '&') {
count++;
}
}
return count;
}
//count the number of '&' in the motif sequence
size_t count_delimiter(string& seq) {
size_t count = 0;
for(uint i = 0; i < seq.size(); i++) {
char c = seq.at(i);
if (c == '&') {
count++;
}
}
return count;
}
//count the number of '*' in the motif
size_t count_contacts(string& contacts) {
size_t count = 0;
for (uint i = 0; i < contacts.size(); i++) {
if (contacts[i] == '*') {
count++;
}
}
return count;
return char(0);
}
//Check if the sequence is a rna sequence (ATGC) and replace T by U or remove modified nucleotide if necessary
string check_motif_sequence(string seq) {
std::transform(seq.begin(), seq.end(), seq.begin(), ::toupper);
for (int i = seq.size(); i >= 0; i--) {
if(seq[i] == 'T') {
seq[i] = 'U';
} else if (!(seq [i] == 'A' || seq [i] == 'U' || seq [i] == '&'
|| seq [i] == 'G' || seq [i] == 'C')) {
seq = seq.erase(i,1);
//Check if the sequence is a rna sequence (ATUGC-only)
bool check_motif_sequence(string seq) {
transform(seq.begin(), seq.end(), seq.begin(), ::toupper);
for (char c : seq)
if (!(c == 'A' || c == 'U' || c == 'T' || c == '&' || c == 'G' || c == 'C')) {
return false;
}
}
return seq;
return true;
}
//check that there are as many opening parentheses as closing ones
bool checkSecondaryStructure(string struc)
// Check that there are as many opening brackets as closing ones
bool check_motif_ss(string struc)
{
stack<uint> parentheses;
stack<uint> crochets;
stack<uint> accolades;
stack<uint> rbrackets;
stack<uint> sbrackets;
stack<uint> cbrackets;
stack<uint> chevrons;
for (uint i = 0; i < struc.length(); i++)
{
......@@ -439,27 +347,27 @@ bool checkSecondaryStructure(string struc)
} else {
for (uint i = 0; i < struc.size(); i++) {
if (struc[i] == '(') {
parentheses.push(i);
rbrackets.push(i);
} else if (struc[i] == ')') {
if (!parentheses.empty())
parentheses.pop();
if (!rbrackets.empty())
rbrackets.pop();
else return false;
} else if (struc[i] == '[') {
crochets.push(i);
sbrackets.push(i);
} else if (struc[i] == ']') {
if (!crochets.empty())
crochets.pop();
if (!sbrackets.empty())
sbrackets.pop();
else return false;
} else if (struc[i] == '{') {
accolades.push(i);
cbrackets.push(i);
} else if (struc[i] == '}') {
if (!accolades.empty())
accolades.pop();
if (!cbrackets.empty())
cbrackets.pop();
else return false;
} else if (struc[i] == '<') {
......@@ -473,140 +381,7 @@ bool checkSecondaryStructure(string struc)
}
}
}
return (parentheses.empty() && crochets.empty() && accolades.empty() && chevrons.empty());
}
// Converts a dot-bracket motif secondary structure to vector of Link
vector<Link> build_motif_pairs(string& struc, vector<Component>& v) {
vector<Link> vec;
stack<uint> parentheses;
stack<uint> crochets;
stack<uint> accolades;
stack<uint> chevrons;
uint count = 0;
uint debut = v[count].pos.first;
uint gap = 0;
for (uint i = 0; i < struc.size(); i++) {
if (struc[i] == '(') {
parentheses.push(i + debut + gap - count);
} else if (struc[i] == ')') {
Link l;
l.nts.first = parentheses.top();
l.nts.second = i + debut + gap - count;
vec.push_back(l);
parentheses.pop();
} else if (struc[i] == '[') {
crochets.push(i + debut + gap - count);
} else if (struc[i] == ']') {
Link l;
l.nts.first = crochets.top();
l.nts.second = i + debut + gap - count;
vec.push_back(l);
crochets.pop();
} else if (struc[i] == '{') {
accolades.push(i + debut + gap - count);
} else if (struc[i] == '}') {
Link l;
l.nts.first = accolades.top();
l.nts.second = i + debut + gap - count;
vec.push_back(l);
accolades.pop();
} else if (struc[i] == '<') {
chevrons.push(i + debut + gap - count);
} else if (struc[i] == '>') {
Link l;
l.nts.first = chevrons.top();
l.nts.second = i + debut + gap - count;
vec.push_back(l);
chevrons.pop();
} else if (struc[i] == '&') {
count ++;
gap += v[count].pos.first - v[count - 1].pos.second - 1;
}
}
return vec;
}
uint find_max_occurrences(string& filepath) {
uint max = 0;
std::ifstream in = std::ifstream(filepath);
json js = json::parse(in);
string contacts_id;
for(auto it = js.begin(); it != js.end(); ++it) {
contacts_id = it.key();
for(auto it2 = js[contacts_id].begin(); it2 != js[contacts_id].end(); ++it2) {
string test = it2.key();
if (!test.compare("occurences")) {
uint occ = it2.value();
if (occ > max) {
max = occ;
}
}
}
}
return max;
}
uint find_max_sequence(string& filepath) {
uint max = 0;
std::ifstream in = std::ifstream(filepath);
json js = json::parse(in);
string contacts_id;
string seq;
for(auto it = js.begin(); it != js.end(); ++it) {
contacts_id = it.key();
for(auto it2 = js[contacts_id].begin(); it2 != js[contacts_id].end(); ++it2) {
string test = it2.key();
if (!test.compare("sequence")) {
seq = it2.value();
uint size = seq.size();
if (size > max) {
max = size;
}
}
}
}
return max;
}
vector<string> find_components(string& sequence, string delimiter) {
vector<string> list;
string seq = sequence;
string subseq;
uint fin = 0;
while(seq.find(delimiter) != string::npos) {
fin = seq.find(delimiter);
subseq = seq.substr(0, fin);
seq = seq.substr(fin + 1);
list.push_back(subseq); // new component sequence
}
if (!seq.empty()) {
list.push_back(seq);
}
return list;
}
vector<uint> find_contacts(vector<string>& struc2d, vector<Component>& v) {
vector<uint> positions;
string delimiter = "*";
uint debut;
for (uint i = 0; i < v.size(); i++) {
debut = v[i].pos.first;
uint pos = struc2d[i].find(delimiter, 0);
while(pos != string::npos && pos <= struc2d[i].size())
{
positions.push_back(pos + debut);
pos = struc2d[i].find(delimiter, pos+1);
}
}
return positions;
return (rbrackets.empty() && sbrackets.empty() && cbrackets.empty() && chevrons.empty());
}
bool is_desc_insertible(const string& descfile, const string& rna)
......@@ -652,10 +427,9 @@ bool is_desc_insertible(const string& descfile, const string& rna)
regex e(seq);
return regex_search(rna, m, e);
}
vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string>& vc)
vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string> vc)
{
pair<uint, uint> pos;
vector<vector<Component>> results;
......@@ -663,122 +437,41 @@ vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<stri
vector<string> next_seqs;
regex c(vc[0]);
//cout << "\t\t>Searching " << vc[0] << " in " << rna << endl;
if (vc.size() > 1) {
//cout << "size vc: " << vc.sizef() << endl;
if (regex_search(rna, c)) {
if (vc.size() > 2) {
next_seqs = vector<string>(&vc[1], &vc[vc.size()]);
}
else {
next_seqs = vector<string>(1, vc.back());
}
uint j = 0;
// For every regexp match
for (sregex_iterator i = sregex_iterator(rna.begin(), rna.end(), c); i != sregex_iterator(); ++i) {
smatch match = *i;
pos.first = match.position() + offset;
pos.second = pos.first + match.length() - 1;
//cout << "\t\t>Inserting " << vc[j] << " in [" << pos.first << ',' << pos.second << "]" << endl;
// +5 because HL < 3 pbs but not for CaRNAval or Contacts
// if CaRNAval or Contacts +2 is better
if (pos.second - offset + 5 >= rna.length()) {
//cout << "\t\t... but we cannot place the next components : Ignored." << endl;
continue;
}
next_ones = find_next_ones_in(rna.substr(pos.second - offset + 5), pos.second + 5, next_seqs);
if (!next_ones.size()) {
// cout << "\t\t... but we cannot place the next components : Ignored.2" << endl;
continue;
}
//cout << endl;
for (vector<Component> v : next_ones) // For every combination of the next components
{
// Combine the match for this component pos with the combination
// of next_ones as a whole solution
vector<Component> r;
r.push_back(Component(pos));
for (Component& c : v) r.push_back(c);
results.push_back(r);
}
j++;
}
}
} else {
// Only one more component to find
if (regex_search(rna, c)) {
// For each regexp match
for (sregex_iterator i = sregex_iterator(rna.begin(), rna.end(), c); i != sregex_iterator(); ++i) {
smatch match = *i;
pos.first = match.position() + offset;
pos.second = pos.first + match.length() - 1;
//cout << "\t\t>Inserting " << vc[0] << " in [" << pos.first << ',' << pos.second << "]" << endl;
// Create a vector of component with one component for that match
vector<Component> r;
r.push_back(Component(pos));
results.push_back(r);
}
}
}
return results;
}
vector<vector<Component>> json_find_next_ones_in(string rna, uint offset, vector<string>& vc)
{
pair<uint, uint> pos;
vector<vector<Component>> results;
vector<vector<Component>> next_ones;
vector<string> next_seqs;
regex c(vc[0]);
//cout << "\t\t>Searching " << vc[0] << " in " << rna << endl;
if (vc.size() > 1) {
//cout << "size vc: " << vc.size() << endl;
if (regex_search(rna, c)) {
// Define a vector with the remaining components to insert behind vc[0]
if (vc.size() > 2) {
next_seqs = vector<string>(&vc[1], &vc[vc.size()]);
}
else {
next_seqs = vector<string>(1, vc.back());
}
uint j = 0;
// For every regexp match
for (sregex_iterator i = sregex_iterator(rna.begin(), rna.end(), c); i != sregex_iterator(); ++i) {
//retrieve the hit position
smatch match = *i;
pos.first = match.position() + offset;
pos.second = pos.first + match.length() - 1;
//cout << "\t\t>Inserting " << vc[j] << " in [" << pos.first << ',' << pos.second << "]" << endl;
// Check if we can insert the next components after this hit for vc[0]
// +5 because HL < 3 pbs but not for CaRNAval or Contacts
// if CaRNAval or Contacts +2 is better
if (pos.second - offset + 2 >= rna.length()) {
//cout << "\t\t... but we cannot place the next components : Ignored." << endl;
continue;
}
next_ones = json_find_next_ones_in(rna.substr(pos.second - offset + 2), pos.second + 2, next_seqs);
if (!next_ones.size()) {
// cout << "\t\t... but we cannot place the next components : Ignored.2" << endl;
continue;
}
//cout << endl;
for (vector<Component> v : next_ones) // For every combination of the next components
if (pos.second - offset + Motif::delay >= rna.length()) break; // no room for the next components. give up.
next_ones = find_next_ones_in(rna.substr(pos.second - offset + Motif::delay), pos.second + Motif::delay, next_seqs);
if (!next_ones.size()) break; // no matches for the next components. give up.
// For every solution for the next components, combine this solution for vc[0] (pos) with the one for the following vc[i] (v)
for (vector<Component> v : next_ones)
{
// Combine the match for this component pos with the combination
// of next_ones as a whole solution
vector<Component> r;
r.push_back(Component(pos));
for (Component& c : v) r.push_back(c);
r.push_back(Component(pos)); // solution for vc[0]
for (Component& c : v) r.push_back(c); // solutions for the following vc[i]
results.push_back(r);
}
j++;
}
}
} else {
......@@ -790,8 +483,6 @@ vector<vector<Component>> json_find_next_ones_in(string rna, uint offset, vector
pos.first = match.position() + offset;
pos.second = pos.first + match.length() - 1;
//cout << "\t\t>Inserting " << vc[0] << " in [" << pos.first << ',' << pos.second << "]" << endl;
// Create a vector of component with one component for that match
vector<Component> r;
r.push_back(Component(pos));
......
......@@ -7,6 +7,7 @@
#include <vector>
#include <filesystem>
#include "rna.h"
#include "json.hpp"
using boost::filesystem::path;
using std::pair;
......@@ -14,6 +15,8 @@ using std::string;
using std::vector;
using std::mutex;
typedef enum { RNA3DMOTIF = 1, CSV = 2, CARNAVAL = 3, JSON = 4 } source_type;
typedef nlohmann::detail::iter_impl<nlohmann::basic_json<> > json_elem;
typedef struct Comp_ {
......@@ -39,18 +42,17 @@ typedef struct Link
class Motif
{
public:
public:
Motif(void);
Motif(string csv_line);
Motif(const vector<Component>& v, string PDB);
Motif(const vector<Component>& v, string id, size_t contacts, double tx_occurrences);
Motif(const vector<Component>& v, string name);
Motif(const vector<Component>& v, string name, string& struc);
Motif(const vector<Component>& v, path rinfile, uint id, bool reversed);
// Motif(string path, int id); //full path to biorseo/data/modules/RIN/Subfiles/
Motif(string path, int id); //full path to biorseo/data/modules/RIN/Subfiles/
static char is_valid_RIN(const string& rinfile);
static char is_valid_DESC(const string& descfile);
static vector<pair<uint,char>> is_valid_JSON(const string& jsonfile);
static char is_valid_RIN(const string& rinfile);
static char is_valid_DESC(const string& descfile);
static char is_valid_JSON(const json_elem& i);
string pos_string(void) const;
string sec_struct(void) const;
......@@ -64,39 +66,27 @@ class Motif
double tx_occurrences_;
double score_;
bool reversed_;
private:
string carnaval_id; // if source = CARNAVAL
string atlas_id; // if source = RNAMOTIFATLAS
string PDBID; // if source = RNA3DMOTIF
string contacts_id; // if source = CONTACTS
bool is_model_; // Whether the motif is a model or an extracted module from a 3D structure
enum { RNA3DMOTIF = 1, RNAMOTIFATLAS = 2, CARNAVAL = 3, CONTACTS = 4 } source_;
static uint delay;
// delay is the minimal shift between end of a component and begining of the next.
// For regular loop motifs, it should be at least 5 (because hairpins cannot be of size smaller than 5).
// For the general case, it could be zero, but solutions will look dirty...
// Higher values reduce combinatorial explosion of potential insertion sites.
private:
string id_;
source_type source_;
};
bool is_desc_insertible(const string& descfile, const string& rna);
bool is_rin_insertible(const string& rinfile, const string& rna);
bool is_json_insertible(const string& jsonfile, const string& rna);
bool check_motif_ss(string);
bool check_motif_sequence(string);
vector<Motif> load_txt_folder(const string& path, const string& rna, bool verbose);
vector<Motif> load_desc_folder(const string& path, const string& rna, bool verbose);
vector<Motif> load_csv(const string& path);
vector<Motif> load_json_folder(const string& path, const string& rna, bool verbose);
vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string>& vc);
vector<vector<Component>> json_find_next_ones_in(string rna, uint offset, vector<string>& vc);
// utilities for Json motifs
size_t count_nucleotide(string&);
size_t count_delimiter(string&);
size_t count_contacts(string&);
string check_motif_sequence(string);
bool checkSecondaryStructure(string);
vector<Link> build_motif_pairs(string&, vector<Component>&);
uint find_max_occurrences(string&);
uint find_max_sequence(string&);
vector<string> find_components(string&, string);
vector<uint> find_contacts(vector<string>&, vector<Component>&);
vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string> vc);
// utilities to compare secondary structures:
bool operator==(const Motif& m1, const Motif& m2);
......
......@@ -3,16 +3,12 @@
#include <algorithm>
#include <boost/format.hpp>
#define RESET "\033[0m"
#define RED "\033[31m" /* Red */
using std::abs;
using std::cout;
using std::endl;
SecondaryStructure::SecondaryStructure() {}
SecondaryStructure::SecondaryStructure(const RNA& rna)
: objective_scores_(vector<double>(2)), n_(rna.get_RNA_length()), nBP_(0), rna_(rna)
{
......@@ -21,8 +17,6 @@ SecondaryStructure::SecondaryStructure(const RNA& rna)
SecondaryStructure::SecondaryStructure(bool empty) : rna_(RNA()) { is_empty_structure = empty; }
string SecondaryStructure::to_DBN(void) const
{
......@@ -100,26 +94,6 @@ string SecondaryStructure::to_DBN(void) const
return res;
}
string structure_with_contacts(const SecondaryStructure& ss) {
string sequence = ss.rna_.get_seq();
string construct = "";
bool flag;
for (uint i = 0; i < sequence.size(); i++) {
flag = false;
for (const Motif& m : ss.motif_info_) {
for (uint j = 0; j < m.pos_contacts.size(); j++) {
if (m.pos_contacts[j] == i) flag = true;
}
}
if (flag) {
construct += "*";
} else {
construct += ".";
}
}
return construct;
}
string SecondaryStructure::to_string(void) const
{
string s;
......@@ -141,35 +115,11 @@ void SecondaryStructure::set_basepair(uint i, uint j)
void SecondaryStructure::insert_motif(const Motif& m) { motif_info_.push_back(m); }
void colored_contacts(string sequence, vector<Motif> motif_info_) {
bool flag;
for (uint i = 0; i < sequence.size(); i++) {
flag = false;
for (const Motif& m : motif_info_) {
for (uint j = 0; j < m.pos_contacts.size(); j++) {
if (m.pos_contacts[j] == i) flag = true;
}
}
if (flag) {
cout << RED << sequence[i] << RESET;
} else {
cout << sequence[i];
}
}
}
void SecondaryStructure::print(void) const
{
cout << endl;
cout << '\t';
colored_contacts(rna_.get_seq(), motif_info_);
//rna_.get_seq()
cout << endl;
cout << endl << '\t' << rna_.get_seq() << endl;
string ss = to_string();
cout << '\t';
colored_contacts(ss, motif_info_);
//cout << ss;
cout << endl;
cout << '\t' << ss << endl;
for (const Motif& m : motif_info_) {
uint i = 0;
cout << '\t';
......@@ -324,7 +274,6 @@ bool operator<=(const SecondaryStructure& s1, const SecondaryStructure& s2)
return false;
}
bool operator==(const SecondaryStructure& s1, const SecondaryStructure& s2)
{
// Checks wether the secondary structures are exactly the same, including the inserted motifs.
......
......@@ -57,7 +57,4 @@ inline void SecondaryStructure::set_objective_score(int i, double s) { objecti
inline uint SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); }
inline uint SecondaryStructure::get_n_bp(void) const { return nBP_; }
string structure_with_contacts(const SecondaryStructure& ss);
#endif // SECONDARY_STRUCTURE_
\ No newline at end of file
......
......@@ -79,12 +79,10 @@ int main(int argc, char* argv[])
("jsonfolder,j", po::value<string>(&motifs_path_name), "A folder containing a custom motif library in .json format")
("pre-placed,x", po::value<string>(&motifs_path_name), "A CSV file providing motif insertion sites obtained with another tool.")
("function,f", po::value<char>(&obj_function_nbr)->default_value('B'),
"(A, B, C, D, E or F) Objective function to score module insertions:\n"
"(A, B, C, or D) Objective function to score module insertions:\n"
" (A) insert big modules\n (B) light, high-order modules\n"
" (C) well-scored modules\n (D) light, high-order, well-scored\n modules\n"
" (E, F) insert big modules with many\n contacts with proteins, different\n ponderations.\n"
" C and D require position scores\n provided by --pre-placed.\n"
" E and F require protein-contact\n information and should be\n used only with --jsonfolder.")
" C and D require position scores\n provided by --pre-placed.\n")
("mfe,E", "Minimize stacking energies\n (leads to MFE extimator)")
("mea,A", "(default) Maximize expected accuracy\n (leads to MEA estimator)")
("first-objective,c", po::value<unsigned int>(&MOIP::obj_to_solve_)->default_value(2),
......@@ -108,8 +106,8 @@ int main(int argc, char* argv[])
<< "Bio-objective integer linear programming framework to predict RNA secondary structures by including known RNA modules." << endl
<< "Developped by Louis Becquey, 2018-2021\nLénaïc Durand, 2019\nNathalie Bernard, 2021" << endl << endl
<< "Usage:\tYou must provide:\n\t1) a FASTA input file with -s," << endl
<< "\t2) a module type with --rna3dmotifs, --carnaval, --contacts or --pre-placed," << endl
<< "\t3) one module-based scoring function with --func A, B, C, D, E or F," << endl
<< "\t2) a module type with --rna3dmotifs, --carnaval, --json or --pre-placed," << endl
<< "\t3) one module-based scoring function with --func A, B, C, or D" << endl
<< "\t4) one energy-based scoring function with --mfe or --mea," << endl
<< "\t5) how to display results: in console (-v), or in a result file (-o)." << endl
<< endl
......@@ -154,19 +152,22 @@ int main(int argc, char* argv[])
return EXIT_FAILURE;
}
/* FIND PARETO SET */
string source;
Motif::delay = 1;
if (vm.count("rinfolder"))
source = "rinfolder";
else if (vm.count("descfolder"))
else if (vm.count("descfolder")) {
source = "descfolder";
Motif::delay = 5;
}
else if (vm.count("jsonfolder"))
source = "jsonfolder";
else if (vm.count("pre-placed"))
source = "csvfile";
else
cerr << "ERR: no source of modules provided !" << endl;
/* FIND PARETO SET */
MOIP myMOIP = MOIP(myRNA, source, motifs_path_name.c_str(), theta_p_threshold, verbose);
double min, max;
......@@ -243,11 +244,8 @@ int main(int argc, char* argv[])
outfile.open(outputName);
outfile << fa->name() << endl << fa->seq() << endl;
for (uint i = 0; i < myMOIP.get_n_solutions(); i++) {
outfile << myMOIP.solution(i).to_string() << endl << structure_with_contacts(myMOIP.solution(i)) << endl;
string str1 = myMOIP.solution(i).to_string();
}
for (uint i = 0; i < myMOIP.get_n_solutions(); i++)
outfile << myMOIP.solution(i).to_string() << endl;
outfile.close();
}
......
No preview for this file type
......@@ -50,6 +50,7 @@ RNA::RNA(string name, string seq, bool verbose)
int max_bp_span = 100;
float cutoff = 1e-6;
vrna_ep_t* results = vrna_pfl_fold(cseq, window_size, max_bp_span, cutoff);
vrna_ep_t* save = results; // keep the pointer to free it later
if (results != NULL)
{
......@@ -95,7 +96,9 @@ RNA::RNA(string name, string seq, bool verbose)
}
}
}
// Free memory allocated by ViennaRNA
free(save);
}
else cerr << "NULL result returned by vrna_pfl_fold" << endl;
......