Big cleanup for JSON format support

Louis BECQUEY
Commit 89aa05565af08c7b5a42acfd2542a0ce6183bd23 89aa0556 1 parent 3b2bbb99
Showing 11 changed files with 297 additions and 1479 deletions
Isaure_benchmark.py
Makefile
cppsrc/MOIP.cpp
cppsrc/MOIP.h
cppsrc/Motif.cpp
cppsrc/Motif.h
cppsrc/SecondaryStructure.cpp
cppsrc/SecondaryStructure.h
cppsrc/biorseo.cpp
cppsrc/program
cppsrc/rna.cpp
--- a/Isaure_benchmark.py deleted 100644 → 0
View file @3b2bbb9
+++ b/Isaure_benchmark.py deleted 100644 → 0
View file @3b2bbb9
- import time
- import subprocess
- import os
- import os.path
- from math import sqrt, ceil
- import numpy as np
- import matplotlib.pyplot as plt
- 
- log_path = "test.log"
- log = open(log_path, 'a')
- 
- def run_test(cmd, log):
-     log.write(time.asctime(time.localtime(time.time())) + " : Run process \"" + cmd + "\"\n")
-     log.flush()
-     process = subprocess.Popen(cmd.split(' ') ,shell=False,stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-     # Poll process.stdout to show stdout live
-     while process.poll() is None:
-       output = process.stdout.readline()
-       if output:
-         log.write(output.decode())
-         log.flush()
-     rc = process.poll()
- 
- 
- #create the command line to run BiORSEO with the modules library of CaRNAval
- def create_command_rin(path, name, function, estimator):
-     cmd = ("python3 " + path + "/biorseo.py -i " +
-       path + "/data/fasta/" +
-       name + ".fa  " +
-       "-O results/ " +
-       "--carnaval " +
-       "--patternmatch " +
-       "--func " + function + " --" + estimator + " -v " +
-       " --biorseo-dir " + path + " " +
-       "--modules-path " + path + "/data/modules/RIN/Subfiles")
-     return cmd
- 
- #create the command line to run BiORSEO with the modules library of RNA3dMotifs Atlas
- def create_command_bgsu(path, name, function, estimator):
-     cmd = ("python3 " + path + "/biorseo.py -i " +
-       path + "/data/fasta/" +
-       name + ".fa  " +
-       "-O results/ " +
-       "--3dmotifatlas " +
-       "--jar3d " +
-       "--func " + function + " --" + estimator + " -v " +
-       "--jar3d-exec " + " /local/local/localopt/jar3d_2014-12-11.jar" +
-       " --biorseo-dir " + path + " " +
-       "--modules-path " + path + "/data/modules/BGSU")
-     return cmd
- 
- #create the command line to run BiORSEO with the modules library of RNA3dMotifs
- def create_command_desc(path, name, function, estimator):
-     cmd = ("python3 " + path + "/biorseo.py -i " +
-       path + "/data/fasta/" +
-       name + ".fa  " +
-       "-O results/ " +
-       "--rna3dmotifs " +
-       "--patternmatch " +
-       "--func " + function + " --" + estimator + " -v " +
-       " --biorseo-dir " + path + " " +
-       "--modules-path " + path + "/data/modules/DESC")
-     return cmd
- 
- #create the command line to run BiORSEO with the motifs library of Isaure in .json
- def create_command_isaure(path, name, function, estimator):
-     cmd = ("python3 " + path + "/biorseo.py -i " +
-       path + "/data/fasta/" +
-       name + ".fa  " +
-       "-O results/ " +
-       "--contacts " +
-       "--patternmatch " +
-       "--func " + function + " --" + estimator +
-       " --biorseo-dir " + path + " " +
-       "--modules-path " + path + "/data/modules/ISAURE/bibliotheque_a_lire")
-     return cmd
- 
- #execute the command line correspondin to the information put in the argument.
- def execute_command(path, function, estimator, true_ctc, true_str, list_ctc, list_str, modules):
-     if (modules == 'desc'):
-         cmd = create_command_desc(path, name, function, estimator)
-     elif (modules == 'bgsu'):
-         cmd = create_command_bgsu(path, name, function, estimator)
-     elif (modules == 'rin'):
-         cmd = create_command_rin(path, name, function, estimator)
-     elif (modules == 'isaure'):
-         cmd = create_command_isaure(path, name, function, estimator)
- 
-     os.system(cmd)
- 
-     """file_path = "results/test_" + name + ".json_pm" + function + "_" + estimator
-     if os.path.isfile(file_path):
-         tab = write_mcc_in_file(name, true_ctc, true_str, estimator, function)
-         list_ctc.append(tab[0])
-         list_str.append(tab[1])"""
- 
- #Retrieves the list of structures predicted by Biorseo for each sequence in the benchmark.txt file
- def get_list_str_by_seq(name, estimator, function, list_str, true_str, modules):
-     if modules == 'bgsu':
-         extension = ".jar3d"
-     elif modules == 'desc':
-         extension = ".desc_pm"
-     elif modules == 'rin':
-         extension = ".rin_pm"
-     elif modules == 'json':
-         extension = ".json_pm"
-     file_path = "results/test_" + name + extension + function + "_" + estimator
-     if os.path.isfile(file_path):
-         path_benchmark = "data/modules/ISAURE/benchmark.txt"
-         max_mcc = get_mcc_structs_max(path_benchmark, name, estimator, function, extension, modules, true_str)
-         list_str.append(max_mcc)
- 
- # ================== Code from Louis Becquey Benchark.py ==============================
- def dbn_to_basepairs(structure):
-     parenthesis = []
-     brackets = []
-     braces = []
-     rafters = []
-     basepairs = []
-     As = []
-     Bs = []
-     for i, c in enumerate(structure):
-         if c == '(':
-             parenthesis.append(i)
-         if c == '[':
-             brackets.append(i)
-         if c == '{':
-             braces.append(i)
-         if c == '<':
-             rafters.append(i)
-         if c == 'A':
-             As.append(i)
-         if c == 'B':
-             Bs.append(i)
-         if c == '.':
-             continue
-         if c == ')':
-             basepairs.append((i, parenthesis.pop()))
-         if c == ']':
-             basepairs.append((i, brackets.pop()))
-         if c == '}':
-             basepairs.append((i, braces.pop()))
-         if c == '>':
-             basepairs.append((i, rafters.pop()))
-         if c == 'a':
-             basepairs.append((i, As.pop()))
-         if c == 'b':
-             basepairs.append((i, Bs.pop()))
-     return basepairs
- 
- def compare_two_contacts(true_ctc, prediction):
-     tp = 0
-     fp = 0
-     tn = 0
-     fn = 0
-     for i in range(len(true_ctc)):
-         if true_ctc[i] == '*' and prediction[i] == '*':
-             tp += 1
-         elif true_ctc[i] == '.' and prediction[i] == '.':
-             tn += 1
-         elif true_ctc[i] == '.' and prediction[i] == '*':
-             fp += 1
-         elif true_ctc[i] == '*' and prediction[i] == '.':
-             fn += 1
-     return [tp, tn, fp, fn]
- 
- def compare_two_structures(true2d, prediction):
-     true_basepairs = dbn_to_basepairs(true2d)
-     pred_basepairs = dbn_to_basepairs(prediction)
-     tp = 0
-     fp = 0
-     tn = 0
-     fn = 0
-     for bp in true_basepairs:
-         if bp in pred_basepairs:
-             tp += 1
-         else:
-             fn += 1
-     for bp in pred_basepairs:
-         if bp not in true_basepairs:
-             fp += 1
-     tn = len(true2d) * (len(true2d) - 1) * 0.5 - fp - fn - tp
-     return [tp, tn, fp, fn]
- 
- def mattews_corr_coeff(tp, tn, fp, fn):
-     if ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) == 0):
-         #print("warning: division by zero! no contact in the prediction")
-         #print("tp: " + str(tp) + " fp: " + str(fp) + " tn: " + str(tn) + " fn: " + str(fn))
-         return -1
-     elif (tp + fp == 0):
-         print("We have an issue : no positives detected ! (linear structure)")
-     return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
- 
- def f1_score(tp, tn, fp, fn):
-     return 2 * tp / (2 * tp + fp + fn)
- 
- def specificity(tp, tn, fp, fn):
-     return tn / (tn + fp)
- 
- # ================== Code from Louis Becquey Benchark.py ==============================
- 
- #Get the best MCC value for all prediction of the results file of the sequence in argument
- def get_mcc_structs_max(path_benchmark, sequence_id, estimator, function, extension, modules, true_structure):
-     read_prd = open("results/test_" + sequence_id + extension + function + "_" + estimator, "r")
-     write = open("results/test_" + sequence_id + ".mcc_" + function + "_" + estimator + "_" + modules, "w")
- 
-     max_mcc_str = -1;
- 
-     title_exp = ">test_" + sequence_id + ": "
-     write.write(title_exp)
-     structure_exp = true_structure
-     write.write("structure 2d attendue:\n" + structure_exp + "\n")
- 
-     title_prd = read_prd.readline()
-     structure_prd = read_prd.readline()
-     sequence_prd = structure_prd
-     while structure_prd:
-         structure_prd = read_prd.readline()
-         if (len(structure_prd) != 0):
-             write.write("\nstructure 2d predite:\n" + structure_prd[:len(sequence_prd)] + "\n")
-             mcc_tab = compare_two_structures(structure_exp, structure_prd[:len(sequence_prd)])
-             mcc_str = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
-             if (max_mcc_str < mcc_str):
-                 max_mcc_str = mcc_str
-             write.write("mcc: " + str(mcc_str) + "\n")
-             contacts_prd = read_prd.readline()
-     write.write("max mcc 2D:" + str(max_mcc_str))
-     read_prd.close()
-     write.close()
-     return max_mcc_str
- 
- #Create a file that store the information concerning the MCC value obtains for each prediction of the
- #sequence in input
- def write_mcc_in_file(sequence_id, true_contacts, true_structure, estimator, function):
-     read_prd = open("results/test_" + sequence_id + ".json_pm" + function + "_" + estimator, "r")
-     write = open("results/test_" + sequence_id + ".mcc_" + function + "_" + estimator, "w")
- 
-     max_mcc_str = -1;
-     max_mcc_ctc = -1;
- 
-     title_exp = ">test_" + sequence_id + ": "
-     write.write(title_exp)
-     contacts_exp = true_contacts
-     structure_exp = true_structure
-     write.write("structure 2d attendue:\n" + structure_exp + "\n")
-     write.write("contacts attendus:\n" + contacts_exp + "\n" + len(structure_exp) * "-")
- 
-     title_prd = read_prd.readline()
-     structure_prd = read_prd.readline()
-     sequence_prd = structure_prd
-     while structure_prd:
-         structure_prd = read_prd.readline()
-         if (len(structure_prd) != 0):
-             write.write("\nstructure 2d predite:\n" + structure_prd[:len(sequence_prd)] + "\n")
-             mcc_tab = compare_two_structures(structure_exp, structure_prd[:len(sequence_prd)])
-             mcc_str = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
-             if (max_mcc_str < mcc_str):
-                 max_mcc_str = mcc_str
-             write.write("mcc: " + str(mcc_str) + "\n")
- 
-             contacts_prd = read_prd.readline()
-             write.write("\ncontacts predits:\n" + contacts_prd)
-             if (len(contacts_prd) == len(contacts_exp)):
-                 mcc_tab = compare_two_contacts(contacts_exp, contacts_prd)
-                 mcc_ctc = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
-                 if (max_mcc_ctc < mcc_ctc):
-                     max_mcc_ctc = mcc_ctc
-                 write.write("mcc: " + str(mcc_ctc) + "\n\n")
-             else:
-                 write.write("mcc: no expected contacts sequence or not same length between expected and predicted\n\n")
-     write.write("max mcc 2D:" + str(max_mcc_str))
-     write.write("max mcc ctc:" + str(max_mcc_ctc))
-     read_prd.close()
-     write.close()
-     return [max_mcc_ctc, max_mcc_str]
- 
- def set_axis_style(ax, labels):
-     ax.xaxis.set_tick_params(direction='out')
-     ax.xaxis.set_ticks_position('bottom')
-     ax.set_xticks(np.arange(1, len(labels) + 1))
-     ax.set_xticklabels(labels)
-     ax.set_xlim(0.25, len(labels) + 0.75)
-     ax.set_xlabel('Sample name')
- 
- #Create one violin plot to see the distribution of the best MCC (of the base pairing)
- def visualization_best_mcc_str(list_struct2d, estimator, function, modules, color, lines_color):
- 
-     print(estimator + " + " + function + ": ")
-     np_struct2d = np.array(list_struct2d)
- 
-     data_to_plot = np_struct2d
-     median_2d = np.median(np_struct2d)
-     print("mediane 2D: " + str(median_2d) + "\n")
- 
-     fig = plt.figure()
- 
-     ax = fig.add_axes([0, 0, 1, 1])
- 
-     labels = ['structure 2D']
-     ax.set_xticks(np.arange(1, len(labels) + 1))
-     ax.set_xticklabels(labels)
-     ax.set_xlabel(function)
-     ax.set_ylabel('MCC')
- 
-     violins = ax.violinplot(data_to_plot, showmedians=True)
- 
-     for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
-         vp = violins[partname]
-         vp.set_edgecolor(lines_color)
-         vp.set_linewidth(1)
- 
-     for v in violins['bodies']:
-         v.set_facecolor(color)
-     plt.savefig('visualisation_28_09' + estimator + '_' + function + '_' + modules + '.png', bbox_inches='tight')
- 
- #Create 4 violin plot to see the distribution of the best MCC (of the base pairing)
- def visualization_best_mcc_str_4_figures(list_struct2d, color, lines_color):
- 
-     np_struct2d_1 = np.array(list_struct2d[0])
-     np_struct2d_2 = np.array(list_struct2d[1])
-     np_struct2d_3 = np.array(list_struct2d[2])
-     np_struct2d_4 = np.array(list_struct2d[3])
- 
-     data_to_plot = [np_struct2d_1, np_struct2d_2, np_struct2d_3, np_struct2d_4]
-     fig = plt.figure()
-     fig.set_size_inches(6, 3)
-     ax = fig.add_axes([0, 0, 1, 1])
- 
-     labels = ['MEA + E', 'MEA + F', 'MFE + E', 'MFE + F']
-     ax.set_xticks(np.arange(1, len(labels) + 1))
-     ax.set_xticklabels(labels)
-     #ax.set_xlim(0.25, len(labels) + 0.75)
-     ax.set_xlabel("motifs d'Isaure")
-     ax.set_ylabel('MCC (en fonction des appariements)')
- 
-     violins = ax.violinplot(data_to_plot, showmedians=True)
- 
-     for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
-         vp = violins[partname]
-         vp.set_edgecolor(lines_color)
-         vp.set_linewidth(1)
- 
-     for v in violins['bodies']:
-         v.set_facecolor(color)
-     plt.savefig('visualisation_28_09_Isaure_E_F.png', dpi=200, bbox_inches='tight')
- 
- #Create 2 violin plot to see the distribution of the best MCC (one for the MCC of
- # the base pairing and one for the contacts)
- def visualization_best_mcc(list_struct2d, list_contacts, estimator, function, modules, color, lines_color):
- 
-     print(estimator + " + " + function + ": ")
-     np_struct2d = np.array(list_struct2d)
-     np_contacts = np.array(list_contacts)
- 
-     data_to_plot = [np_struct2d, np_contacts]
-     median_2d = np.median(np_struct2d)
-     median_ctc = np.median(np_contacts)
-     print("mediane 2D: " + str(median_2d) + "\n")
-     print("mediane ctc: " + str(median_ctc) + "\n")
- 
-     fig = plt.figure()
- 
-     ax = fig.add_axes([0, 0, 1, 1])
- 
-     labels = ['structure 2D', 'contacts']
-     ax.set_xticks(np.arange(1, len(labels) + 1))
-     ax.set_xticklabels(labels)
-     ax.set_xlabel(function)
-     ax.set_ylabel('MCC')
- 
-     violins = ax.violinplot(data_to_plot, showmedians=True)
- 
-     for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
-         vp = violins[partname]
-         vp.set_edgecolor(lines_color)
-         vp.set_linewidth(1)
- 
-     for v in violins['bodies']:
-         v.set_facecolor(color)
-     plt.savefig('visualisation_16_06_' + estimator + '_' + function + '_' + modules + '.png', bbox_inches='tight')
- 
- #Return the list of names of all the sequence in the benchmark.txt, the list of structures and the list
- # of contacts predicted in the result file of each sequence.
- # This function is only use for the result in .json_pm format files
- def get_list_structs_contacts_all(path_benchmark, estimator, function):
-     myfile = open(path_benchmark, "r")
-     list_name = []
- 
-     complete_list_struct2d = []
-     complete_list_contacts = []
- 
-     name = myfile.readline()
-     contacts = myfile.readline()
-     seq = myfile.readline()
-     structure2d = myfile.readline()
-     count = 0
-     while seq:
-         name = name[6:].strip()
-         count = count + 1
-         file_path = "results/test_" + name + ".json_pm" + function + "_" + estimator
-         if os.path.isfile(file_path):
-             file_result = open(file_path, "r")
-             list_struct2d = []
-             list_contacts = []
-             list_name.append(name)
-             title_prd = file_result.readline()
-             structure_prd = file_result.readline()
-             sequence = structure_prd
-             while structure_prd:
-                 structure_prd = file_result.readline()
-                 if (len(structure_prd) != 0):
-                     mcc_tab = compare_two_structures(structure2d, structure_prd[:len(sequence)])
-                     mcc_str = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
-                     list_struct2d.append(mcc_str)
- 
-                     contacts_prd = file_result.readline()
-                     if (len(contacts_prd) == len(contacts)):
-                         mcc_tab = compare_two_contacts(contacts, contacts_prd)
-                         mcc_ctc = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
-                         list_contacts.append(mcc_ctc)
-             complete_list_struct2d.append(list_struct2d)
-             complete_list_contacts.append(list_contacts)
-         name = myfile.readline()
-         contacts = myfile.readline()
-         seq = myfile.readline()
-         structure2d = myfile.readline()
-     return [list_name, complete_list_struct2d, complete_list_contacts]
-     myfile.close()
- 
- #Return the list of names of all the sequence in the benchmark.txt, the list of structures
- # predicted in the result file of each sequence.
- def get_list_structs_all(path_benchmark, estimator, function, modules):
-     if modules == 'bgsu':
-         extension = ".jar3d"
-     elif modules == 'desc':
-         extension = ".desc_pm"
-     elif modules == 'rin':
-         extension = ".rin_pm"
-     elif modules == 'json':
-         extension = ".json_pm"
- 
-     myfile = open(path_benchmark, "r")
-     list_name = []
- 
-     complete_list_struct2d = []
-     complete_list_contacts = []
- 
-     name = myfile.readline()
-     contacts = myfile.readline()
-     seq = myfile.readline()
-     structure2d = myfile.readline()
-     count = 0
-     while seq:
-         name = name[6:].strip()
-         count = count + 1
-         file_path = "results/test_" + name + extension + function + "_" + estimator
-         if os.path.isfile(file_path):
-             file_result = open(file_path, "r")
-             list_struct2d = []
-             list_name.append(name)
-             title_prd = file_result.readline()
-             structure_prd = file_result.readline()
-             sequence = structure_prd
-             while structure_prd:
-                 structure_prd = file_result.readline()
-                 if (len(structure_prd) != 0):
-                     mcc_tab = compare_two_structures(structure2d, structure_prd[:len(sequence)])
-                     mcc_str = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
-                     list_struct2d.append(mcc_str)
- 
-                     contacts_prd = file_result.readline()
-             complete_list_struct2d.append(list_struct2d)
-         name = myfile.readline()
-         contacts = myfile.readline()
-         seq = myfile.readline()
-         structure2d = myfile.readline()
-     return [list_name, complete_list_struct2d]
-     myfile.close()
- 
- #Return the list in argument in two lists, each list containing half of the list in argument
- def get_half(list):
- 
-     first_half = []
-     second_half = []
-     if (len(list) % 2 == 0):
-         middle = len(list) / 2
-     else:
-         middle = len(list) / 2 + 0.5
- 
-     for i in range (int(middle)):
-         first_half.append(list[i])
- 
-     for i in range (int(middle)):
-         if i + int(middle) < len(list):
-          second_half.append(list[i + int(middle)])
- 
-     return [first_half, second_half]
- 
- #Create a boxplot with all the MCC (for the base pairing) obtains with all the prediction for each sequence,
- # divide in two figures.
- def visualization_all_mcc_str(path_benchmark, estimator, function, modules):
- 
-     list_name = get_list_structs_all(path_benchmark, estimator, function, modules)[0]
-     tab_struct2d = get_list_structs_all(path_benchmark, estimator, function, modules)[1]
- 
-     min = 20
-     max = 0
-     max_i = 0
-     min_i = 0
-     for i in range(len(tab_struct2d)):
-         if (len(tab_struct2d[i]) > max):
-             max = len(tab_struct2d[i])
-             max_i = i
-         if (len(tab_struct2d[i]) < min):
-             min = len(tab_struct2d[i])
-             min_i = i
- 
-     print("max: " + list_name[max_i] + " " + str(max) + "     min: " + list_name[min_i] + " " + str(min) + "\n")
- 
-     np_struct2d = np.array(tab_struct2d)
-     size = len(tab_struct2d)
-     list_median_str = []
-     for i in range(size):
-         list_median_str.append(np.median(np_struct2d[i]))
- 
-     all_str = []
-     for i in range(size):
-         for j in range(len(np_struct2d[i])):
-             all_str.append(np_struct2d[i][j])
- 
-     """print("mediane struct" + estimator + " + " + function + " : " + str(np.median(all_str)))
-     print("ecart struct" + estimator + " + " + function + " : " + str(np.std(all_str)) + "\n")"""
- 
-     data = [x for _, x in sorted(zip(list_median_str, tab_struct2d))]
-     boxName = [x for _, x in sorted(zip(list_median_str, list_name))]
- 
-     if (len(data) % 2 == 0):
-         absciss = len(data) / 2
-     else:
-         absciss = len(data) / 2 + 0.5
- 
-     divide_tab_name = get_half(boxName)
-     divide_tab_data = get_half(data)
- 
-     plt.figure(figsize=(15,4),dpi=200)
-     plt.xticks(rotation=90)
-     plt.boxplot(divide_tab_data[0], medianprops=dict(color='black'))
-     for i in range(int(absciss)):
-         y =data[i]
-         x = np.random.normal(1 + i, 0.04, size=len(y))
-         plt.scatter(x, y)
-         plt.xticks(np.arange(1, absciss + 1), divide_tab_name[0])
- 
-     plt.xlabel('nom de la séquence')
-     plt.ylabel('MCC (appariements)')
-     plt.savefig('visualisation_all_mcc_' + estimator + "_" + function + "_" + modules + '.png', bbox_inches='tight')
- 
-     plt.figure(figsize=(15, 4), dpi=200)
-     plt.xticks(rotation=90)
-     plt.boxplot(divide_tab_data[1], medianprops=dict(color='black'))
-     for i in range(len(data)):
-         if i + int(absciss) < len(data):
-             y = data[i + int(absciss)]
-             x = np.random.normal(1 + i, 0.04, size=len(y))
-             plt.scatter(x, y)
-             plt.xticks(np.arange(1, absciss + 1), divide_tab_name[1])
- 
-     plt.xlabel('nom de la séquence')
-     plt.ylabel('MCC')
-     plt.savefig('visualisation_all_mcc_' + estimator + "_" + function + "_" + modules + '_2.png', bbox_inches='tight')
- 
- # Create a boxplot with all the MCC (for the base pairing and the contacts) obtains with all
- # the prediction for each sequence, divide in two figures.
- def visualization_all_mcc(path_benchmark, estimator, function, modules):
- 
-     list_name = get_list_structs_contacts_all(path_benchmark, estimator, function)[0]
-     tab_struct2d = get_list_structs_contacts_all(path_benchmark, estimator, function)[1]
-     tab_contacts = get_list_structs_contacts_all(path_benchmark, estimator, function)[2]
- 
-     min = 20
-     max = 0
-     max_i = 0
-     min_i = 0
-     for i in range(len(tab_struct2d)):
-         if (len(tab_struct2d[i]) > max):
-             max = len(tab_struct2d[i])
-             max_i = i
-         if (len(tab_struct2d[i]) < min):
-             min = len(tab_struct2d[i])
-             min_i = i
- 
-     print("max: " + list_name[max_i] + " " + str(max) + "     min: " + list_name[min_i] + " " + str(min) + "\n")
- 
-     np_struct2d = np.array(tab_struct2d)
-     size = len(tab_struct2d)
-     list_median_str = []
-     for i in range(size):
-         list_median_str.append(np.median(np_struct2d[i]))
- 
-     all_str = []
-     for i in range(size):
-         for j in range(len(np_struct2d[i])):
-             all_str.append(np_struct2d[i][j])
- 
-     """print("mediane struct" + estimator + " + " + function + " : " + str(np.median(all_str)))
-     print("ecart struct" + estimator + " + " + function + " : " + str(np.std(all_str)) + "\n")"""
- 
-     data = [x for _, x in sorted(zip(list_median_str, tab_struct2d))]
-     boxName = [x for _, x in sorted(zip(list_median_str, list_name))]
- 
-     if (len(data) % 2 == 0):
-         absciss = len(data) / 2
-     else:
-         absciss = len(data) / 2 + 0.5
- 
-     divide_tab_name = get_half(boxName)
-     divide_tab_data = get_half(data)
- 
-     plt.figure(figsize=(15,4),dpi=200)
-     plt.xticks(rotation=90)
-     plt.boxplot(divide_tab_data[0], medianprops=dict(color='black'))
-     for i in range(int(absciss)):
-         y =data[i]
-         x = np.random.normal(1 + i, 0.04, size=len(y))
-         plt.scatter(x, y)
-         plt.xticks(np.arange(1, absciss + 1), divide_tab_name[0])
- 
-     plt.xlabel('nom de la séquence')
-     plt.ylabel('MCC (appariements)')
-     plt.savefig('visualisation_128arn_structure2d_' + estimator + "_" + function + "_" + modules + '.png', bbox_inches='tight')
- 
-     plt.figure(figsize=(15, 4), dpi=200)
-     plt.xticks(rotation=90)
-     plt.boxplot(divide_tab_data[1], medianprops=dict(color='black'))
-     for i in range(len(data)):
-         if i + int(absciss) < len(data):
-             y = data[i + int(absciss)]
-             x = np.random.normal(1 + i, 0.04, size=len(y))
-             plt.scatter(x, y)
-             plt.xticks(np.arange(1, absciss + 1), divide_tab_name[1])
- 
-     plt.xlabel('nom de la séquence')
-     plt.ylabel('MCC')
-     plt.savefig('visualisation_128arn_structure2d_' + estimator + "_" + function + "_" + modules + '_2.png', bbox_inches='tight')
- 
-     np_contacts = np.array(tab_contacts)
-     size = len(tab_contacts)
-     list_median_ctc = []
-     for i in range(size):
-         list_median_ctc.append(np.median(np_contacts[i]))
- 
-     all_ctc = []
-     for i in range(size):
-         for j in range(len(np_contacts[i])):
-             all_ctc.append(np_contacts[i][j])
- 
-     """print("mediane ctc" + estimator + " + " + function + " : " + str(np.median(all_ctc)))
-     print("ecart ctc" + estimator + " + " + function + " : " + str(np.std(all_ctc)) + "\n")"""
- 
-     data = [x for _, x in sorted(zip(list_median_ctc, tab_contacts))]
-     boxName = [x for _, x in sorted(zip(list_median_ctc, list_name))]
- 
-     if (len(data) % 2 == 0) :
-         absciss = len(data)/2
-     else :
-         absciss = len(data)/2 + 0.5
- 
-     divide_tab_name = get_half(boxName)
-     divide_tab_data = get_half(data)
- 
-     plt.figure(figsize=(15, 4), dpi=200)
-     plt.xticks(rotation=90)
-     plt.boxplot(divide_tab_data[0], medianprops=dict(color='black'))
-     for i in range(int(absciss)):
-         y = data[i]
-         x = np.random.normal(1 + i, 0.04, size=len(y))
-         plt.scatter(x, y)
-         plt.xticks(np.arange(1, absciss + 1), divide_tab_name[0])
- 
-     plt.xlabel('nom de la séquence')
-     plt.ylabel('MCC (contacts)')
-     plt.savefig('visualisation_128arn_contacts_' + estimator + "_" + function + '.png', bbox_inches='tight')
- 
-     plt.figure(figsize=(15, 4), dpi=200)
-     plt.xticks(rotation=90)
-     plt.boxplot(divide_tab_data[1], medianprops=dict(color='black'))
-     for i in range(len(data)):
-         if i + int(absciss) < len(data) :
-             y = data[i + int(absciss)]
-             x = np.random.normal(1 + i, 0.04, size=len(y))
-             plt.scatter(x, y)
-             plt.xticks(np.arange(1, absciss + 1), divide_tab_name[1])
- 
-     plt.xlabel('nom de la séquence')
-     plt.ylabel('MCC')
-     plt.savefig('visualisation_128arn_contacts_' + estimator + "_" + function + '_2.png', bbox_inches='tight')
- 
- # Most of those commands lines and this script work only for the benchmark_16-07-2021.json
- # file provides by Isaure Chauvot de Beauchene. This script is means to automize the execution
- # of BiORSEO for each sequence and the creation of figures.
- 
- # after compiling create_file.cpp in 'scripts' directory (compiling once is enough) :
- # clang++ create_files.cpp -o create
- #cmd = ("scripts/create")
- 
- # after compiling create_file.cpp in 'scripts' directory (compiling once is enough) :
- # clang++ add_delimiter.cpp -o addDelimiter
- #cmd0 = ("cppsrc/Scripts/addDelimiter")
- 
- # after compiling count_pattern.cpp in 'scripts' directory (compiling once is enough) :
- # clang++ count_pattern.cpp -o countPattern
- #cmd1 = ("cppsrc/Scripts/countPattern")
- 
- myfile = open("data/modules/ISAURE/benchmark.txt", "r")
- name = myfile.readline()
- contacts = myfile.readline()
- seq = myfile.readline()
- structure2d = myfile.readline()
- 
- list_struct2d = [[],[],[],[]]
- 
- # source path to the directory (for my computer)
- path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo"
- path2 = "/local/local/BiorseoNath"
- while seq:
-     name = name[6:].strip()
-     print(name)
- 
-     # after compiling delete_same_pdb.cpp in 'scripts' directory (compiling once is enough) :
-     # clang++ delete_same_pdb.cpp -o deletePdb
-     #cmd2 = ("cppsrc/Scripts/deletePdb " + name)
-     #os.system(cmd2)
- 
-     """get_list_str_by_seq(name, 'MEA', 'E', list_struct2d[0], structure2d, 'json')
-     get_list_str_by_seq(name, 'MEA', 'F', list_struct2d[1], structure2d, 'json')
-     get_list_str_by_seq(name, 'MFE', 'E', list_struct2d[2], structure2d, 'json')
-     get_list_str_by_seq(name, 'MFE', 'F', list_struct2d[3], structure2d, 'json')"""
- 
-     name = myfile.readline()
-     contacts = myfile.readline()
-     seq = myfile.readline()
-     structure2d = myfile.readline()
- 
- #visualization_best_mcc_str_4_figures(list_struct2d, 'red', '#900C3F')
- 
- """visualization_best_mcc(list_struct2d_A_MFE, list_contacts_A_MFE, 'MFE', 'A', 'rin', 'red', '#900C3F')
- visualization_best_mcc(list_struct2d_B_MFE, list_contacts_B_MFE, 'MFE', 'B', 'rin', 'blue', '#0900FF')
- visualization_best_mcc(list_struct2d_A_MEA, list_contacts_A_MEA, 'MEA', 'A', 'rin', 'red', '#900C3F')
- visualization_best_mcc(list_struct2d_B_MEA, list_contacts_B_MEA, 'MEA', 'B', 'rin', 'blue', '#0900FF')"""
- 
- 
- myfile.close()
- 
- path_benchmark = "data/modules/ISAURE/benchmark.txt"
- visualization_all_mcc_str(path_benchmark, 'MEA', 'C', 'bgsu')
- visualization_all_mcc_str(path_benchmark, 'MEA', 'D', 'bgsu')
- visualization_all_mcc_str(path_benchmark, 'MFE', 'C', 'bgsu')
- visualization_all_mcc_str(path_benchmark, 'MFE', 'D', 'bgsu')
\ No newline at end of file
--- a/Makefile
View file @89aa055
+++ b/Makefile
View file @89aa055
@@ -40,12 +40,9 @@ re: remove clean all
 .PHONY: clean
 clean:
 	$(rm) $(OBJECTS)
- 	$(rm) doc/supplementary_material.bbl doc/supplementary_material.blg doc/supplementary_material.synctex.gz doc/supplementary_material.log doc/supplementary_material.aux
- 	$(rm) doc/main_bioinformatics.bbl doc/main_bioinformatics.blg doc/main_bioinformatics.synctex.gz doc/main_bioinformatics.log doc/main_bioinformatics.aux doc/OUP_First_SBk_Bot_8401-eps-converted-to.pdf
 	@echo -e "\033[00;32mCleanup completed.\033[00m"
 
 .PHONY: remove
 remove:
 	@$(rm) $(BINDIR)/$(TARGET)
- 	@$(rm) doc/main_bioinformatics.pdf doc/supplementary_material.pdf
 	@echo -e "\033[00;32mExecutable and docs removed!\033[00m"
--- a/cppsrc/MOIP.cpp
View file @89aa055
+++ b/cppsrc/MOIP.cpp
View file @89aa055
@@ -59,7 +59,7 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool 
 {
     if (!exists(source_path))
     {
-         cerr << "!!! Hmh, i can't find that folder: " << source_path << endl;
+         cerr << "ERR: Hmh, i can't find this: " << source_path << endl;
         exit(EXIT_FAILURE);
     }
 
@@ -112,7 +112,7 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool 
 
     if (verbose_) cout << "\t> Looking for insertion sites..." << endl;
 
-     if (source == "jar3dcsv" or source == "bayespaircsv")
+     if (source == "csvfile")
     {
         std::ifstream motifs;
         string        line;
@@ -182,7 +182,7 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool 
             accepted++;
             if (is_desc_insertible(it.path().string(), rna_.get_seq()))
             {
-                 args_of_parallel_func args(it.path(), posInsertionSites_access);
+                 file_and_mutex args(it.path(), posInsertionSites_access);
                 inserted++;
                 pool.push(bind(&MOIP::allowed_motifs_from_desc, this, args)); // & is necessary to get the pointer to a member function
             }
@@ -201,7 +201,6 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool 
     {
         mutex         posInsertionSites_access;
         Pool          pool;
-         size_t        inserted = 0;
         size_t        accepted = 0;
         size_t        errors   = 0;
         int           num_threads = thread::hardware_concurrency() - 1;
@@ -231,8 +230,7 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool 
                 continue;
             }
             accepted++;
-             args_of_parallel_func args(it.path(), posInsertionSites_access);
-             inserted++;
+             file_and_mutex args(it.path(), posInsertionSites_access);
             pool.push(bind(&MOIP::allowed_motifs_from_rin, this, args)); // & is necessary to get the pointer to a member function
         }
         pool.done();
@@ -241,58 +239,61 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool 
             thread_pool.at(i).join();
 
         if (verbose){
-             cout << "\t> " << inserted << " candidate RINs on " << accepted + errors << " (" << errors << " ignored motifs), " << endl;
-             cout << "\t  " << insertion_sites_.size() << " insertion sites kept after applying probability threshold of " << theta << endl;
+             cout << "\t> " <<  insertion_sites_.size() << " candidate RINs on " << accepted + errors << " (" << errors << " ignored motifs), after applying probability threshold of " << theta << endl;
         }
     }
     else if (source == "jsonfolder")
     {
-         mutex         posInsertionSites_access;
-         Pool          pool;
-         size_t        inserted = 0;
-         size_t        accepted = 0;
-         size_t        errors   = 0;
+         mutex   posInsertionSites_access;
+         Pool    pool;
+         size_t  accepted = 0;
+         size_t  errors   = 0;
+         int     num_threads = thread::hardware_concurrency() - 1;
+         vector<thread> thread_pool;
+         std::ifstream  motif;
 
-         // Read every JSON files
-         vector<pair<uint,char>> errors_id;
-         for (auto it : recursive_directory_range(source_path))
-         {
-             errors_id = Motif::is_valid_JSON(it.path().string());
-             if (!(errors_id.empty())) // Returns error if JSON file is incorrect
-             { 
-                 for(uint j = 0; j < errors_id.size(); j++)
+         // prepare a pool of threads to process motifs in parallel
+         for (int i = 0; i < num_threads; i++) 
+             thread_pool.push_back(thread(&Pool::infinite_loop_func, &pool));
+ 
+         // Read every JSON entry and add it to the queue (iff valid)
+         motif = std::ifstream(source_path);
+         json js = json::parse(motif);
+         char error;
+         for(auto it = js.begin(); it != js.end(); ++it) {
+             if ((error = Motif::is_valid_JSON(it))) // Returns error if JSON entry is incorrect
+             {
+                 if (verbose)
                 {
-                     if(verbose) {
-                         cerr << "\t> Ignoring JSON " << errors_id[j].first;
-                         uint error = errors_id[j].second;
-                         switch (error)
-                         {
-                             case 'l': cerr << ", too short to be considered."; break;
-                             case 'x': cerr << ", sequence and secondary structure are of different size."; break;
-                             case 'd' : cerr << ", missing header."; break;
-                             case 'e' : cerr << ", sequence is empty."; break;
-                             case 'f' : cerr << ", 2D is empty."; break;
-                             case 'n' : cerr << ", brackets are not balanced."; break;
-                             case 'k' : cerr << ", a component is too small and got removed."; break;
-                             case 'a' : cerr << ", the number of components is different between contacts and sequence"; break;
-                             case 'b' : cerr << ", the number of nucleotides is different between contacts and sequence"; break;
-                             default: cerr << ", unknown reason";
-                             
-                         }
-                         cerr << endl;
+                     cerr << "\t> Ignoring motif " << it.key(); // field key, i.e. motif identifier
+                     switch (error)
+                     {
+                         case 'l': cerr << ", too short to be considered."; break;
+                         case 'x': cerr << ", sequence and secondary structure are of different size."; break;
+                         case 'e' : cerr << ", sequence is empty."; break;
+                         case 'f' : cerr << ", 2D is empty."; break;
+                         case 'r' : cerr << ", RNA sequence not recognized, only use ACGTU."; break;
+                         case 'n' : cerr << ", brackets are not balanced."; break;
+                         default: cerr << ", unknown reason";
                     }
+                     cerr << endl;
                 }
                 errors++;
+                 continue;
             }
             accepted++;
-             args_of_parallel_func args(it.path(), posInsertionSites_access);
-             inserted++;
-             allowed_motifs_from_json(args, errors_id);
-         }
 
-         if (verbose_){
-             cout << "\t  " << insertion_sites_.size() << " insertion sites kept after applying probability threshold of " << theta << endl;
+             // add a new job to the pool, to run allowed_motifs_from_json(args)
+             motif_and_mutex args(it, posInsertionSites_access);
+             pool.push(bind(&MOIP::allowed_motifs_from_json, this, args)); // & is necessary to get the pointer to a member function
         }
+         pool.done(); // we won't add new jobs
+ 
+         // Wait for jobs to complete
+         for (unsigned int i = 0; i < thread_pool.size(); i++)
+             thread_pool.at(i).join();
+ 
+         if (verbose_) cout << "\t> " <<  insertion_sites_.size() << " candidate motifs on " << accepted + errors << " (" << errors << " ignored motifs), after applying probability threshold of " << theta << endl;
     }
     else
     {
@@ -343,7 +344,6 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool 
     obj1 = IloExpr(env_);
     for (uint i = 0; i < insertion_sites_.size(); i++) {
         IloNum sum_k = 0;
-         IloNum kx2_wx_ax = 0;
 
         switch (obj_function_nbr_) {
         case 'A':
@@ -369,22 +369,9 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool 
             obj1 += IloNum(insertion_sites_[i].comp.size() * insertion_sites_[i].score_ / log2(sum_k)) *
                     insertion_dv_[index_of_first_components[i]];
             break;
- 
-         case 'E':
-             // Fonction f1E 
-             for (const Component& c : insertion_sites_[i].comp) sum_k += c.k;
-             obj1 += IloNum(sum_k * insertion_sites_[i].contact_ * insertion_sites_[i].tx_occurrences_) * insertion_dv_[index_of_first_components[i]] ;
-             break;
- 
-         case 'F':
-             // Fonction f1F
-             for (const Component& c : insertion_sites_[i].comp) sum_k += c.k;
-             kx2_wx_ax += IloNum((sum_k * sum_k * insertion_sites_[i].tx_occurrences_) + insertion_sites_[i].contact_);
-             obj1 += insertion_dv_[index_of_first_components[i]] * IloNum(kx2_wx_ax);
-             break;
- 
         }
     }
+     
     //Stacking energy parameter matrix
     double energy[7][7] = {
                 {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
@@ -396,8 +383,8 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool 
                 {0.0, 0.6, 1.4, 1.5, 0.5, 1.0, 0.3}
             }; 
 
-             obj2 = IloExpr(env_);
-     switch (obj_function2_nbr_) {     
+     obj2 = IloExpr(env_);
+     switch (obj_function2_nbr_) { 
         case 'a':
             // Define the MFE (Minimum Free Energy):
             for (size_t u = 0; u < rna_.get_RNA_length() - 6; u++) {
@@ -409,7 +396,7 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool 
                     }
                 }
             }
-         break;
+             break;
         case 'b':
             // Define the expected accuracy objective function:
             //MEA:
@@ -418,7 +405,7 @@ MOIP::MOIP(const RNA& rna, string source, string source_path, float theta, bool 
                     if (allowed_basepair(u, v)) obj2 += (IloNum(rna_.get_pij(u, v)) * y(u, v));
                 }
             }
-         break;
+             break;
     }
 }
 
@@ -909,7 +896,7 @@ bool MOIP::allowed_basepair(size_t u, size_t v) const
     return true;
 }
 
- void MOIP::allowed_motifs_from_desc(args_of_parallel_func arg_struct)
+ void MOIP::allowed_motifs_from_desc(file_and_mutex arg_struct)
 {
     /*
         Searches where to place some DESC module in the RNA
@@ -1085,7 +1072,7 @@ void MOIP::allowed_motifs_from_desc(args_of_parallel_func arg_struct)
     }
 }
 
- void MOIP::allowed_motifs_from_rin(args_of_parallel_func arg_struct)
+ void MOIP::allowed_motifs_from_rin(file_and_mutex arg_struct)
 {
     // Searches where to place some RINs in the RNA
 
@@ -1125,6 +1112,7 @@ void MOIP::allowed_motifs_from_rin(args_of_parallel_func arg_struct)
 
     for (vector<Component>& v : vresults)
     {
+         // Now create an actual Motif with the class
         Motif temp_motif = Motif(v, rinfile, carnaval_id, false);
 
         bool unprobable = false;
@@ -1142,114 +1130,71 @@ void MOIP::allowed_motifs_from_rin(args_of_parallel_func arg_struct)
     }
 }
 
- void MOIP::allowed_motifs_from_json(args_of_parallel_func arg_struct, vector<pair<uint, char>> errors_id)
+ void MOIP::allowed_motifs_from_json(motif_and_mutex arg_struct)
 {
     // Searches where to place some JSON motifs in the RNA
-     path           jsonfile                 = arg_struct.motif_file;
-     mutex&         posInsertionSites_access = arg_struct.posInsertionSites_mutex;
 
-     std::ifstream                 motif;
-     string                        filepath = jsonfile.string();
-     vector<vector<Component>>     vresults, r_vresults;
-     vector<string>                component_sequences;
-     vector<string>                component_contacts;
-     vector<string>                pdbs;
-     string                        contacts, field, seq, struct2d;
-     string                        contacts_id;
-     string                        line, filenumber;
-     string                        rna = rna_.get_seq();
-     string                        reversed_rna = rna_.get_seq();
-     size_t                        nb_contacts = 0;
-     double                        tx_occurrences = 0;
+     mutex&                    posInsertionSites_access = arg_struct.posInsertionSites_mutex;
+     json_elem                 it = arg_struct.motif;
+     vector<vector<Component>> vresults;
+     vector<string>            component_sequences;
+     string                    subseq, seq, fullseq, struct2d;
+     size_t                    end;
+ 
+     // Retrieve sequenc eand structure
+     seq = it.value()["sequence"];
+     fullseq = string(seq);
+     struct2d = it.value()["struct2d"];
+     transform(seq.begin(), seq.end(), seq.begin(), ::toupper);
+ 
+     // Now split the sequence into components
+     while(seq.find("&") != string::npos) {
+         end = seq.find("&");
+         subseq = seq.substr(0, end);
+         seq = seq.substr(end + 1);
+         component_sequences.push_back(subseq); // new component sequence
+     } 
+     if (!seq.empty()) {
+         component_sequences.push_back(seq);
+     }
 
-     std::reverse(reversed_rna.begin(), reversed_rna.end());
+     // Place the components by pattern-matching
+     vresults = find_next_ones_in(rna_.get_seq(), 0, component_sequences);
     
-     motif = std::ifstream(filepath);
-     json js = json::parse(motif);
+     // Check if the results are possible (likeliness)
+     for (vector<Component>& v : vresults)
+     {
+         
+         // Now create proper Motifs with Motif class
+         Motif temp_motif = Motif(v, it.key(), struct2d);
 
-     string keys[5] = {"contacts", "occurences", "pdb", "sequence", "struct2d"};
-     uint it_errors = 0;
-     uint comp;
-     uint occ = 0;
-     
-     for(auto it = js.begin(); it != js.end(); ++it) {
-         contacts_id = it.key();
-         comp = stoi(contacts_id);
- 
-         // Check for known errors to ignore corresponding motifs
-         if (comp == errors_id[it_errors].first) {    
-             while (comp == errors_id[it_errors].first) {
-                 it_errors ++;
-             }
-             continue;
-         } 
- 
-         for(auto it2 = js[contacts_id].begin(); it2 != js[contacts_id].end(); ++it2) {
-             field = it2.key(); 
-             if (!field.compare(keys[0]))  // This is the contacts field
-             {  
-                 contacts = it2.value();
-                 nb_contacts = count_contacts(contacts);
-                 component_contacts = find_components(contacts, "&");
-             } 
-             else if (!field.compare(keys[1]))    // This is the occurences field
-             {
-                 occ = it2.value();
-                 tx_occurrences = (double)occ;    // (double)max_occ;
+         if (verbose_) cout << "\t> Considering motif JSON " << temp_motif.pos_string() << "\t" 
+                            << fullseq << ", " << struct2d << " ";
 
-             } 
-             else if (!field.compare(keys[2]))    // This is the pdb field
-             {
-                 vector<string> tab = it2.value();
-                 pdbs = tab;
-             } 
-             else if (!field.compare(keys[3]))    // This is the sequence field
-             { 
-                 seq = check_motif_sequence(it2.value());
-                 component_sequences = find_components(seq, "&");
-             } 
-             else if (!field.compare(keys[4]))    // This is the struct2D field
-             {
-                 struct2d = it2.value();      
-             }     
+         // Check if the motif can be inserted, checking the basepairs probabilities and theta
+         bool unprobable = false;
+         if (!temp_motif.links_.size()) {
+             if (verbose_) cout << "discarded, no constraints on the secondary structure, it is a useless motif." << endl;
+             continue;
         }
-         vresults     = json_find_next_ones_in(rna, 0, component_sequences);
- 
-         for (vector<Component>& v : vresults)
+         if (verbose_) cout << " + links at positions ";
+         for (const Link& l : temp_motif.links_)
         {
-             if (verbose_) cout << "\t> Considering motif JSON " << contacts_id << "\t" << seq << ", " << struct2d << " ";
-             
-             Motif temp_motif = Motif(v, contacts_id, nb_contacts, tx_occurrences);
-             temp_motif.links_ = build_motif_pairs(struct2d, v);
-             temp_motif.pos_contacts = find_contacts(component_contacts, v);
- 
-             // Check if the motif can be inserted, checking the basepairs probabilities and theta
-             bool unprobable = false;
-             if (!temp_motif.links_.size()) {
-                 if (verbose_) cout << "discarded, no constraints on the secondary structure, it is a useless motif." << endl;
-                 continue;
-             }
-             if (verbose_) cout << "at position ";
-             for (const Link& l : temp_motif.links_)
-             {
-                 if (verbose_) cout << l.nts.first << ',' << l.nts.second << ' '; 
-                 if (!allowed_basepair(l.nts.first, l.nts.second)) {
-                     if (verbose_) cout << "(unlikely) ";
-                     unprobable = true;
-                 }
-             }
-             if (unprobable) {
-                 if (verbose_) cout << ", discarded because of unlikely or impossible basepairs" << endl;
-                 continue;
+             if (verbose_) cout << l.nts.first << ',' << l.nts.second << ' '; 
+             if (!allowed_basepair(l.nts.first, l.nts.second)) {
+                 if (verbose_) cout << "(unlikely) ";
+                 unprobable = true;
             }
-             if (verbose_) cout << endl;
- 
-             // Add it to the results vector
-             unique_lock<mutex> lock(posInsertionSites_access);
-             insertion_sites_.push_back(temp_motif);
-             lock.unlock();
         }
-         component_sequences.clear();
-     }
+         if (unprobable) {
+             if (verbose_) cout << ", discarded because of unlikely or impossible basepairs" << endl;
+             continue;
+         }
+         if (verbose_) cout << endl;
 
+         // Add it to the results vector
+         unique_lock<mutex> lock(posInsertionSites_access);
+         insertion_sites_.push_back(temp_motif);
+         lock.unlock();
+     }
 }
\ No newline at end of file
--- a/cppsrc/MOIP.h
View file @89aa055
+++ b/cppsrc/MOIP.h
View file @89aa055
@@ -12,12 +12,19 @@
 
 using std::vector;
 
- typedef struct args_ {
+ typedef struct argsf_ {
 						path           motif_file;
 						std::mutex&    posInsertionSites_mutex;
- 						args_(path motif_file_, mutex& mutex_) : motif_file(motif_file_), posInsertionSites_mutex(mutex_) {}
- 					  } args_of_parallel_func;
+ 						argsf_(path motif_file_, mutex& mutex_) 
+ 							: motif_file(motif_file_), posInsertionSites_mutex(mutex_) {}
+ 					  } file_and_mutex;
 
+ typedef struct argsm_ {
+ 						json_elem      motif;
+ 						std::mutex&    posInsertionSites_mutex;
+ 						argsm_(json_elem& motif_, mutex& mutex_) 
+ 							: motif(motif_), posInsertionSites_mutex(mutex_) {}
+ 					  } motif_and_mutex;
 
 class MOIP
 {
@@ -56,9 +63,9 @@ class MOIP
 
 	bool   						exists_vertical_outdated_labels(const SecondaryStructure& s) const;
 	bool   						exists_horizontal_outdated_labels(const SecondaryStructure& s) const;
- 	void   						allowed_motifs_from_desc(args_of_parallel_func arg_struct);
- 	void   						allowed_motifs_from_rin(args_of_parallel_func arg_struct);
- 	void						allowed_motifs_from_json(args_of_parallel_func arg_struct, vector<pair<uint, char>> errors_id);
+ 	void   						allowed_motifs_from_desc(file_and_mutex arg_struct);
+ 	void   						allowed_motifs_from_rin(file_and_mutex arg_struct);
+ 	void						allowed_motifs_from_json(motif_and_mutex arg_struct);
 	
 	bool verbose_;    // Should we print things ?
 
@@ -79,8 +86,7 @@ class MOIP
 	vector<vector<size_t>> index_of_Cxip_;               // Stores the indexes of the Cxip in insertion_dv_
 	vector<size_t>         index_of_first_components;    // Stores the indexes of Cx1p in insertion_dv_
 	vector<vector<size_t>> index_of_yuv_;                // Stores the indexes of the y^u_v in basepair_dv_
- 
- 	vector<vector<size_t>>   index_of_xij_;		         //Stores the indexes of the xij variables (BioKop) in stacks_dv_
+ 	vector<vector<size_t>> index_of_xij_;		         //Stores the indexes of the xij variables (BioKop) in stacks_dv_
 };
 
 inline uint                      MOIP::get_n_solutions(void) const { return pareto_.size(); }
--- a/cppsrc/Motif.cpp
View file @89aa055
+++ b/cppsrc/Motif.cpp
View file @89aa055
@@ -5,71 +5,36 @@
 #include <regex>
 #include <sstream>
 #include <thread>
- #include <json.hpp>
 
 using namespace boost::filesystem;
 using namespace std;
 using json = nlohmann::json;
 
- Motif::Motif(void) {}
+ uint Motif::delay = 1;
 
- Motif::Motif(const vector<Component>& v, string PDB) : comp(v), PDBID(PDB)
- {
-     is_model_ = false;
-     reversed_ = false;
-     source_   = RNA3DMOTIF;
- }
- 
- Motif::Motif(const vector<Component>& v, string id, size_t nb_contacts, double tx_occurrences) : comp(v)
- {
-     contacts_id = id;
-     is_model_ = false;
-     reversed_ = false;
-     source_   = CONTACTS;
-     contact_ = nb_contacts;
-     tx_occurrences_ = tx_occurrences;
- }
+ Motif::Motif(void) {}
 
 Motif::Motif(string csv_line)
 {
     vector<string> tokens;
     split(tokens, csv_line, boost::is_any_of(","));
 
-     if (csv_line.find(string("True")) != std::string::npos or csv_line.find(string("False")) != std::string::npos) // This has been created by jar3d
+     id_ = tokens[0];
+     score_  = stoi(tokens[1]);
+     source_ = CSV;
+     reversed_ = false; // default value
+ 
+     if (csv_line.find(string("True")) != std::string::npos or csv_line.find(string("False")) != std::string::npos) 
     {
-         atlas_id = tokens[0];
+         // This has been created by jar3d
         score_   = stoi(tokens[2]);
         comp.push_back(Component(make_pair<int, int>(stoi(tokens[3]), stoi(tokens[4]))));
         if (tokens[5] != "-") comp.push_back(Component(make_pair<int, int>(stoi(tokens[5]), stoi(tokens[6]))));
         reversed_ = (tokens[1] == "True");
-         is_model_ = true;
-         PDBID     = "";
-         source_   = RNAMOTIFATLAS;
     }
- 
     else // this has been created by BayesPairing
     {
-         score_ = stoi(tokens[1]);
- 
-         // identify source:
-         if (tokens[0].find(string("rna3dmotif")) == std::string::npos)
-         {
-             is_model_ = true;
-             PDBID     = "";
-             source_   = RNAMOTIFATLAS;
-             atlas_id  = tokens[0];
-         }
- 
-         else
-         {
-             is_model_ = false;
-             PDBID     = tokens[0];
-             source_   = RNA3DMOTIF;
-             atlas_id  = "";
-         }
- 
         uint i = 2;
-         //while (i < tokens.size())
         while (i < tokens.size()-1)
         {
             if (stoi(tokens[i]) < stoi(tokens[i + 1]))
@@ -81,12 +46,71 @@ Motif::Motif(string csv_line)
     }
 }
 
+ Motif::Motif(const vector<Component>& v, string name) : comp(v), id_(name)
+ {
+     source_ = RNA3DMOTIF;
+     reversed_ = false;
+ }
+ 
+ Motif::Motif(const vector<Component>& v, string name, string& struc) : comp(v), id_(name)
+ {
+     source_ = JSON;
+     reversed_ = false;
+     
+     stack<uint> rbrackets;
+     stack<uint> sbrackets;
+     stack<uint> cbrackets;
+     stack<uint> chevrons;
+     
+     uint count = 0;
+     uint debut = v[count].pos.first;
+     uint gap = 0;
+ 
+     for (uint i = 0; i < struc.size(); i++) {
+         if (struc[i] == '(') {
+             rbrackets.push(i + debut + gap - count);
+         } else if (struc[i] == ')') {
+             Link l;
+             l.nts.first = rbrackets.top();
+             l.nts.second = i + debut + gap - count;
+             links_.push_back(l);
+             rbrackets.pop();
+         } else if (struc[i] == '[') {
+             sbrackets.push(i + debut + gap - count);
+         } else if (struc[i] == ']') {
+             Link l;
+             l.nts.first = sbrackets.top();
+             l.nts.second = i + debut + gap - count;
+             links_.push_back(l);
+             sbrackets.pop();
+         } else if (struc[i] == '{') {
+             cbrackets.push(i + debut + gap - count);
+         } else if (struc[i] == '}') {
+             Link l;
+             l.nts.first = cbrackets.top();
+             l.nts.second = i + debut + gap - count;
+             links_.push_back(l);
+             cbrackets.pop();
+         } else if (struc[i] == '<') {
+             chevrons.push(i + debut + gap - count);
+         } else if (struc[i] == '>') {
+             Link l;
+             l.nts.first = chevrons.top();
+             l.nts.second = i + debut + gap - count;
+             links_.push_back(l);
+             chevrons.pop();
+         } else if (struc[i] == '&') {
+             count ++;
+             gap += v[count].pos.first - v[count - 1].pos.second - 1;
+        }
+     }
+ }
+ 
 Motif::Motif(const vector<Component>& v, path rinfile, uint id, bool reversed) : comp(v), reversed_(reversed)
 {
     // Loads a motif from the RIN file of Carnaval
-     carnaval_id = to_string(id);
-     source_     = CARNAVAL;
-     is_model_     = false;
+     id_     = to_string(id);
+     source_ = CARNAVAL;
 
     std::ifstream file(rinfile);
 
@@ -158,7 +182,7 @@ Motif::Motif(const vector<Component>& v, path rinfile, uint id, bool reversed) :
 string Motif::pos_string(void) const
 {
     stringstream s;
-     s << atlas_id << " ( ";
+     s << id_ << " ( ";
     for (auto c : comp) s << c.pos.first << '-' << c.pos.second << ' ';
     s << ')';
     return s.str();
@@ -185,10 +209,9 @@ string Motif::sec_struct(void) const
 string Motif::get_identifier(void) const
 {
     switch (source_) {
-     case RNAMOTIFATLAS: return atlas_id; break;
-     case CARNAVAL: return string("RIN") + carnaval_id; break;
-     case CONTACTS: return string("JSON") + contacts_id; break;
-     default: return PDBID;
+     case CARNAVAL: return string("RIN") + id_; break;
+     case JSON: return string("JSON") + id_; break;
+     default: return id_;
     }
 }
 
@@ -265,167 +288,52 @@ char Motif::is_valid_RIN(const string& rinfile)
     return (char) 0;
 }
 
- vector<pair<uint,char>> Motif::is_valid_JSON(const string& jsonfile)
+ char Motif::is_valid_JSON(const json_elem& i)
 {
-     // returns 0 if no errors
-     std::ifstream  motif;
-     motif = std::ifstream(jsonfile);
-     json js = json::parse(motif);
-     vector<pair<uint,char>> errors_id;
+     string seq = i.value()["sequence"];
+     string ss = i.value()["struct2d"];
+ 
+     if (ss.size() != seq.size()) return 'x';
+     if (!check_motif_sequence(seq)) return 'r';
+     if (!check_motif_ss(ss)) return 'n';
+     if (seq.empty()) return 'e';
+     if (seq.size() - count(seq.begin(),seq.end(), '&') < 4) return 'l';
+     
+     // Iterate on components to check their length
+     string subseq;
     vector<string> components;
-     uint fin = 0;
- 
-     std::string keys[6] = {"contacts", "occurences", "pdb", "pfam", "sequence", "struct2d"};
- 
-     // Iterating over Motifs
-     for (auto i = js.begin(); i != js.end(); ++i) {
-         int j = 0;
-         string id = i.key();
-         size_t size;
-         string complete_seq;
-         string contacts;
- 
-         // Iterating over json keys
-         for (auto it = js[id].begin(); it != js[id].end(); ++it) {
-             // it.key() contains the key, and it.value() the field's value.
-             string curr_key = it.key();
-             
-             if (curr_key.compare(keys[j]))
-             { 
-                 //std::cout << "error header : keys[" << j << "]: " << keys[j] << " vs curr_key: " << curr_key << endl;
-                 errors_id.push_back(make_pair(stoi(id), 'd')); 
-                 //return 'd'; 
-             } 
-             else if(!curr_key.compare(keys[5])) // This is the secondary structure field
-             {
-                 string ss = it.value();
-                 if (ss.empty()) {
-                     errors_id.push_back(make_pair(stoi(id), 'f'));
-                     break;
-                 } else if (!checkSecondaryStructure(ss)) {
-                     errors_id.push_back(make_pair(stoi(id), 'n'));
-                     break;
-                 } else if (ss.size() != complete_seq.size()) {
-                     errors_id.push_back(make_pair(stoi(id), 'x'));
-                     break;
-                 }
-             } 
-             else if(!curr_key.compare(keys[0])) // This is the contacts field
-             {
-                 contacts = it.value();
-             } 
-             else if (!curr_key.compare(keys[4])) // This is the sequence field
-             {
-                 string seq = it.value();
-                 size = count_nucleotide(seq);
-                 complete_seq = seq;
-                 if (seq.empty()) {
-                     errors_id.push_back(make_pair(stoi(id), 'e'));
-                     break;
-                 } 
-                 if (size < 4) {
-                     errors_id.push_back(make_pair(stoi(id), 'l'));
-                     break;
-                 }
-                 size_t count_contact = count_delimiter(contacts);
-                 size_t count_seq = count_delimiter(seq);
-                 if (count_contact != count_seq) {
-                     errors_id.push_back(make_pair(stoi(id), 'a'));
-                     break;
-                 }
-                 if (contacts.size() != seq.size()) {
-                     errors_id.push_back(make_pair(stoi(id), 'b'));
-                     break;
-                 }
- 
-                 // Iterate on components to check their length
-                 string subseq;
-                 while((seq.find('&') != string::npos)) {
-                     fin = seq.find('&');  
-                     subseq = seq.substr(0, fin);
-                     seq = seq.substr(fin + 1);
-                     if (subseq.size() >= 2) {
-                         components.push_back(subseq); 
-                     } else {
-                         //errors_id.push_back(make_pair(stoi(id), 'k'));
-                     }
-                 }
-                 if (seq.size() >= 2) { // Last component after the last &
-                     components.push_back(seq); 
-                 } else {
-                     //errors_id.push_back(make_pair(stoi(id), 'k'));
-                 }
- 
-                 size_t n = 0;
-                 for (uint ii = 0; ii < components.size(); ii++) {
-                     n += components[ii].size();
-                 }
-                 if(n <= 3) {
-                     errors_id.push_back(make_pair(stoi(id), 'l'));
-                 }
-             }
-             j++;
-         }
+     uint end, n=0;
+     while((seq.find('&') != string::npos)) {
+         end = seq.find('&');  
+         subseq = seq.substr(0, end);
+         seq = seq.substr(end + 1);
+         if (subseq.size() >= 2) components.push_back(subseq); 
     }
-     return errors_id;
- }
+     if (seq.size() >= 2) // Last component after the last &
+         components.push_back(seq); 
+     for (auto comp : components)
+         n += comp.size();
+     if(n <= 3) return 'l';
 
- //count the number of nucleotide in the motif sequence
- size_t count_nucleotide(string& seq) {
-     size_t count = 0;
-     for(uint i = 0; i < seq.size(); i++) {
-         char c = seq.at(i);
-         if (c != '&') {
-             count++;
-         }
-     }
-     return count;
- }
- 
- //count the number of '&' in the motif sequence
- size_t count_delimiter(string& seq) {
-     size_t count = 0;
-     for(uint i = 0; i < seq.size(); i++) {
-         char c = seq.at(i);
-         if (c == '&') {
-             count++;
-         }
-     }
-     return count;
- }
- 
- //count the number of '*' in the motif
- size_t count_contacts(string& contacts) {
-     
-     size_t count = 0;
-     for (uint i = 0; i < contacts.size(); i++) {
-         if (contacts[i] == '*') {
-             count++;
-         }
-     }
-     return count;
+     return char(0);
 }
 
- //Check if the sequence is a rna sequence (ATGC) and replace T by U or remove modified nucleotide if necessary
- string check_motif_sequence(string seq) {
-     std::transform(seq.begin(), seq.end(), seq.begin(), ::toupper);
-     for (int i = seq.size(); i >= 0; i--) {
-         if(seq[i] == 'T') {
-             seq[i] = 'U';
-         } else if (!(seq [i] == 'A' || seq [i] == 'U' || seq [i] == '&'
-         || seq [i] == 'G' || seq [i] == 'C')) {
-             seq = seq.erase(i,1);
+ //Check if the sequence is a rna sequence (ATUGC-only)
+ bool check_motif_sequence(string seq) {
+     transform(seq.begin(), seq.end(), seq.begin(), ::toupper);
+     for (char c : seq)
+         if (!(c == 'A' || c == 'U' || c == 'T' || c == '&' || c == 'G' || c == 'C')) {
+             return false;
         }
-     }
-     return seq;
+     return true;
 }
 
- //check that there are as many opening parentheses as closing ones
- bool checkSecondaryStructure(string struc)
+ // Check that there are as many opening brackets as closing ones
+ bool check_motif_ss(string struc)
 { 
-     stack<uint> parentheses;
-     stack<uint> crochets;
-     stack<uint> accolades;
+     stack<uint> rbrackets;
+     stack<uint> sbrackets;
+     stack<uint> cbrackets;
     stack<uint> chevrons;
     for (uint i = 0; i < struc.length(); i++)
     {
@@ -439,27 +347,27 @@ bool checkSecondaryStructure(string struc)
         } else {
             for (uint i = 0; i < struc.size(); i++) {
                 if (struc[i] == '(') {
-                     parentheses.push(i);
+                     rbrackets.push(i);
 
                 } else if (struc[i] == ')') {
-                     if (!parentheses.empty())
-                         parentheses.pop();
+                     if (!rbrackets.empty())
+                         rbrackets.pop();
                     else return false;
 
                 } else if (struc[i] == '[') {
-                     crochets.push(i);
+                     sbrackets.push(i);
 
                 } else if (struc[i] == ']') {
-                     if (!crochets.empty())
-                         crochets.pop();
+                     if (!sbrackets.empty())
+                         sbrackets.pop();
                     else return false;
 
                 } else if (struc[i] == '{') {
-                     accolades.push(i);
+                     cbrackets.push(i);
 
                 } else if (struc[i] == '}') {
-                     if (!accolades.empty())
-                         accolades.pop();
+                     if (!cbrackets.empty())
+                         cbrackets.pop();
                     else return false;
 
                 } else if (struc[i] == '<') {
@@ -473,140 +381,7 @@ bool checkSecondaryStructure(string struc)
             }
         }
     }
-     return (parentheses.empty() && crochets.empty() && accolades.empty() && chevrons.empty());
- }
- 
- // Converts a dot-bracket motif secondary structure to vector of Link
- vector<Link> build_motif_pairs(string& struc, vector<Component>& v) {
-     
-     vector<Link> vec; 
-     stack<uint> parentheses;
-     stack<uint> crochets;
-     stack<uint> accolades;
-     stack<uint> chevrons;
-     
-     uint count = 0;
-     uint debut = v[count].pos.first;
-     uint gap = 0;
- 
-     for (uint i = 0; i < struc.size(); i++) {
-         if (struc[i] == '(') {
-             parentheses.push(i + debut + gap - count);
-         } else if (struc[i] == ')') {
-             Link l;
-             l.nts.first = parentheses.top();
-             l.nts.second = i + debut + gap - count;
-             vec.push_back(l);
-             parentheses.pop();
-         } else if (struc[i] == '[') {
-             crochets.push(i + debut + gap - count);
-         } else if (struc[i] == ']') {
-             Link l;
-             l.nts.first = crochets.top();
-             l.nts.second = i + debut + gap - count;
-             vec.push_back(l);
-             crochets.pop();
-         } else if (struc[i] == '{') {
-             accolades.push(i + debut + gap - count);
-         } else if (struc[i] == '}') {
-             Link l;
-             l.nts.first = accolades.top();
-             l.nts.second = i + debut + gap - count;
-             vec.push_back(l);
-             accolades.pop();
-         } else if (struc[i] == '<') {
-             chevrons.push(i + debut + gap - count);
-         } else if (struc[i] == '>') {
-             Link l;
-             l.nts.first = chevrons.top();
-             l.nts.second = i + debut + gap - count;
-             vec.push_back(l);
-             chevrons.pop();
-         } else if (struc[i] == '&') {
-             count ++;
-             gap += v[count].pos.first - v[count - 1].pos.second - 1;
-        }
-     }
-     return vec;
- }
- 
- uint find_max_occurrences(string& filepath) {
-     uint max = 0;
-     std::ifstream in = std::ifstream(filepath);
-     json js = json::parse(in);
-     string contacts_id;
- 
-     for(auto it = js.begin(); it != js.end(); ++it) {
-         contacts_id = it.key();
-         for(auto it2 = js[contacts_id].begin(); it2 != js[contacts_id].end(); ++it2) {
-             string test = it2.key(); 
-              if (!test.compare("occurences")) {
-                 uint occ = it2.value();
-                 if (occ > max) {
-                     max = occ;
-                 }
-              }
-         }
-      }
-     return max;
- }
- 
- uint find_max_sequence(string& filepath) {
-     uint max = 0;
-     std::ifstream in = std::ifstream(filepath);
-     json js = json::parse(in);
-     string contacts_id;
-     string seq;
- 
-     for(auto it = js.begin(); it != js.end(); ++it) {
-         contacts_id = it.key();
-         for(auto it2 = js[contacts_id].begin(); it2 != js[contacts_id].end(); ++it2) {
-             string test = it2.key(); 
-             if (!test.compare("sequence")) {
-                 seq = it2.value();
-                 uint size = seq.size();
-                 if (size > max) {
-                     max = size;
-                 }
-             }
-         }
-      }
-     return max;
- }
- 
- vector<string> find_components(string& sequence, string delimiter) {
-     vector<string> list;
-     string seq = sequence;
-     string subseq;
-     uint fin = 0;
- 
-     while(seq.find(delimiter) != string::npos) {
-         fin = seq.find(delimiter);
-         
-         subseq = seq.substr(0, fin);
-         seq = seq.substr(fin + 1);
-         list.push_back(subseq); // new component sequence
-     } 
-     if (!seq.empty()) {
-         list.push_back(seq);
-     }
-     return list;
- }
- 
- vector<uint> find_contacts(vector<string>& struc2d, vector<Component>& v) {
-     vector<uint> positions;
-     string delimiter = "*";
-     uint debut;
-     for (uint i = 0; i < v.size(); i++) {
-         debut = v[i].pos.first;
-         uint pos = struc2d[i].find(delimiter, 0);
-         while(pos != string::npos && pos <= struc2d[i].size())
-         {   
-             positions.push_back(pos + debut);
-             pos = struc2d[i].find(delimiter, pos+1);
-         } 
-     }
-     return positions;
+     return (rbrackets.empty() && sbrackets.empty() && cbrackets.empty() && chevrons.empty());
 }
 
 bool is_desc_insertible(const string& descfile, const string& rna)
@@ -652,10 +427,9 @@ bool is_desc_insertible(const string& descfile, const string& rna)
     regex  e(seq);
 
     return regex_search(rna, m, e);
- 
 }
 
- vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string>& vc)
+ vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string> vc)
 {
     pair<uint, uint>          pos;
     vector<vector<Component>> results;
@@ -663,122 +437,41 @@ vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<stri
     vector<string>            next_seqs;
     regex                     c(vc[0]);
 
-     //cout << "\t\t>Searching " << vc[0] << " in " << rna << endl;
- 
     if (vc.size() > 1) {
-         //cout << "size vc: " << vc.sizef() << endl; 
         if (regex_search(rna, c)) {
-             if (vc.size() > 2) {
-                 next_seqs = vector<string>(&vc[1], &vc[vc.size()]);
-             }
-             else {
-                 next_seqs = vector<string>(1, vc.back());
-             }
-             uint j = 0;
-             // For every regexp match
-             for (sregex_iterator i = sregex_iterator(rna.begin(), rna.end(), c); i != sregex_iterator(); ++i) {
-                 smatch match = *i;
-                 pos.first    = match.position() + offset;
-                 pos.second   = pos.first + match.length() - 1;
- 
-                 //cout << "\t\t>Inserting " << vc[j] << " in [" << pos.first << ',' << pos.second << "]" << endl;
-                 // +5 because HL < 3 pbs but not for CaRNAval or Contacts
-                 // if CaRNAval or Contacts +2 is better
-                 if (pos.second - offset + 5 >= rna.length()) {
-                      //cout << "\t\t... but we cannot place the next components : Ignored." << endl;
-                     continue;
-                 }
-                 
-                 next_ones = find_next_ones_in(rna.substr(pos.second - offset + 5), pos.second + 5, next_seqs);
-                 if (!next_ones.size()) {
-                     // cout << "\t\t... but we cannot place the next components : Ignored.2" << endl;
-                     continue;
-                 }
-                 //cout  << endl;
-                 for (vector<Component> v : next_ones)    // For every combination of the next components
-                 {
-                     // Combine the match for this component pos with the combination
-                     // of next_ones as a whole solution
-                     vector<Component> r;
-                     r.push_back(Component(pos));
-                     for (Component& c : v) r.push_back(c);
-                     results.push_back(r);
-                 }
-                 j++;
-             }
-         }
-     } else {
-         // Only one more component to find
- 
-         if (regex_search(rna, c)) {
-             // For each regexp match
-             for (sregex_iterator i = sregex_iterator(rna.begin(), rna.end(), c); i != sregex_iterator(); ++i) {
-                 smatch match = *i;
-                 pos.first    = match.position() + offset;
-                 pos.second   = pos.first + match.length() - 1;
- 
-                 //cout << "\t\t>Inserting " << vc[0] << " in [" << pos.first << ',' << pos.second << "]" << endl;
 
-                 // Create a vector of component with one component for that match
-                 vector<Component> r;
-                 r.push_back(Component(pos));
-                 results.push_back(r);
-             }
-         }
-     }
-     return results;
- }
- 
- vector<vector<Component>> json_find_next_ones_in(string rna, uint offset, vector<string>& vc)
- {
-     pair<uint, uint>          pos;
-     vector<vector<Component>> results;
-     vector<vector<Component>> next_ones;
-     vector<string>            next_seqs;
-     regex                     c(vc[0]);
- 
-     //cout << "\t\t>Searching " << vc[0] << " in " << rna << endl;
- 
-     if (vc.size() > 1) {
-         //cout << "size vc: " << vc.size() << endl; 
-         if (regex_search(rna, c)) {
+             // Define a vector with the remaining components to insert behind vc[0]
             if (vc.size() > 2) {
                 next_seqs = vector<string>(&vc[1], &vc[vc.size()]);
             }
             else {
                 next_seqs = vector<string>(1, vc.back());
             }
-             uint j = 0;
+ 
             // For every regexp match
             for (sregex_iterator i = sregex_iterator(rna.begin(), rna.end(), c); i != sregex_iterator(); ++i) {
+                 //retrieve the hit position
                 smatch match = *i;
                 pos.first    = match.position() + offset;
                 pos.second   = pos.first + match.length() - 1;
 
-                 //cout << "\t\t>Inserting " << vc[j] << " in [" << pos.first << ',' << pos.second << "]" << endl;
+                 // Check if we can insert the next components after this hit for vc[0]
                 // +5 because HL < 3 pbs but not for CaRNAval or Contacts
                 // if CaRNAval or Contacts +2 is better
-                 if (pos.second - offset + 2 >= rna.length()) {
-                      //cout << "\t\t... but we cannot place the next components : Ignored." << endl;
-                     continue;
-                 }
-                 
-                 next_ones = json_find_next_ones_in(rna.substr(pos.second - offset + 2), pos.second + 2, next_seqs);
-                 if (!next_ones.size()) {
-                     // cout << "\t\t... but we cannot place the next components : Ignored.2" << endl;
-                     continue;
-                 }
-                 //cout  << endl;
-                 for (vector<Component> v : next_ones)    // For every combination of the next components
+                 if (pos.second - offset + Motif::delay >= rna.length()) break; // no room for the next components. give up.
+                 next_ones = find_next_ones_in(rna.substr(pos.second - offset + Motif::delay), pos.second + Motif::delay, next_seqs);
+                 if (!next_ones.size()) break; // no matches for the next components. give up.
+ 
+                  // For every solution for the next components, combine this solution for vc[0] (pos) with the one for the following vc[i] (v)
+                 for (vector<Component> v : next_ones)
                 {
                     // Combine the match for this component pos with the combination
                     // of next_ones as a whole solution
                     vector<Component> r;
-                     r.push_back(Component(pos));
-                     for (Component& c : v) r.push_back(c);
+                     r.push_back(Component(pos));    // solution for vc[0]
+                     for (Component& c : v) r.push_back(c); // solutions for the following vc[i]
                     results.push_back(r);
                 }
-                 j++;
             }
         }
     } else {
@@ -790,8 +483,6 @@ vector<vector<Component>> json_find_next_ones_in(string rna, uint offset, vector
                 pos.first    = match.position() + offset;
                 pos.second   = pos.first + match.length() - 1;
 
-                 //cout << "\t\t>Inserting " << vc[0] << " in [" << pos.first << ',' << pos.second << "]" << endl;
- 
                 // Create a vector of component with one component for that match
                 vector<Component> r;
                 r.push_back(Component(pos));
--- a/cppsrc/Motif.h
View file @89aa055
+++ b/cppsrc/Motif.h
View file @89aa055
@@ -7,6 +7,7 @@
 #include <vector>
 #include <filesystem>
 #include "rna.h"
+ #include "json.hpp"
 
 using boost::filesystem::path;
 using std::pair;
@@ -14,6 +15,8 @@ using std::string;
 using std::vector;
 using std::mutex;
 
+ typedef enum { RNA3DMOTIF = 1, CSV = 2, CARNAVAL = 3, JSON = 4 } source_type;
+ typedef nlohmann::detail::iter_impl<nlohmann::basic_json<> > json_elem;
 
 
 typedef struct Comp_ {
@@ -39,18 +42,17 @@ typedef struct Link
 
 class Motif
 {
-     public:
+   public:
     Motif(void);
     Motif(string csv_line);
-     Motif(const vector<Component>& v, string PDB);
-     Motif(const vector<Component>& v, string id, size_t contacts, double tx_occurrences);
+     Motif(const vector<Component>& v, string name);
+     Motif(const vector<Component>& v, string name, string& struc);
     Motif(const vector<Component>& v, path rinfile, uint id, bool reversed);
+     // Motif(string path, int id); //full path to biorseo/data/modules/RIN/Subfiles/
     
- 
-     Motif(string path, int id); //full path to biorseo/data/modules/RIN/Subfiles/
-     static char                             is_valid_RIN(const string& rinfile);
-     static char                             is_valid_DESC(const string& descfile);
-     static vector<pair<uint,char>>          is_valid_JSON(const string& jsonfile);
+     static char       is_valid_RIN(const string& rinfile);
+     static char       is_valid_DESC(const string& descfile);
+     static char       is_valid_JSON(const json_elem& i);
 
     string            pos_string(void) const;
     string            sec_struct(void) const;
@@ -64,39 +66,27 @@ class Motif
     double            tx_occurrences_;
     double            score_;
     bool              reversed_;
- 
-     private:
-     string carnaval_id;  // if source = CARNAVAL
-     string atlas_id;     // if source = RNAMOTIFATLAS
-     string PDBID;        // if source = RNA3DMOTIF
-     string contacts_id;  // if source = CONTACTS
-     bool   is_model_;    // Whether the motif is a model or an extracted module from a 3D structure
-     enum { RNA3DMOTIF = 1, RNAMOTIFATLAS = 2, CARNAVAL = 3, CONTACTS = 4 } source_;
+     static uint       delay;
+     // delay is the minimal shift between end of a component and begining of the next.
+     // For regular loop motifs, it should be at least 5 (because hairpins cannot be of size smaller than 5).
+     // For the general case, it could be zero, but solutions will look dirty...
+     // Higher values reduce combinatorial explosion of potential insertion sites.
+ 
+   private:
+     string id_;
+     source_type source_;
 };
 
 bool                        is_desc_insertible(const string& descfile, const string& rna);
- bool                        is_rin_insertible(const string& rinfile, const string& rna);
- bool                        is_json_insertible(const string& jsonfile, const string& rna);
+ bool                        check_motif_ss(string);
+ bool                        check_motif_sequence(string);
 
 vector<Motif>               load_txt_folder(const string& path, const string& rna, bool verbose);
 vector<Motif>               load_desc_folder(const string& path, const string& rna, bool verbose);
 vector<Motif>               load_csv(const string& path);
 vector<Motif>               load_json_folder(const string& path, const string& rna, bool verbose);
 
- vector<vector<Component>>   find_next_ones_in(string rna, uint offset, vector<string>& vc);
- vector<vector<Component>>   json_find_next_ones_in(string rna, uint offset, vector<string>& vc);
- 
- // utilities for Json motifs
- size_t count_nucleotide(string&);
- size_t count_delimiter(string&);
- size_t count_contacts(string&);
- string check_motif_sequence(string);
- bool checkSecondaryStructure(string);
- vector<Link> build_motif_pairs(string&, vector<Component>&);
- uint find_max_occurrences(string&);
- uint find_max_sequence(string&);
- vector<string> find_components(string&, string);
- vector<uint> find_contacts(vector<string>&, vector<Component>&);
+ vector<vector<Component>>   find_next_ones_in(string rna, uint offset, vector<string> vc);
 
 // utilities to compare secondary structures:
 bool operator==(const Motif& m1, const Motif& m2);
--- a/cppsrc/SecondaryStructure.cpp
View file @89aa055
+++ b/cppsrc/SecondaryStructure.cpp
View file @89aa055
@@ -3,16 +3,12 @@
 #include <algorithm>
 #include <boost/format.hpp>
 
- #define RESET   "\033[0m"
- #define RED     "\033[31m"      /* Red */
- 
 using std::abs;
 using std::cout;
 using std::endl;
 
 SecondaryStructure::SecondaryStructure() {}
 
- 
 SecondaryStructure::SecondaryStructure(const RNA& rna)
 : objective_scores_(vector<double>(2)), n_(rna.get_RNA_length()), nBP_(0), rna_(rna)
 {
@@ -21,8 +17,6 @@ SecondaryStructure::SecondaryStructure(const RNA& rna)
 
 SecondaryStructure::SecondaryStructure(bool empty) : rna_(RNA()) { is_empty_structure = empty; }
 
- 
- 
 string SecondaryStructure::to_DBN(void) const
 {
 
@@ -100,26 +94,6 @@ string SecondaryStructure::to_DBN(void) const
     return res;
 }
 
- string structure_with_contacts(const SecondaryStructure& ss) {
-     string sequence = ss.rna_.get_seq();
-     string construct = "";
-     bool flag;
-     for (uint i = 0; i < sequence.size(); i++) {
-         flag = false;
-         for (const Motif& m : ss.motif_info_) {
-             for (uint j = 0; j < m.pos_contacts.size(); j++) {
-                 if (m.pos_contacts[j] == i) flag = true;
-             }
-         }
-         if (flag) {   
-             construct += "*";
-         } else {
-             construct += ".";
-         }
-     }
-     return construct;
- }
- 
 string SecondaryStructure::to_string(void) const
 {
     string s;
@@ -141,35 +115,11 @@ void SecondaryStructure::set_basepair(uint i, uint j)
 
 void SecondaryStructure::insert_motif(const Motif& m) { motif_info_.push_back(m); }
 
- void colored_contacts(string sequence, vector<Motif> motif_info_) {
-     bool flag;
-     for (uint i = 0; i < sequence.size(); i++) {
-         flag = false;
-         for (const Motif& m : motif_info_) {
-             for (uint j = 0; j < m.pos_contacts.size(); j++) {
-                 if (m.pos_contacts[j] == i) flag = true;
-             }
-         }
-         if (flag) {   
-             cout << RED << sequence[i] << RESET;
-         } else {
-             cout << sequence[i];
-         }
-     }
- }
- 
 void SecondaryStructure::print(void) const
 {
-     cout << endl;
-     cout << '\t';
-     colored_contacts(rna_.get_seq(), motif_info_);
-     //rna_.get_seq() 
-     cout << endl;
+     cout << endl << '\t' << rna_.get_seq() << endl;
     string ss = to_string();
-     cout << '\t';
-     colored_contacts(ss, motif_info_);
-     //cout << ss;
-     cout << endl;
+     cout << '\t' << ss << endl;
     for (const Motif& m : motif_info_) {
         uint i = 0;
         cout << '\t';
@@ -324,7 +274,6 @@ bool operator<=(const SecondaryStructure& s1, const SecondaryStructure& s2)
     return false;
 }
 
- 
 bool operator==(const SecondaryStructure& s1, const SecondaryStructure& s2)
 {
     // Checks wether the secondary structures are exactly the same, including the inserted motifs.
--- a/cppsrc/SecondaryStructure.h
View file @89aa055
+++ b/cppsrc/SecondaryStructure.h
View file @89aa055
@@ -57,7 +57,4 @@ inline void   SecondaryStructure::set_objective_score(int i, double s) { objecti
 inline uint   SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); }
 inline uint   SecondaryStructure::get_n_bp(void) const { return nBP_; }
 
- string structure_with_contacts(const SecondaryStructure& ss);
- 
- 
 #endif    //  SECONDARY_STRUCTURE_
\ No newline at end of file
--- a/cppsrc/biorseo.cpp
View file @89aa055
+++ b/cppsrc/biorseo.cpp
View file @89aa055
@@ -79,12 +79,10 @@ int main(int argc, char* argv[])
     ("jsonfolder,j", po::value<string>(&motifs_path_name), "A folder containing a custom motif library in .json format")
     ("pre-placed,x", po::value<string>(&motifs_path_name), "A CSV file providing motif insertion sites obtained with another tool.")
     ("function,f", po::value<char>(&obj_function_nbr)->default_value('B'), 
-                     "(A, B, C, D, E or F) Objective function to score module insertions:\n"
+                     "(A, B, C, or D) Objective function to score module insertions:\n"
                     "  (A) insert big modules\n  (B) light, high-order modules\n"
                     "  (C) well-scored modules\n  (D) light, high-order, well-scored\n    modules\n"
-                     "  (E, F) insert big modules with many\n   contacts with proteins, different\n   ponderations.\n"
-                     "  C and D require position scores\n  provided by --pre-placed.\n"
-                     "  E and F require protein-contact\n  information and should be\n  used only with --jsonfolder.")    
+                     "  C and D require position scores\n  provided by --pre-placed.\n")    
     ("mfe,E", "Minimize stacking energies\n  (leads to MFE extimator)")
     ("mea,A", "(default) Maximize expected accuracy\n  (leads to MEA estimator)")
     ("first-objective,c",   po::value<unsigned int>(&MOIP::obj_to_solve_)->default_value(2), 
@@ -108,8 +106,8 @@ int main(int argc, char* argv[])
                 << "Bio-objective integer linear programming framework to predict RNA secondary structures by including known RNA modules." << endl
                 << "Developped by Louis Becquey, 2018-2021\nLénaïc Durand, 2019\nNathalie Bernard, 2021" << endl << endl
                 << "Usage:\tYou must provide:\n\t1) a FASTA input file with -s," << endl
-                 << "\t2) a module type with --rna3dmotifs, --carnaval, --contacts or --pre-placed," << endl
-                 << "\t3) one module-based scoring function with --func A, B, C, D, E or F," << endl
+                 << "\t2) a module type with --rna3dmotifs, --carnaval, --json or --pre-placed," << endl
+                 << "\t3) one module-based scoring function with --func A, B, C, or D" << endl
                 << "\t4) one energy-based scoring function with --mfe or --mea," << endl
                 << "\t5) how to display results: in console (-v), or in a result file (-o)." << endl
                 << endl
@@ -154,19 +152,22 @@ int main(int argc, char* argv[])
         return EXIT_FAILURE;
     }
 
- 
-     /*  FIND PARETO SET  */
     string source;
+     Motif::delay = 1;
     if (vm.count("rinfolder"))
         source = "rinfolder";
-     else if (vm.count("descfolder"))
+     else if (vm.count("descfolder")) {
         source = "descfolder";
+         Motif::delay = 5;
+     }
     else if (vm.count("jsonfolder"))
         source = "jsonfolder";
     else if (vm.count("pre-placed"))
         source = "csvfile";
     else
         cerr << "ERR: no source of modules provided !" << endl;
+ 
+     /*  FIND PARETO SET  */
     
     MOIP               myMOIP = MOIP(myRNA, source, motifs_path_name.c_str(), theta_p_threshold, verbose);
     double             min, max;
@@ -243,11 +244,8 @@ int main(int argc, char* argv[])
         outfile.open(outputName);
         outfile << fa->name() << endl << fa->seq() << endl;
     
-         for (uint i = 0; i < myMOIP.get_n_solutions(); i++) {
-             outfile << myMOIP.solution(i).to_string() << endl << structure_with_contacts(myMOIP.solution(i)) << endl;
-             string str1 = myMOIP.solution(i).to_string();
- 
-         }
+         for (uint i = 0; i < myMOIP.get_n_solutions(); i++)
+             outfile << myMOIP.solution(i).to_string() << endl;
         outfile.close();
     }
 
--- a/cppsrc/program deleted 100644 → 0
View file @3b2bbb9
+++ b/cppsrc/program deleted 100644 → 0
View file @3b2bbb9
--- a/cppsrc/rna.cpp
View file @89aa055
+++ b/cppsrc/rna.cpp
View file @89aa055
@@ -50,6 +50,7 @@ RNA::RNA(string name, string seq, bool verbose)
 	int max_bp_span = 100;
 	float cutoff = 1e-6;
 	vrna_ep_t* results = vrna_pfl_fold(cseq, window_size, max_bp_span, cutoff);
+ 	vrna_ep_t* save = results; // keep the pointer to free it later
 
 	if (results != NULL)
 	{
@@ -95,7 +96,9 @@ RNA::RNA(string name, string seq, bool verbose)
 				}
 			}
 		}
- 
+ 		
+ 		// Free memory allocated by ViennaRNA
+ 		free(save);
 	}
 
 	else cerr << "NULL result returned by vrna_pfl_fold" << endl;