Merge branch 'stage_NBernard' into 'master'

Stage n bernard results See merge request !1

Merge branch 'stage_NBernard' into 'master'
Stage n bernard results See merge request !1
Louis BECQUEY
Commit 3b2bbb9956c579b13e4aeec1d96d490b97fbdd82 3b2bbb99 2 parents acf04330 97d4bf4f
Showing 75 changed files with 805 additions and 241 deletions
.gitignore
INSTALL.md
Isaure_benchmark.py
Makefile
Readme.md
benchmark.py
biorseo.py
cppsrc/MOIP.cpp
cppsrc/MOIP.h
cppsrc/Motif.cpp
cppsrc/Motif.h
cppsrc/Scripts/count_pattern.cpp
cppsrc/Scripts/delete_same_pdb.cpp
cppsrc/SecondaryStructure.cpp
cppsrc/SecondaryStructure.h
cppsrc/biorseo.cpp
cppsrc/program
cppsrc/rna.cpp
cppsrc/rna.h
data/fasta/applications.fa
--- a/.gitignore
View file @3b2bbb9
+++ b/.gitignore
View file @3b2bbb9
-.vscode/*
 .vscode
-# LaTeX temporary files
-doc/*.toc
-doc/*.bbl
-doc/*.gz
-doc/*.log
-doc/*.aux
-doc/*.blg
-doc/*.fls
-doc/*.fdb_latexmk
-
 # Docker installation temporary files
 eigen-eigen-323c052e1731
 cplex_installer_12.8_Student.bin
@@ -20,7 +9,6 @@ ViennaRNA-2.4.13
 # Compiled Object files
 obj/*
-doc/*.pdf
 data/modules/RIN/__pycache__
 # Executables
@@ -44,4 +32,4 @@ data/modules/RIN
 data/modules/ISAURE
 data/sec_structs/bpRNA-1m_90.dbn
 data/sec_structs/pseudobase++.dbn
-
+data/fasta/contacts
--- a/INSTALL.md
View file @3b2bbb9
+++ b/INSTALL.md
View file @3b2bbb9
--- a/Isaure_benchmark.py 0 → 100644
View file @3b2bbb9
+++ b/Isaure_benchmark.py 0 → 100644
View file @3b2bbb9
--- a/Makefile 100644 → 100755
View file @3b2bbb9
+++ b/Makefile 100644 → 100755
View file @3b2bbb9
@@ -9,7 +9,7 @@ CC	   = g++
 CFLAGS   = -Icppsrc/ -I/usr/local/include -I$(CPLEX)/concert/include -I$(CPLEX)/cplex/include -g -O3
 CXXFLAGS = --std=c++17 -Wall -Wpedantic -Wextra -Wno-deprecated-copy -Wno-ignored-attributes
 LINKER   = g++
-LDFLAGS  = -L$(CPLEX)/concert/lib/x86-64_linux/static_pic/ -L$(CPLEX)/cplex/lib/x86-64_linux/static_pic/ -lboost_system -lboost_filesystem -lboost_program_options -lgomp -lconcert -lilocplex -lcplex -lpthread -ldl -lRNA -lm
+LDFLAGS  = -Wno-free-nonheap-object -L$(CPLEX)/concert/lib/x86-64_linux/static_pic/ -L$(CPLEX)/cplex/lib/x86-64_linux/static_pic/ -lboost_system -lboost_filesystem -lboost_program_options -lgomp -lconcert -lilocplex -lcplex -lpthread -ldl -lRNA -lm
 # change these to proper directories where each file should be
 SRCDIR   = cppsrc
@@ -31,20 +31,8 @@ $(OBJECTS): $(OBJDIR)/%.o : $(SRCDIR)/%.cpp $(INCLUDES)
 	$(CC) -c $(CFLAGS) $(CXXFLAGS) $< -o $@
 	@echo -e "\033[00;32mCompiled "$<".\033[00m"
-doc: mainpdf supppdf
-	@echo -e "\033[00;32mLaTeX documentation rendered.\033[00m"
-
-mainpdf: doc/main_bioinformatics.tex doc/references.bib doc/bioinfo.cls doc/natbib.bst
-	cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics
-	cd doc; bibtex main_bioinformatics
-	cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics
-	cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics
-
-supppdf: doc/supplementary_material.tex
-	cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error supplementary_material
-
 .PHONY: all
-all: $(BINDIR)/$(TARGET) doc
+all: $(BINDIR)/$(TARGET)
 .PHONY: re
 re: remove clean all
--- a/Readme.md
View file @3b2bbb9
+++ b/Readme.md
View file @3b2bbb9
@@ -19,6 +19,7 @@ THEN
 OUTPUT:
 - A set of secondary structures from the Pareto front,
 - The list of known modules inserted inplace in the corresponding structures
+- A set of positions of the nucleotides in contact with the protein represented by asterisks (only if the motifs_28-05-2021.json library is used!)
 2/ The different models
 ==================================
@@ -28,7 +29,8 @@ Biorseo can be used with two modules datasets (yet):
 * Rna3Dmotifs (from the work of *Djelloul & Denise, 2008*)
 * The RNA 3D Motif Atlas of BGSU's RNA lab (*Petrov et al, 2013*, see http://rna.bgsu.edu/rna3dhub/motifs/)
 * CaRNAval 1.0 (*Reinhartz et al, 2018*)
-* RNA-Bricks 2, RNAMC, CaRNAval 2.0, and others could theoretically be used, but are not supported (yet). You might write your own API.
+* /data/modules/ISAURE/motifs_28-05-2021.json, a library of motifs from RNAs bound to a protein, provided by Isaure Chauvot de Beauchêne of the LORIA laboratory
+ (contact: isaure.chauvot-de-beauchene@loria.fr)
 PATTERN MATCHING STEP
 - Use **simple pattern matching**. Rna3Dmotifs modules are available with sequence information. We use regular expressions to find those known loops in your query. This is the approach of RNA-MoIP (*Reinharz et al, 2012*), we deal the same way with short components and wildcards.
@@ -43,6 +45,8 @@ OBJECTIVE FUNCTIONS FOR THE MODULE INSERTION CRITERIA
 * **Function B** : weights a module by its number of components (strands) and penalizes it by the log^(_2) of its nucleotide size.
 * **Function C** : weights a module by its insertion site score (JAR3D or BayesPairing score).
 * **Function D** : weights a module by its number of components (strands) and insertion site score (JAR3D or BayesPairing score), and penalizes it by the log^(_2) of its nucleotide size.
+* **Function E** : weights a module by its nucleotides in contact with a protein, its number of occurrences and the number of nucleotides in the module.
+* **Function F** : weights a module by its nucleotides in contact with a protein, its number of occurrences and the number of nucleotides along the entire length of the RNA.
 3/ Installation
 ==================================
@@ -55,22 +59,22 @@ Check the file [INSTALL.md](INSTALL.md) for installation instructions.
 - If you **might expect a pseudoknot, or don't know**:
     * The most promising method is the use of direct pattern matching with Rna3Dmotifs and function A. But this method is sometimes subject to combinatorial explosion issues. If you have a long RNA or a large number of loops, don't use it. Example:
-    `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --rna3dmotifs --patternmatch --func A`
+    `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --rna3dmotifs --patternmatch --func A --MEA`
     * The use of the RNA 3D Motif Atlas placed by JAR3D and scored with function A is not subject to combinatorial issues, but performs a bit worse. It also returns less solutions. Example:
-    `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --3dmotifatlas --jar3d --func A
+    `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --3dmotifatlas --jar3d --func A --MEA
 5/ List of Options
 ==================================
 ```
 Usage:  You must provide:
         1) a FASTA input file with -i,
-        2) a module type with --rna3dmotifs, --carnaval or --3dmotifatlas
+        2) a module type with --rna3dmotifs, --carnaval, --3dmotifatlas or --contacts
         3) one module placement method in { --patternmatch, --jar3d, --bayespairing }
-        4) one scoring function with --func A, B, C or D
+        4) one scoring function with --func A, B, C, D, E or F
-
+        5) one estimator between --MEA and --MFE
         If you are not using the Docker image: 
-        5) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir)
+        6) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir)
 Options:
 -h [ --help ]                   Print this help message
@@ -79,16 +83,21 @@ Options:
 --rna3dmotifs                   Use DESC modules from Djelloul & Denise, 2008
 --carnaval                      Use RIN modules from Reinharz & al, 2018
 --3dmotifatlas                  Use the HL and IL loops from BGSU's 3D Motif Atlas (updated)
+--contacts			Use the library of motifs, created from RNA sequences linked to proteins provided by I. Chauvot de Beauchene of LORIA laboratory
 -p [ --patternmatch ]           Use regular expressions to place modules in the sequence (requires --rna3dmotifs or --carnaval)
 -j [ --jar3d ]                  Use JAR3D to place modules in the sequence (requires --3dmotifatlas)
 -b [ --bayespairing ]           Use BayesPairing2 to place modules in the sequence (requires --rna3dmotifs or --3dmotifatlas)
 -o [ --output=… ]               File to summarize the results
 -O [ --outputf=… ]              Folder where to output result and temp files
--f [ --func=… ]                 (A, B, C or D, default is B) Objective function to score module insertions:
+-f [ --func=… ]                 (A, B, C, D, E or F default is B) Objective function to score module insertions:
                                   (A) insert big modules (B) insert light, high-order modules
-                                  (c) insert modules which score well with the sequence
+                                  (C) insert modules which score well with the sequence
                                   (D) insert light, high-order modules which score well with the sequence.
-                                  C and D require cannot be used with --patternmatch.
+                                  C and D cannot be used with --patternmatch.
+				  (E) and (F) insert modules with a lot of nucleotides, a lot of nucleotides in contact with a protein, and a high number of occurrences.
+				  (E) maximizes the number of contact nucleotides inside the module, while (F) maximizes the number of contact nucleotides along the entire length of the RNA.
+--MEA				Use Maximum Expected Accuracy for the second objective
+--MFE				Use Minimum Free Energy based on the formula of (*Legendre et al., 2018*) for the second objective
 -c [ --first-objective=… ]      (default 1) Objective to solve in the mono-objective portions of the algorithm.
                                   (1) is the module objective given by --func, (2) is the expected accuracy of the structure.
 -l [ --limit=… ]                (default 500) Number of solutions in the Pareto set from which
@@ -113,9 +122,9 @@ Options:
                                   BiORSEO from outside the docker image. Use the FULL path.
 Examples:
-biorseo.py -i myRNA.fa -O myResultsFolder/ --rna3dmotifs --patternmatch --func B
+biorseo.py -i myRNA.fa -O myResultsFolder/ --rna3dmotifs --patternmatch --func B --MEA
-biorseo.py -i myRNA.fa -O myResultsFolder/ --3dmotifatlas --jar3d --func B -l 800
+biorseo.py -i myRNA.fa -O myResultsFolder/ --3dmotifatlas --jar3d --func B -l 800 --MEA
-biorseo.py -i myRNA.fa -v --3dmotifatlas --bayespairing --func D
+biorseo.py -i myRNA.fa -v --3dmotifatlas --bayespairing --func D --MEA
 The allowed module/placement-method/function combinations are:
@@ -123,5 +132,6 @@ The allowed module/placement-method/function combinations are:
 --rna3dmotifs     A. B.           A. B. C. D.
 --3dmotifatlas                    A. B. C. D.     A. B. C. D.
 --carnaval        A. B.
+--contacts 	  E. F.
 ```
--- a/benchmark.py
View file @3b2bbb9
+++ b/benchmark.py
View file @3b2bbb9
@@ -29,11 +29,11 @@ import pickle
 # ================== DEFINITION OF THE PATHS ==============================
 biorseoDir = path.realpath(".")
-jar3dexec = "/home/persalteas/Software/jar3dbin/jar3d_2014-12-11.jar"
+jar3dexec = "/local/local/localopt/jar3d_2014-12-11.jar"
 bypdir = biorseoDir + "/BayesPairing/bayespairing/src"
 byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src"
-moipdir = "/home/persalteas/Software/RNAMoIP/Src/RNAMoIP.py"
+moipdir = "/local/local/localopt/RNAMoIP/Src/RNAMoIP.py"
-biokopdir = "/home/persalteas/Software/biokop/biokop"
+biokopdir = "/local/local/localopt/biokop/biokop"
 runDir = path.dirname(path.realpath(__file__))
 bpRNAFile = argv[1]
 PseudobaseFile = argv[2]
@@ -1109,8 +1109,11 @@ def load_from_dbn(file, header_style=3):
 			if not '(' in struct:
 				continue # ignore linear structures
 			if is_canonical_nts(seq) and is_canonical_bps(struct):
+				# keeps what's inside brackets at the end as the filename
 				if header_style == 1: container.append(RNA(header.replace('/', '_').split('(')[-1][:-1], header, seq, struct))
+				# keeps what's inside square brackets at the end as the filename
 				if header_style == 2: container.append(RNA(header.replace('/', '_').split('[')[-1][:-41], header, seq, struct))
+				# keeps all the header as filename
 				if header_style == 3: container.append(RNA(header[1:], header, seq, struct))
 				if '[' in struct: counter += 1
 	db.close()
@@ -1475,8 +1478,8 @@ def print_StudyCase_results():
 if __name__ == '__main__':
 	print("> Loading files...", flush=True)
-	bpRNAContainer, bpRNA_pk_counter = load_from_dbn(bpRNAFile)
+	bpRNAContainer, bpRNA_pk_counter = load_from_dbn(bpRNAFile, header_style=1)
-	PseudobaseContainer, Pseudobase_pk_counter = load_from_dbn(PseudobaseFile)
+	PseudobaseContainer, Pseudobase_pk_counter = load_from_dbn(PseudobaseFile, header_style=3)
 	StudycaseContainer, StudyCase_pk_counter = load_from_dbn(StudyCaseFile, header_style=1)
 	for nt, number in ignored_nt_dict.items():
--- a/biorseo.py deleted 100755 → 0
View file @acf0433
+++ b/biorseo.py deleted 100755 → 0
View file @acf0433
--- a/cppsrc/MOIP.cpp
View file @3b2bbb9
+++ b/cppsrc/MOIP.cpp
View file @3b2bbb9
--- a/cppsrc/MOIP.h
View file @3b2bbb9
+++ b/cppsrc/MOIP.h
View file @3b2bbb9
@@ -37,6 +37,7 @@ class MOIP
 	void                      	forbid_solutions_between(double min, double max);
 	IloEnv&                   	get_env(void);
 	static char               	obj_function_nbr_;    // On what criteria do you want to insert motifs ?
+	static char					obj_function2_nbr_;  // Do you want to use MEA or MFE to determine the best energy score ?
 	static uint               	obj_to_solve_;  // What objective do you prefer to solve in mono-objective portions of the algorithm ?
 	static double             	precision_;   // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =(
 	static bool               	allow_pk_;      // Wether we forbid pseudoknots (false) or allow them (true)
@@ -47,8 +48,12 @@ class MOIP
 	void   						define_problem_constraints(string& source);
 	size_t 						get_yuv_index(size_t u, size_t v) const;
 	size_t 						get_Cpxi_index(size_t x_i, size_t i_on_j) const;
+	size_t 						get_xij_index(size_t u, size_t v) const;
+
 	IloNumExprArg& 				y(size_t u, size_t v);    // Direct reference to y^u_v in basepair_dv_
 	IloNumExprArg& 				C(size_t x, size_t i);    // Direct reference to C_p^xi in insertion_dv_
+	IloNumExprArg& 				x(size_t u, size_t v);    // Direct reference to x_i,j in stacks_dv_
+
 	bool   						exists_vertical_outdated_labels(const SecondaryStructure& s) const;
 	bool   						exists_horizontal_outdated_labels(const SecondaryStructure& s) const;
 	void   						allowed_motifs_from_desc(args_of_parallel_func arg_struct);
@@ -66,12 +71,16 @@ class MOIP
 	IloEnv                 env_;                         // environment CPLEX object
 	IloNumVarArray         basepair_dv_;                 // Decision variables
 	IloNumVarArray         insertion_dv_;                // Decision variables
+	IloNumVarArray         stacks_dv_;                    // Decision variables
+
 	IloModel               model_;                       // Solver for objective 1
 	IloExpr                obj1;                         // Objective function that counts inserted motifs
 	IloExpr                obj2;                         // Objective function of expected accuracy
 	vector<vector<size_t>> index_of_Cxip_;               // Stores the indexes of the Cxip in insertion_dv_
 	vector<size_t>         index_of_first_components;    // Stores the indexes of Cx1p in insertion_dv_
 	vector<vector<size_t>> index_of_yuv_;                // Stores the indexes of the y^u_v in basepair_dv_
+
+	vector<vector<size_t>>   index_of_xij_;		         //Stores the indexes of the xij variables (BioKop) in stacks_dv_
 };
 inline uint                      MOIP::get_n_solutions(void) const { return pareto_.size(); }
@@ -79,6 +88,8 @@ inline uint                      MOIP::get_n_candidates(void) const { return ins
 inline const SecondaryStructure& MOIP::solution(uint i) const { return pareto_[i]; }
 inline IloNumExprArg&            MOIP::y(size_t u, size_t v) { return basepair_dv_[get_yuv_index(u, v)]; }
 inline IloNumExprArg&            MOIP::C(size_t x, size_t i) { return insertion_dv_[get_Cpxi_index(x, i)]; }
+inline IloNumExprArg&            MOIP::x(size_t u, size_t v) { return stacks_dv_[get_xij_index(u, v)]; }    // element of stacks_dv_ located at the index returned by get_xij_index(u, v)
+
 inline SecondaryStructure        MOIP::solve_objective(int o) { return solve_objective(o, 0, rna_.get_RNA_length()); }
 inline IloEnv&                   MOIP::get_env(void) { return env_; }
--- a/cppsrc/Motif.cpp
View file @3b2bbb9
+++ b/cppsrc/Motif.cpp
View file @3b2bbb9
--- a/cppsrc/Motif.h
View file @3b2bbb9
+++ b/cppsrc/Motif.h
View file @3b2bbb9
@@ -20,13 +20,7 @@ typedef struct Comp_ {
     pair<uint, uint> pos;
     size_t           k;
     string           seq_;
-    uint             nb_pairing;         
     Comp_(pair<int, int> p) : pos(p) { k = 1 + pos.second - pos.first; }
-    Comp_(pair<int, int> p, uint nb_pair) : pos(p) 
-    { 
-        k = 1 + pos.second - pos.first; 
-        nb_pairing = nb_pair;
-    }
     Comp_(uint start, uint length) : k(length)
     {
         pos.first  = start;
@@ -64,6 +58,7 @@ class Motif
     string            get_identifier(void) const;
     vector<Component> comp;
     vector<Link>      links_;
+    vector<uint>      pos_contacts;
     size_t            contact_;
     double            tx_occurrences_;
@@ -89,7 +84,19 @@ vector<Motif>               load_csv(const string& path);
 vector<Motif>               load_json_folder(const string& path, const string& rna, bool verbose);
 vector<vector<Component>>   find_next_ones_in(string rna, uint offset, vector<string>& vc);
-vector<vector<Component>>   json_find_next_ones_in(string rna, uint offset, vector<string>& vc, vector<string>& vs);
+vector<vector<Component>>   json_find_next_ones_in(string rna, uint offset, vector<string>& vc);
+
+// utilities for Json motifs
+size_t count_nucleotide(string&);
+size_t count_delimiter(string&);
+size_t count_contacts(string&);
+string check_motif_sequence(string);
+bool checkSecondaryStructure(string);
+vector<Link> build_motif_pairs(string&, vector<Component>&);
+uint find_max_occurrences(string&);
+uint find_max_sequence(string&);
+vector<string> find_components(string&, string);
+vector<uint> find_contacts(vector<string>&, vector<Component>&);
 // utilities to compare secondary structures:
 bool operator==(const Motif& m1, const Motif& m2);
--- a/cppsrc/Scripts/count_pattern.cpp deleted 100644 → 0
View file @acf0433
+++ b/cppsrc/Scripts/count_pattern.cpp deleted 100644 → 0
View file @acf0433
--- a/cppsrc/Scripts/delete_same_pdb.cpp deleted 100644 → 0
View file @acf0433
+++ b/cppsrc/Scripts/delete_same_pdb.cpp deleted 100644 → 0
View file @acf0433
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
-#include <typeinfo>
-#include <set>
-#include <algorithm>
-#include <cstdio>
-#include <vector>
-
-using namespace std;
-using json = nlohmann::json;
-
-// Filters a JSON motif library against a benchmark set.
-//
-// jsonfile    : path of the motif library (object mapping motif id -> motif fields).
-// jsontest    : path of the benchmark file, same layout.
-// jsonoutfile : path of the JSON file to write (pretty-printed, 4-space indent).
-//
-// A library motif is kept only if none of the entries of its "pdb" list appears in
-// the "pdb" list of any benchmark motif. Kept motifs are written with all of their
-// fields; "pdb" is always emitted (as an empty list when the motif had none).
-//
-// Fix over the previous version: the benchmark lists are now read from js2. The old
-// code indexed the library (js[id2]) with benchmark ids, which never looked at the
-// benchmark's pdb lists and, through the non-const operator[], inserted null entries
-// into the library while it was being iterated.
-void delete_redundant_pdb(const string& jsonfile, const string& jsontest, const string& jsonoutfile) {
-    std::ifstream lib(jsonfile);
-    std::ifstream lib2(jsontest);
-    std::ofstream outfile(jsonoutfile);
-
-    // Both inputs are const so that no lookup can silently insert a key.
-    const json js  = json::parse(lib);
-    const json js2 = json::parse(lib2);
-    json new_motif;
-
-    for (auto it = js.begin(); it != js.end(); ++it) {
-        const string id = it.key();
-        vector<string> list_pdbs;    // "pdb" list of the current library motif
-        json new_id;                 // every other field of the current library motif
-
-        for (auto it2 = it.value().begin(); it2 != it.value().end(); ++it2) {
-            if (it2.key() == "pdb") {
-                list_pdbs = it2.value().get<vector<string>>();
-            } else {
-                new_id[it2.key()] = it2.value();
-            }
-        }
-
-        // Reject the motif as soon as one benchmark motif shares a pdb with it.
-        bool is_added = true;
-        for (auto it3 = js2.begin(); it3 != js2.end() && is_added; ++it3) {
-            const json& tested = it3.value();
-            const auto pdb_field = tested.find("pdb");
-            if (pdb_field == tested.end()) continue;    // benchmark motif without a pdb list
-
-            const vector<string> list_pdbs2 = (*pdb_field).get<vector<string>>();
-            for (const string& pdb : list_pdbs2) {
-                if (std::find(list_pdbs.begin(), list_pdbs.end(), pdb) != list_pdbs.end()) {
-                    is_added = false;
-                    break;
-                }
-            }
-        }
-
-        if (is_added) {
-            new_id["pdb"] = list_pdbs;
-            new_motif[id] = new_id;
-        }
-    }
-    outfile << new_motif.dump(4) << endl;
-    outfile.close();
-}
-
-// Entry point of the script: filters the hard-coded motif library against the
-// hard-coded benchmark file and writes the result to the hard-coded output path.
-int main()
-{
-    const string library_path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/bibli_test2.json";
-    const string benchmark_path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark_test.json";
-    const string output_path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_derniere_version/motifs_final_test.json";
-
-    delete_redundant_pdb(library_path, benchmark_path, output_path);
-    return 0;
-}
-    
--- a/cppsrc/SecondaryStructure.cpp
View file @3b2bbb9
+++ b/cppsrc/SecondaryStructure.cpp
View file @3b2bbb9
@@ -3,11 +3,13 @@
 #include <algorithm>
 #include <boost/format.hpp>
+#define RESET   "\033[0m"
+#define RED     "\033[31m"      /* Red */
+
 using std::abs;
 using std::cout;
 using std::endl;
-
 SecondaryStructure::SecondaryStructure() {}
@@ -98,6 +100,26 @@ string SecondaryStructure::to_DBN(void) const
     return res;
 }
+// Builds a mask as long as the RNA sequence of `ss`: position i holds '*' when i is
+// listed in the pos_contacts of at least one motif of ss.motif_info_, '.' otherwise.
+string structure_with_contacts(const SecondaryStructure& ss) {
+    const string nucleotides = ss.rna_.get_seq();
+    string mask;
+    for (uint pos = 0; pos < nucleotides.size(); pos++) {
+        bool in_contact = false;
+        // Every motif is scanned in full; a single match is enough to mark the position.
+        for (const Motif& motif : ss.motif_info_)
+            for (const uint contact : motif.pos_contacts)
+                if (contact == pos) in_contact = true;
+        mask += in_contact ? "*" : ".";
+    }
+    return mask;
+}
+
 string SecondaryStructure::to_string(void) const
 {
     string s;
@@ -119,13 +141,35 @@ void SecondaryStructure::set_basepair(uint i, uint j)
 void SecondaryStructure::insert_motif(const Motif& m) { motif_info_.push_back(m); }
-
+// Prints `sequence` on stdout character by character, wrapping in RED ... RESET every
+// character whose index is listed in the pos_contacts of one of the motifs in
+// `motif_info_`. No trailing newline is written.
+//
+// sequence    : text to print (an RNA sequence or its structure string).
+// motif_info_ : motifs whose pos_contacts give the indices to highlight.
+//
+// Both arguments are taken by const reference: they are only read, and passing the
+// motif vector by value deep-copied every motif on each call.
+void colored_contacts(const string& sequence, const vector<Motif>& motif_info_) {
+    for (uint i = 0; i < sequence.size(); i++) {
+        bool in_contact = false;
+        for (const Motif& m : motif_info_) {
+            for (const uint contact : m.pos_contacts) {
+                if (contact == i) {
+                    in_contact = true;
+                    break;
+                }
+            }
+            if (in_contact) break;    // one matching motif is enough
+        }
+        if (in_contact) {
+            cout << RED << sequence[i] << RESET;
+        } else {
+            cout << sequence[i];
+        }
+    }
+}
 void SecondaryStructure::print(void) const
 {
     cout << endl;
-    cout << '\t' << rna_.get_seq() << endl;
+    cout << '\t';
-    cout << '\t' << to_string() << endl;
+    colored_contacts(rna_.get_seq(), motif_info_);
+    //rna_.get_seq() 
+    cout << endl;
+    string ss = to_string();
+    cout << '\t';
+    colored_contacts(ss, motif_info_);
+    //cout << ss;
+    cout << endl;
     for (const Motif& m : motif_info_) {
         uint i = 0;
         cout << '\t';
--- a/cppsrc/SecondaryStructure.h
View file @3b2bbb9
+++ b/cppsrc/SecondaryStructure.h
View file @3b2bbb9
@@ -30,7 +30,6 @@ class SecondaryStructure
     string to_DBN() const;
     string to_string() const;
-
     vector<double> objective_scores_;       // values of the different objective functions for that SecondaryStructure
     vector<pair<uint, uint>> basepairs_;    // values of the decision variable of the integer program
     vector<Motif> motif_info_;    // information about known motives in this secondary structure and their positions
@@ -58,5 +57,7 @@ inline void   SecondaryStructure::set_objective_score(int i, double s) { objecti
 inline uint   SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); }
 inline uint   SecondaryStructure::get_n_bp(void) const { return nBP_; }
+string structure_with_contacts(const SecondaryStructure& ss);
+
 #endif    //  SECONDARY_STRUCTURE_
\ No newline at end of file
--- a/cppsrc/biorseo.cpp
View file @3b2bbb9
+++ b/cppsrc/biorseo.cpp
View file @3b2bbb9
--- a/cppsrc/program 0 → 100644
View file @3b2bbb9
+++ b/cppsrc/program 0 → 100644
View file @3b2bbb9
--- a/cppsrc/rna.cpp
View file @3b2bbb9
+++ b/cppsrc/rna.cpp
View file @3b2bbb9
@@ -58,12 +58,49 @@ RNA::RNA(string name, string seq, bool verbose)
 			pij_(results->i-1,results->j-1) = results->p;
 			results++;
 		}
+
+		/*define type_*/
+		type_ = vector<vector<int>>(n_, vector<int>(n_));
+		for(uint i = 0; i < n_; i++){
+			for(uint j = 0; j < n_; j++){
+				if (i < j){
+					std::stringstream ss;
+					ss << seq_[i] << seq_[j];
+					std::string str = ss.str();
+					if(str.compare("AU") == 0 ){
+						type_[i][j] = 1;
+					}
+					else if(str.compare("CG") == 0 ){
+						type_[i][j] = 2;
+					
+					}
+					else if(str.compare("GC") == 0 ){
+						type_[i][j] = 3;
+					}
+					else if(str.compare("GU") == 0 ){
+						type_[i][j] = 4;
+					}
+					else if(str.compare("UG") == 0 ){
+						type_[i][j] = 5;
+					}
+					else if(str.compare("UA") == 0 ){
+						type_[i][j] = 6;
+					}
+					else{
+						type_[i][j] = 0;
+					}
+				}
+				else{
+					type_[i][j] = 0;
+				}
+			}
+		}
+
 	}
 	else cerr << "NULL result returned by vrna_pfl_fold" << endl;
 }
-
 void RNA::print_basepair_p_matrix(float theta) const
 {
 	cout << endl;
--- a/cppsrc/rna.h
View file @3b2bbb9
+++ b/cppsrc/rna.h
View file @3b2bbb9
@@ -32,6 +32,8 @@ class RNA
     uint   get_RNA_length(void) const;
     void   print_basepair_p_matrix(float theta) const;
+    vector<vector<int>> get_type();
+
     bool verbose_;    // Should we print things ?
     private:
@@ -41,10 +43,15 @@ class RNA
     string   seq_;     // sequence of the rna with chars
     uint     n_;       // length of the rna
     MatrixXf pij_;     // matrix of basepair probabilities
+
+    vector<vector<int>> type_;  //vector of base pair types
 };
 inline float  RNA::get_pij(int i, int j) { return pij_(i, j); }
 inline uint   RNA::get_RNA_length() const { return n_; }
 inline string RNA::get_seq(void) const { return seq_; }
+inline vector<vector<int>>  RNA::get_type() { return type_; }
+
+
 #endif
--- a/data/fasta/applications.fa 100644 → 100755
View file @3b2bbb9
+++ b/data/fasta/applications.fa 100644 → 100755
View file @3b2bbb9
->__'CRYSTAL_STRUCTURE_OF_A_TIGHT-BINDING_GLUTAMINE_TRNA_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE_'_(PDB_00376)
+>test_CRYSTAL_STRUCTURE_OF_A_TIGHT-BINDING_GLUTAMINE_TRNA_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE__PDB_00376
-GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA
+GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA
->__'GUANINE_RIBOSWITCH_U22C,_A52G_MUTANT_BOUND_TO_HYPOXANTHINE_'_(PDB_01023)
+>test_GUANINE_RIBOSWITCH_U22C,_A52G_MUTANT_BOUND_TO_HYPOXANTHINE__PDB_01023
-GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa
+GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa
->__'SOLUTION_STRUCTURE_OF_THE_P2B-P3_PSEUDOKNOT_FROM_HUMAN_TELOMERASE_RNA_'_(PDB_00857)
+>test_SOLUTION_STRUCTURE_OF_THE_P2B-P3_PSEUDOKNOT_FROM_HUMAN_TELOMERASE_RNA__PDB_00857
-GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA
+GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA
\ No newline at end of file
--- a/data/fasta/example.fa 100644 → 100755
View file @3b2bbb9
+++ b/data/fasta/example.fa 100644 → 100755
View file @3b2bbb9
--- a/data/fasta/motif_test.fa deleted 100644 → 0
View file @acf0433
+++ b/data/fasta/motif_test.fa deleted 100644 → 0
View file @acf0433
-> JSON1000_extended
-AAUAUCCGGGCGUUUAAUCCCGGGAUAAA
\ No newline at end of file
--- a/data/modules/ISAURE/Readme.md 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/Readme.md 0 → 100644
View file @3b2bbb9
+The motif library used with --contacts is a specific one. It was provided by Isaure Chauvot de Beauchêne from the LORIA 
+laboratory. Its motifs are RNA fragments that are bound to proteins.
+==================================================================================================================
+
+Several versions of this motif library have been provided; the most complete is the latest one, 'motifs_06-06-2021.json'.
+The current scripts were written for this file and do not work with the older libraries.
+
+There are also two benchmark files, also in JSON format: 'benchmark_16-06-2021.json' and 'benchmark_16-07-2021.json'.
+They contain complete RNA sequences that bind to a protein: the first one contains only 33 RNAs, and the second one 
+contains 130 RNAs.
+
+The benchmark.dbn and benchmark.txt files were created from 'benchmark_16-07-2021.json'. 
+They are mostly used by the Isaure_benchmark.py script and by the scripts in the 'scripts' directory.
+
+The motifs_final.json file is obtained by running the count_pattern.cpp script (in the 'scripts' directory) on
+the 'motifs_06-06-2021.json' motif file.
+This script counts the number of "occurrences" of each motif: if the sequence of motif A 
+is included in motif B, then each occurrence of B is also counted as an occurrence of A, and vice versa.
+
+The motif library used by BiORSEO is the one in the 'bibliotheque_a_lire' directory. This directory should only
+contain the JSON file that BiORSEO must use for its prediction, so do not put any other type of file there!
+
+
+
+
+
+
--- a/data/modules/ISAURE/benchmark.dbn 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/benchmark.dbn 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/benchmark.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/benchmark.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/benchmark.txt 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/benchmark.txt 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/benchmark_16-06-2021.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/benchmark_16-06-2021.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/benchmark_16-07-2021.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/benchmark_16-07-2021.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/motifs_01-06-2021.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/motifs_01-06-2021.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/motifs_06-06-2021.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/motifs_06-06-2021.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/motifs_28-05-2021.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/motifs_28-05-2021.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/motifs_final.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/motifs_final.json 0 → 100644
View file @3b2bbb9
--- a/data/sec_structs/RNAMoIP_dataset.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/RNAMoIP_dataset.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/applications.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/applications.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/bpRNA-1m_90_short.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/bpRNA-1m_90_short.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/example.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/example.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/nothing.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/nothing.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/pseudobase++_short.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/pseudobase++_short.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/pseudoknots.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/pseudoknots.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/secondary_structures_database.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/secondary_structures_database.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/verified_secondary_structures_database.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/verified_secondary_structures_database.dbn 100644 → 100755
View file @3b2bbb9
--- a/doc/Benchmark_unconstrained.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/Benchmark_unconstrained.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/Nmotifs.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/Nmotifs.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/Nsol.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/Nsol.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/OUP_First_SBk_Bot_8401.eps deleted 100644 → 0
View file @acf0433
+++ b/doc/OUP_First_SBk_Bot_8401.eps deleted 100644 → 0
View file @acf0433
--- a/doc/algorithm2e.sty deleted 100644 → 0
View file @acf0433
+++ b/doc/algorithm2e.sty deleted 100644 → 0
View file @acf0433
--- a/doc/bioinfo.cls deleted 100644 → 0
View file @acf0433
+++ b/doc/bioinfo.cls deleted 100644 → 0
View file @acf0433
--- a/doc/fig/Benchmark_avg.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/Benchmark_avg.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/fig/MOIP_subopt.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/MOIP_subopt.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/fig/RNA_SSE.png deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/RNA_SSE.png deleted 100644 → 0
View file @acf0433
--- a/doc/fig/kernels_B.png deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/kernels_B.png deleted 100644 → 0
View file @acf0433
--- a/doc/fig/kernels_C.png deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/kernels_C.png deleted 100644 → 0
View file @acf0433
--- a/doc/fig/kernels_D.png deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/kernels_D.png deleted 100644 → 0
View file @acf0433
--- a/doc/fig/pseudoknots.png deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/pseudoknots.png deleted 100644 → 0
View file @acf0433
--- a/doc/graph_abstract.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/graph_abstract.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/kernels_A.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/kernels_A.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/main_bioinformatics.tex deleted 100644 → 0
View file @acf0433
+++ b/doc/main_bioinformatics.tex deleted 100644 → 0
View file @acf0433
--- a/doc/supplementary_materials.tex deleted 100644 → 0
View file @acf0433
+++ b/doc/supplementary_materials.tex deleted 100644 → 0
View file @acf0433
--- a/scripts/Compare_energy_results.py 0 → 100644
View file @3b2bbb9
+++ b/scripts/Compare_energy_results.py 0 → 100644
View file @3b2bbb9
+from math import sqrt, ceil
+import numpy as np
+import matplotlib.pyplot as plt
+import re
+import seaborn as sns
+import pandas as pd
+import matplotlib.pylab as plt
+
+# For each RNA, retrieve the best MEA value found by BiORSEO and compare it with the energies obtained with
+# RNAeval and RNAfold from the ViennaRNA Package 2.0 (Ronny Lorenz et al., 2011).
+# Once those values are collected, the script creates a figure.
+def get_result_MEA(filename, results_dir="/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/results/"):
+    """Return the lowest negated score found in a BiORSEO result file.
+
+    The file ``results_dir + filename + "json_pmE"`` is read as two header
+    lines (name, sequence) followed by alternating structure / contacts
+    lines. The last whitespace-separated field of every structure line is
+    prefixed with '-' and converted to float; the minimum is returned.
+
+    :param filename: result file name without its "json_pmE" suffix.
+    :param results_dir: directory prefix, concatenated as-is (must end with
+        a path separator). Defaults to the original hard-coded location.
+    :raises ValueError: if no structure line carries a score, or if a score
+        cannot be converted to float.
+    """
+    ext = "json_pmE"
+    path = results_dir + filename + ext
+    best = None
+    # The context manager closes the file even when a score fails to parse.
+    with open(path, "r") as result_file:
+        result_file.readline()  # name line, unused
+        result_file.readline()  # sequence line, unused
+        for index, line in enumerate(result_file):
+            if index % 2 == 1:
+                continue  # contacts line: carries no score
+            fields = line.split()
+            if not fields:
+                continue  # blank structure line: nothing to read
+            score = float('-' + fields[-1])
+            if best is None or score < best:
+                best = score
+    if best is None:
+        raise ValueError("no scored structure found in " + path)
+    return best
+
+def _parse_energy(struct_line):
+    """Return the energy written in parentheses at the end of a structure line.
+
+    Matching the whole '(<number>)' suffix, instead of slicing a fixed six
+    characters, keeps the sign of energies that are seven characters wide
+    (-100.00 and below) and also works when the line has no trailing newline.
+    """
+    match = re.search(r'\(\s*(-?\d+(?:\.\d+)?)\s*\)\s*$', struct_line)
+    if match is None:
+        raise ValueError("no energy found in line: " + repr(struct_line))
+    return float(match.group(1))
+
+
+def _read_energy(log_file, rna_line, seq_line, struct_line):
+    """Find the energy of rna_line in a log read two lines at a time.
+
+    (seq_line, struct_line) is the pair currently loaded from log_file. Pairs
+    are skipped until seq_line equals rna_line. Returns the parsed energy and
+    the next pair of lines, ready for the following lookup.
+    """
+    while seq_line != rna_line:
+        if not seq_line:
+            # End of file: without this check the search would never terminate.
+            raise ValueError("sequence not found in " + log_file.name + ": " + rna_line.strip())
+        seq_line = log_file.readline()
+        struct_line = log_file.readline()
+    return _parse_energy(struct_line), log_file.readline(), log_file.readline()
+
+
+list_name = []
+list_score = []
+list_type = []
+
+with open("/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/RNAfold_bm.log", "r") as fileMFE, \
+        open("/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/RNAeval_bm.log", "r") as fileEval, \
+        open("/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn", "r") as file:
+    lineRna = fileMFE.readline()
+    lineStruct = fileMFE.readline()
+    lineRna2 = fileEval.readline()
+    lineStruct2 = fileEval.readline()
+
+    # Each benchmark record is four lines: name, sequence, 2D structure, contacts.
+    name = file.readline().strip()
+    while name:
+        rna = file.readline()
+        file.readline()  # 2D structure, unused here
+        file.readline()  # contacts, unused here
+        # Drop the first five characters and the last one of the name for the plot label.
+        label = name[5:len(name) - 1]
+
+        MFE, lineRna, lineStruct = _read_energy(fileMFE, rna, lineRna, lineStruct)
+        list_name.append(label)
+        list_score.append(MFE)
+        list_type.append('MFE')
+
+        eval_energy, lineRna2, lineStruct2 = _read_energy(fileEval, rna, lineRna2, lineStruct2)
+        list_name.append(label)
+        list_score.append(eval_energy)
+        list_type.append('eval')
+
+        best_mea = get_result_MEA(name)
+        list_name.append(label)
+        list_score.append(best_mea)
+        list_type.append('MEA')
+
+        name = file.readline().strip()
+
+# One point per (RNA, score type); the three score types are distinguished by colour.
+d = {'rna':list_name,'score':list_score, 'type_score':list_type}
+df = pd.DataFrame(d, columns=['rna','type_score','score'])
+
+sns.stripplot(x="rna",y="score",data=df,jitter=True,hue='type_score',palette='Set1')
+plt.xticks(rotation=90)
+plt.savefig("compare_BiORSEOMEA_RNAeval_RNAfold.png")
+
+
--- a/scripts/add_delimiter.cpp 0 → 100644
View file @3b2bbb9
+++ b/scripts/add_delimiter.cpp 0 → 100644
View file @3b2bbb9
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
+#include <typeinfo>
+#include <set>
+#include <algorithm>
+#include <cstdio>
+#include <vector>
+
+using namespace std;
+using json = nlohmann::json;
+
+/**
+ * Count the number of '&' characters in a motif sequence.
+ *
+ * @param seq the string to scan; it is not modified.
+ * @return how many characters of seq are equal to '&' (0 for an empty string).
+ */
+size_t count_delimiter(const std::string& seq) {
+    return static_cast<size_t>(std::count(seq.begin(), seq.end(), '&'));
+}
+
+/*
+If a motif has '&' delimiters in its 'sequence' field that are missing from its 'contacts' field,
+this script puts a '&' in the 'contacts' field at the same positions as in the 'sequence' field.
+
+jsonfile    : path of the motif library to read (a JSON object mapping each motif id to its fields).
+jsonoutfile : path of the corrected library to write (dumped with a 4-space indent).
+
+The contacts are only rewritten when both fields have the same length but a different number of '&';
+otherwise they are copied unchanged. Every other field is copied as-is.
+*/
+void add_delimiter(const string& jsonfile, const string& jsonoutfile) {
+    std::ifstream lib(jsonfile);
+    if (!lib) {
+        cerr << "Cannot open motif library " << jsonfile << endl;
+        return;
+    }
+    // Parse before creating the output, so that a parse failure cannot leave a truncated output file behind.
+    const json js = json::parse(lib);
+
+    json new_motif;
+    for (auto it = js.begin(); it != js.end(); ++it) {
+        const string id = it.key();
+        string sequence;
+        string contacts;
+        json new_id;
+
+        for (auto it2 = it.value().begin(); it2 != it.value().end(); ++it2) {
+            const string field = it2.key();
+            if (field == "contacts") {
+                // Written back below, once the delimiters have been checked.
+                contacts = it2.value().get<string>();
+                continue;
+            }
+            if (field == "sequence") {
+                sequence = it2.value().get<string>();
+            }
+            new_id[field] = it2.value();
+        }
+
+        // Same length but a different number of '&': copy the delimiters of the sequence into the contacts.
+        if (contacts.size() == sequence.size() && count_delimiter(contacts) != count_delimiter(sequence)) {
+            for (size_t i = 0; i < sequence.size(); i++) {
+                if (sequence[i] == '&') {
+                    contacts[i] = '&';
+                }
+            }
+        }
+        new_id["contacts"] = contacts;
+        new_motif[id] = new_id;
+    }
+
+    std::ofstream outfile(jsonoutfile);
+    if (!outfile) {
+        cerr << "Cannot write motif library " << jsonoutfile << endl;
+        return;
+    }
+    outfile << new_motif.dump(4) << endl;
+}
+
+// Usage: add_delimiter [input.json [output.json]]
+// Without arguments, the original hard-coded library paths are used.
+int main(int argc, char** argv)
+{
+    string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json";
+    string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_tmp.json";
+    if (argc > 1) {
+        jsonfile = argv[1];
+    }
+    if (argc > 2) {
+        out = argv[2];
+    }
+    add_delimiter(jsonfile, out);
+    return 0;
+}
+    
--- a/scripts/benchmark.py
View file @3b2bbb9
+++ b/scripts/benchmark.py
View file @3b2bbb9
@@ -29,7 +29,7 @@ import pickle
 # ================== DEFINITION OF THE PATHS ==============================
 biorseoDir = path.realpath(".")
-jar3dexec = "/home/persalteas/Software/jar3dbin/jar3d_2014-12-11.jar"
+jar3dexec = "/local/local/localopt/jar3d_2014-12-11.jar"
 bypdir = biorseoDir + "/BayesPairing/bayespairing/src"
 byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src"
 moipdir = "/home/persalteas/Software/RNAMoIP/Src/RNAMoIP.py"
@@ -803,7 +803,7 @@ class Method:
 			else:
 				results_file = outputDir+f"{'' if self.allow_pk else 'no'}PK/"+basename+f".biorseo_{self.data_source.lower()}_{self.placement_method.lower()}_{self.func}"
 				c += ["--bayespaircsv", outputDir+basename+f".{self.data_source.lower()}_{self.placement_method.lower()}.csv"]
-			c += ["-o", results_file, "--func", self.func]
+			c += ["-o", results_file, "--func", self.func, "--MFE"]
 			if not self.allow_pk:
 				c += ["-n"]
 			self.joblist.append(Job(command=c, priority=4, timeout=3600, 
--- a/cppsrc/Comptage des occurences/count_pattern.cpp → scripts/count_pattern.cpp
View file @3b2bbb9
+++ b/cppsrc/Comptage des occurences/count_pattern.cpp → scripts/count_pattern.cpp
View file @3b2bbb9
@@ -11,6 +11,12 @@
 using namespace std;
 using json = nlohmann::json;
+/*
+This script counts the number of "occurrences" of each motif.
+If the sequence of pattern A is included in pattern B,
+then each occurrence of B is also counted as an occurrence of A, and vice versa.
+*/
+
 //Return true if the first sequence seq1 is included in the second sequence seq2
 //if not return false
 int is_contains(string& seq1, string& seq2) {
@@ -38,6 +44,8 @@ int is_contains(string& seq1, string& seq2) {
 //If we find the sequence and structure of pattern A in pattern B, we have to concatenate the pfam lists of A and B,
 //remove the duplicates, assign this new list of pfam lists to A, and assign as occurrence to A the size of this list.
+//The pattern A is counted only once in every other pattern, i.e. even if the sequence of A is found several times in B,
+// it will be added only once in the occurrences of A.
 void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
     std::ifstream lib(jsonfile);
     std::ifstream lib2(jsonfile);
@@ -73,14 +81,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
             if (!test.compare("pfam")) {
                 vector<vector<string>> tab = it2.value();
                 list_pfams = tab;
-                /*set<set<string>>::iterator iit;
-                set<string>::iterator iit2;
-                for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) {
-                    for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) {
-                        cout << *iit2 << endl;
-                    }
-                    cout << endl << endl;
-                }*/
             } else if (!test.compare("sequence")) {
                 //cout << "sequence: " << it2.value() << endl;
                 sequence = it2.value();
@@ -124,7 +124,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
                 new_id[test] = it2.value();
             }  
         }
-        //cout << "-------begin---------" << endl;
         for (auto it3 = js2.begin(); it3 != js2.end(); ++it3) {
             string id2 = it3.key();
@@ -142,22 +141,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
                     if (!test.compare("pfam")) {
                         vector<vector<string>> tab = it4.value();
                         list_pfams2 = tab;
-                        /*for (uint k = 0; k < tab2.size(); k++) {
-                            for (uint l = 0; l < tab2[k].size(); l++) {
-                                pfams2.insert(tab2[k][l]);
-                            }
-                            list_pfams2.insert(pfams);
-                            pfams2.clear();
-                        }*/
-            
-                        /*set<set<string>>::iterator iit;
-                        set<string>::iterator iit2;
-                        for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) {
-                            for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) {
-                                cout << *iit2 << endl;
-                            }
-                            cout << endl << endl;
-                        }*/
                     } else if (!test.compare("occurences")) {
                         occurences2 = it4.value();
                         //cout << "occurences2: "<< occurences2 << endl;
@@ -216,7 +199,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
                                         }    
                                     }
-                                    //cout << "----end----" << endl;
                                 //}
                             }
                             if(flag) {
@@ -242,23 +224,12 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
             //cout << endl;*/
         }
-       
-        /*for(uint ii = 0; ii < list_pfams.size(); ii++) {
-            for (uint jj = 0; jj < list_pfams[ii].size(); jj++) {
-                cout << "[" << ii << "][" << jj << "]: " << list_pfams[ii][jj] << endl;
-            }
-        }*/
         new_id["occurences"] = list_pfams.size();
-        new_id["pfam"] = list_pfams;
+        new_id["pfam"] = list_pfams;        
-                        
-        //cout << "-------ending---------" << endl;
         new_motif[id] = new_id;
         new_id.clear();
-        //cout << "valeur: " << ite << endl;
+
-        /*for (uint i = 0; i < tab_struc.size() ; i++) {
-        cout << "tab_struc[" << i << "]: " << tab_struc[i] << endl << endl;
-        } */
     }
     outfile << new_motif.dump(4) << endl;
     outfile.close();
@@ -267,13 +238,11 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
 int main()
 {
-    //183
+
-    //cout << "------------------BEGIN-----------------" << endl;
+    string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json";
-    string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/motifs_06-06-2021.json";
+    string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json";
-    string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_derniere_version/motifs_final.json";
     counting_occurences(jsonfile, out);
-    //cout << "------------------END-----------------" << endl;
     return 0;
 }
--- a/scripts/create_files.cpp 0 → 100644
View file @3b2bbb9
+++ b/scripts/create_files.cpp 0 → 100644
View file @3b2bbb9
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
+#include <typeinfo>
+#include <set>
+#include <algorithm>
+#include <cstdio>
+#include <vector>
+
+using namespace std;
+using json = nlohmann::json;
+
+/*
+Create a FASTA (.fa) file for each sequence of the benchmark given in JSON format.
+Also create a .dbn and a .txt file that list the name, sequence, 2D structure and contacts of every
+sequence of the benchmark that contains no '&'.
+Those files are useful for the Isaure_benchmark.py script.
+
+jsonmotifs : path of the benchmark JSON file. Each top-level entry maps chain names to an object holding
+             the 'sequence', 'contacts' and 'struct2d' fields; the 'pfams' key of an entry is ignored.
+The number of sequences written to the .dbn/.txt files is printed on the standard output.
+*/
+void create_files(const string& jsonmotifs) {
+    std::ifstream lib(jsonmotifs);
+    if (!lib) {
+        cerr << "Cannot open benchmark file " << jsonmotifs << endl;
+        return;
+    }
+    // Parse before opening the outputs, so that a parse failure does not truncate existing files.
+    const json js = json::parse(lib);
+
+    string fasta = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/fasta/";
+    string list = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.txt";
+    string dbn = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn";
+    std::ofstream outlist (list);
+    std::ofstream outdbn (dbn);
+    size_t count = 0;
+
+    for (auto it = js.begin(); it != js.end(); ++it) {
+        const string id = it.key();
+        for (auto it2 = it.value().begin(); it2 != it.value().end(); ++it2) {
+            const string chain = it2.key();
+            if (chain == "pfams") {
+                continue;
+            }
+
+            // Declared per chain, so that a chain lacking a field never inherits the value of the previous chain.
+            string seq, contacts, structure;
+            bool has_sequence = false;
+            for (auto it3 = it2.value().begin(); it3 != it2.value().end(); ++it3) {
+                const string field = it3.key();
+                if (field == "sequence") {
+                    seq = it3.value().get<string>();
+                    has_sequence = true;
+                } else if (field == "contacts") {
+                    contacts = it3.value().get<string>();
+                } else if (field == "struct2d") {
+                    structure = it3.value().get<string>();
+                }
+            }
+
+            // One FASTA file per chain: the header is always written, the sequence only when the field exists.
+            const string name = id + "_" + chain;
+            std::ofstream outfasta (fasta + name + ".fa");
+            outfasta << ">test_" << name << endl;
+            if (has_sequence) {
+                outfasta << seq << endl;
+            }
+
+            // Only sequences without a '&' are listed in the .txt and .dbn files.
+            if (seq.find('&') == string::npos) {
+                outlist << ">test_" << name << endl;
+                outlist << contacts << endl;
+                outlist << seq << endl;
+                outlist << structure << endl;
+
+                outdbn << "test_" << name << "." << endl;
+                outdbn << seq << endl;
+                outdbn << structure << endl;
+                outdbn << contacts << endl;
+                count++;
+            }
+        }
+    }
+    cout << count << " sequences en tout" << endl;
+}
+
+// Usage: create_files [benchmark.json]
+// Without argument, the original hard-coded benchmark file is used.
+int main(int argc, char** argv)
+{
+    string path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/";
+    string jsonbm = path + "modules/ISAURE/benchmark_16-07-2021.json"; 
+    if (argc > 1) {
+        jsonbm = argv[1];
+    }
+    create_files(jsonbm);
+
+    return 0;
+}
+    
--- a/scripts/delete_same_pdb.cpp 0 → 100644
View file @3b2bbb9
+++ b/scripts/delete_same_pdb.cpp 0 → 100644
View file @3b2bbb9
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include "/local/local/BiorseoNath/cppsrc/json.hpp"
+#include <typeinfo>
+#include <set>
+#include <algorithm>
+#include <cstdio>
+#include <vector>
+#include <string>
+
+using namespace std;
+using json = nlohmann::json;
+
+/*
+This script is used to create a new motif library that leaves out every motif whose 'pdb' list contains
+the PDB of the sequence given as input to BiORSEO for prediction.
+
+jsonlibrary : path of the motif library to read (a JSON object mapping each motif id to its fields).
+name        : name of the input sequence; its last two characters are dropped before the comparison
+              (presumably a "_<chain>" suffix -- TODO confirm against the caller).
+jsonoutfile : path of the filtered library to write (dumped with a 4-space indent).
+*/
+void delete_redundant_pdb(const string& jsonlibrary, const string& name, const string& jsonoutfile) {
+    std::ifstream lib(jsonlibrary);
+    if (!lib) {
+        cerr << "Cannot open motif library " << jsonlibrary << endl;
+        return;
+    }
+    // Parse before creating the output, so that a parse failure cannot leave a truncated output file behind.
+    const json js = json::parse(lib);
+
+    // name.size() - 2 is unsigned and would wrap around for a name shorter than two characters.
+    const string pdb = name.size() >= 2 ? name.substr(0, name.size() - 2) : string();
+
+    json new_motif;
+    for (auto it = js.begin(); it != js.end(); ++it) {
+        vector<string> list_pdbs;
+        json new_id;
+
+        for (auto it2 = it.value().begin(); it2 != it.value().end(); ++it2) {
+            if (it2.key() == "pdb") {
+                list_pdbs = it2.value().get<vector<string>>();
+            } else {
+                new_id[it2.key()] = it2.value();
+            }
+        }
+
+        // Skip the motifs that come from the same PDB as the input sequence.
+        if (std::find(list_pdbs.begin(), list_pdbs.end(), pdb) != list_pdbs.end()) {
+            continue;
+        }
+        new_id["pdb"] = list_pdbs;
+        new_motif[it.key()] = new_id;
+    }
+
+    std::ofstream outfile (jsonoutfile);
+    if (!outfile) {
+        cerr << "Cannot write motif library " << jsonoutfile << endl;
+        return;
+    }
+    outfile << new_motif.dump(4) << endl;
+}
+
+// Usage: delete_same_pdb <sequence_name>
+// Filters the motif library for the sequence named on the command line.
+int main(int argc, char** argv)
+{
+    // argv[1] must not be read when no argument was given.
+    if (argc < 2) {
+        cerr << "Usage: " << (argc > 0 ? argv[0] : "delete_same_pdb") << " <sequence_name>" << endl;
+        return 1;
+    }
+    string jsonlibrary = "/local/local/BiorseoNath/data/modules/ISAURE/motifs_final.json";
+    string out = "/local/local/BiorseoNath/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json";
+    string name = argv[1];
+    delete_redundant_pdb(jsonlibrary, name, out);
+    return 0;
+}
+    
--- a/scripts/pareto_visualizer.py
View file @3b2bbb9
+++ b/scripts/pareto_visualizer.py
View file @3b2bbb9
@@ -28,17 +28,18 @@
 from math import sqrt
 import numpy as np
 import matplotlib.pyplot as plt
-from matplotlib import cm 
+from matplotlib import cm
 import scipy.stats as st
 import sys
 import os
 import subprocess
 import getopt
+
 class SecStruct:
     def __init__(self, dot_bracket, obj1_value, obj2_value):
         self.dbn = dot_bracket
-        self.objectives = [ obj1_value, obj2_value ]
+        self.objectives = [obj1_value, obj2_value]
         self.basepair_list = self.get_basepairs()
         self.length = len(dot_bracket)
@@ -96,9 +97,9 @@ class SecStruct:
         tn = reference_structure.length * (reference_structure.length - 1) * 0.5 - fp - fn - tp
         # Compute MCC
-        if (tp+fp == 0):
+        if (tp + fp == 0):
             print("We have an issue : no positives detected ! (linear structure)")
-        return (tp*tn-fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
+        return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
 class Pareto:
@@ -106,16 +107,16 @@ class Pareto:
         self.predictions = list_of_structs
         self.true_structure = reference
         self.n_pred = len(list_of_structs)
-        self.max_obj1 = max([ s.objectives[0] for s in self.predictions ])
+        self.max_obj1 = max([s.objectives[0] for s in self.predictions])
-        self.max_obj2 = max([ s.objectives[1] for s in self.predictions ])
+        self.max_obj2 = max([s.objectives[1] for s in self.predictions])
         self.index_of_best = self.find_best_solution()
-        
+
     def find_best_solution(self):
         # returns the index of the solution of the Pareto set which is the closest
         # to the real 2D structure (the one with the max MCC)
         max_i = -1
         max_mcc = -1
-        for i,s in enumerate(self.predictions):
+        for i, s in enumerate(self.predictions):
             mcc = s.get_MCC_with(self.true_structure)
             if mcc > max_mcc:
                 max_mcc = mcc
@@ -125,15 +126,15 @@ class Pareto:
     def get_normalized_coords(self):
         # retrieves the objective values of the best solution and normlizes them
         coords = self.predictions[self.index_of_best].objectives
-        if self.max_obj1: # avoid divide by zero if all solutions are 0
+        if self.max_obj1:  # avoid divide by zero if all solutions are 0
-            x = coords[0]/self.max_obj1
+            x = coords[0] / self.max_obj1
         else:
             x = 0.5
-        if self.max_obj2: # avoid divide by zero if all solutions are 0
+        if self.max_obj2:  # avoid divide by zero if all solutions are 0
-            y = coords[1]/self.max_obj2
+            y = coords[1] / self.max_obj2
         else:
             y = 0.5
-        return ( x, y )
+        return (x, y)
 class RNA:
@@ -145,6 +146,8 @@ class RNA:
 ignored_nt_dict = {}
+
+
 def is_canonical_nts(seq):
     for c in seq[:-1]:
         if c not in "ACGU":
@@ -155,6 +158,7 @@ def is_canonical_nts(seq):
             return False
     return True
+
 def is_canonical_bps(struct):
     if "()" in struct:
         return False
@@ -203,6 +207,7 @@ def load_from_dbn(file, header_style=3):
     db.close()
     return container, pkcounter
+
 def parse_biokop(folder, basename, ext=".biok"):
     solutions = []
     err = 0
@@ -243,6 +248,7 @@ def parse_biokop(folder, basename, ext=".biok"):
             err = 1
     return None, err
+
 def parse_biorseo(folder, basename, ext):
     solutions = []
     err = 0
@@ -266,6 +272,7 @@ def parse_biorseo(folder, basename, ext):
             err = 1
     return None, err
+
 def prettify_biorseo(code):
     name = ""
     if "bgsu" in code:
@@ -301,8 +308,8 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf
         print("[%s] Loaded %d solutions in a Pareto set, max(obj1)=%f, max(obj2)=%f" % (rna.basename_, pset.n_pred, pset.max_obj1, pset.max_obj2))
     print("Loaded %d points on %d." % (len(points), len(RNAcontainer)-skipped))
-    x = np.array([ p[0] for p in points ])
+    x = np.array([p[0] for p in points])
-    y = np.array([ p[1] for p in points ])
+    y = np.array([p[1] for p in points])
     xmin, xmax = 0, 1
     ymin, ymax = 0, 1
     xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
@@ -316,19 +323,21 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf
     ax[pos].axvline(x=1, alpha=0.2, color='black')
     ax[pos].contourf(xx, yy, f, cmap=cm.Blues, alpha=0.5)
     ax[pos].scatter(x, y, s=25, alpha=0.1)
-    ax[pos].set_xlim((-0.1,1.1))
+    ax[pos].set_xlim((-0.1, 1.1))
-    ax[pos].set_ylim((-0.1,1.1))
+    ax[pos].set_ylim((-0.1, 1.1))
-    ax[pos].annotate("("+str(len(points))+'/'+str(len(RNAcontainer)-skipped)+" RNAs)", (0.08,0.15))
+    ax[pos].set_title(prettify_biorseo(ext[1:]), fontsize=10)
+    ax[pos].annotate("(" + str(len(points)) + '/' + str(len(RNAcontainer)-skipped) + " RNAs)", (0.08, 0.15))
     ax[pos].set_xlabel(xlabel)
     ax[pos].set_ylabel(ylabel)
     if nsolutions:
-        ax[pos+1].hist(sizes, bins=range(0, max(sizes)+1, 2), histtype='bar')
+        ax[pos + 1].hist(sizes, bins=range(0, max(sizes) + 1, 2), histtype='bar')
-        ax[pos+1].set_xlim((0,max(sizes)+2))
+        ax[pos + 1].set_xlim((0, max(sizes) + 2))
-        ax[pos+1].set_xticks(range(0, max(sizes), 10))
+        ax[pos + 1].set_xticks(range(0, max(sizes), 10))
-        ax[pos+1].set_xticklabels(range(0, max(sizes), 10), rotation=90)
+        ax[pos + 1].set_xticklabels(range(0, max(sizes), 10), rotation=90)
-        ax[pos+1].set_xlabel("# solutions")
+        ax[pos + 1].set_xlabel("# solutions")
-        ax[pos+1].set_ylabel("# RNAs")
+        ax[pos + 1].set_ylabel("# RNAs")
+
 if __name__ == "__main__":
     try:
--- a/scripts/pareto_visualizer_json.png 0 → 100644
View file @3b2bbb9
+++ b/scripts/pareto_visualizer_json.png 0 → 100644
View file @3b2bbb9
--- a/scripts/pareto_visualizer_json.py 0 → 100644
View file @3b2bbb9
+++ b/scripts/pareto_visualizer_json.py 0 → 100644
View file @3b2bbb9
--- a/scripts/pareto_visualizer_json_1.png 0 → 100644
View file @3b2bbb9
+++ b/scripts/pareto_visualizer_json_1.png 0 → 100644
View file @3b2bbb9
--- a/scripts/pareto_visualizer_json_MEA_functionE.png 0 → 100644
View file @3b2bbb9
+++ b/scripts/pareto_visualizer_json_MEA_functionE.png 0 → 100644
View file @3b2bbb9
--- a/scripts/pareto_visualizer_json_MFE_MEA_functionE.png 0 → 100644
View file @3b2bbb9
+++ b/scripts/pareto_visualizer_json_MFE_MEA_functionE.png 0 → 100644
View file @3b2bbb9
--- a/scripts/pareto_visualizer_json_MFE_functionE.png 0 → 100644
View file @3b2bbb9
+++ b/scripts/pareto_visualizer_json_MFE_functionE.png 0 → 100644
View file @3b2bbb9
--- a/scripts/selecting_id.cpp 0 → 100644
View file @3b2bbb9
+++ b/scripts/selecting_id.cpp 0 → 100644
View file @3b2bbb9
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
+#include <typeinfo>
+#include <set>
+#include <algorithm>
+#include <cstdio>
+#include <vector>
+
+using namespace std;
+using json = nlohmann::json;
+
+/*
+This script identifies the library patterns that match ONLY the sequence they come from (i.e. the sequence with the same PDB code), so that they can be removed from the library.
+*/
+
+// Associates one benchmark sequence with one library motif.
+// NOTE(review): with `using namespace std;` in effect and C++17 headers, the
+// unqualified name `data` can collide with std::data; spell the type `::data`
+// if the compiler reports an ambiguous reference.
+struct data {
+    // PDB code extracted from the header line of the sequence in the benchmark file
+    std::string pdb;
+    // complete sequence stored under that header
+    std::string seq_pdb;
+    // id (JSON key) of the library motif associated with this PDB code
+    std::string id;
+    // sequence of that motif; its components are separated by '&'
+    std::string cmp;
+};
+
+// Reads the benchmark file and returns one entry per record.
+// A record is four consecutive lines: header, sequence, structure, contacts.
+// Only the header and the sequence are kept: `pdb` is the header without its
+// first five characters and without its last character, `seq_pdb` is the
+// sequence line. `id` and `cmp` are left empty.
+// Returns an empty list when the file cannot be opened.
+std::vector<::data> get_list_pdb_benchmark(const std::string& benchmark) {
+    // Number of characters dropped from the front of the header line.
+    constexpr std::size_t kHeaderPrefix = 5;
+
+    std::vector<::data> records;
+
+    // Read-only stream: the file is never written, and std::fstream's default
+    // in|out mode fails to open a file the user cannot write to.
+    std::ifstream bm(benchmark);
+    if (!bm.is_open()) {
+        return records;
+    }
+
+    std::string header;
+    while (std::getline(bm, header)) {
+        // Always consume the three remaining lines so that the 4-line framing
+        // stays aligned. Fresh strings per record: a truncated last record
+        // yields empty fields instead of reusing the previous record's values.
+        std::string sequence;
+        std::string structure;
+        std::string contacts;
+        std::getline(bm, sequence);
+        std::getline(bm, structure);
+        std::getline(bm, contacts);
+
+        // A header shorter than the prefix (e.g. a trailing blank line) used to
+        // make substr() throw std::out_of_range; such records are skipped.
+        if (header.size() < kHeaderPrefix) {
+            continue;
+        }
+
+        // Drop the prefix and the final character of the header.
+        const std::size_t code_len =
+            header.size() > kHeaderPrefix ? header.size() - kHeaderPrefix - 1 : 0;
+
+        ::data entry;
+        entry.pdb = header.substr(kHeaderPrefix, code_len);
+        entry.seq_pdb = sequence;
+        records.push_back(entry);
+    }
+    return records;
+}
+
+// Returns `str` without its first and last characters. find_id() uses it to
+// strip the surrounding quotes of a JSON string written to a stream.
+// Inputs shorter than two characters yield an empty string; the previous
+// version threw std::out_of_range on an empty input.
+std::string trim(std::string str) {
+    if (str.size() < 2) {
+        return std::string();
+    }
+    return str.substr(1, str.size() - 2);
+}
+
+// Looks up, in the benchmark file, the first entry whose PDB code without its
+// last two characters equals `pdb_pattern`.
+// Returns that entry (pdb and seq_pdb filled in), or a default-constructed
+// ::data with every field empty when no entry matches.
+// NOTE: the benchmark file is parsed again on every call.
+::data find_id_pattern(std::string& pdb_pattern, const std::string& benchmark) {
+    const std::vector<::data> entries = get_list_pdb_benchmark(benchmark);
+
+    for (const ::data& entry : entries) {
+        const std::string& code = entry.pdb;
+        // Codes shorter than two characters are compared unchanged (this is what
+        // the former `size() - 2` wrap-around amounted to).
+        const std::string base = code.size() >= 2 ? code.substr(0, code.size() - 2) : code;
+        if (base == pdb_pattern) {
+            return entry;
+        }
+    }
+    return ::data();
+}
+
+// Builds the list of associations between benchmark sequences and library motifs.
+//
+// `bibli` is the path of the JSON motif library: an object keyed by motif id,
+// where each motif is read through its "pdb" field (list of PDB codes) and its
+// "sequence" field (motif sequence). `benchmark` is the path of the benchmark
+// file, queried through find_id_pattern().
+//
+// For every motif having both fields, each PDB code is looked up in the
+// benchmark; if at least one is found, an entry holding the benchmark sequence
+// (pdb, seq_pdb) and the motif (id, cmp) is appended. When several codes are
+// found, the last one wins, as before. A later code that is absent from the
+// benchmark no longer erases an earlier match.
+//
+// Prints the number of associations on stdout. Returns an empty list (after a
+// message on stderr) when the library cannot be opened.
+std::vector<::data> find_id(const std::string& bibli, const std::string& benchmark) {
+    std::vector<::data> association;
+
+    std::ifstream lib(bibli);
+    if (!lib.is_open()) {
+        std::cerr << "Cannot open motif library: " << bibli << std::endl;
+        return association;
+    }
+    const json js = json::parse(lib);
+
+    for (auto it = js.begin(); it != js.end(); ++it) {
+        const json& motif = it.value();
+
+        // Direct field lookup: the result no longer depends on "pdb" being
+        // visited before "sequence" while iterating over the motif's keys.
+        const auto pdb_field = motif.find("pdb");
+        const auto seq_field = motif.find("sequence");
+        if (pdb_field == motif.end() || seq_field == motif.end()) {
+            continue;
+        }
+
+        ::data match;
+        for (const json& code : *pdb_field) {
+            if (!code.is_string()) {
+                continue;
+            }
+            // Read the string value itself instead of stripping the quotes of
+            // its serialized form.
+            std::string pdb = code.get<std::string>();
+            const ::data found = find_id_pattern(pdb, benchmark);
+            if (!found.pdb.empty()) {
+                match = found;
+            }
+        }
+
+        // No PDB code of this motif belongs to the benchmark.
+        if (match.pdb.empty()) {
+            continue;
+        }
+
+        match.id = it.key();
+        match.cmp = seq_field->get<std::string>();
+        association.push_back(match);
+    }
+
+    std::cout << association.size() << std::endl;
+    return association;
+}
+
+// Tells whether the motif `seq_motif` occurs in the complete sequence `seq`.
+//
+// `seq_motif` is split on '&' into components. The motif matches when the first
+// component is found in `seq` and every following component is found at a
+// position strictly greater than the position of the previous one. A motif
+// without '&' is a plain substring search.
+//
+// Fix: the previous version only compared loop counters and therefore returned
+// true when the LAST component was missing (e.g. "ABC" vs "ABC&ZZZ").
+//
+// NOTE(review): the search for the next component restarts one character after
+// the START of the previous match, so consecutive components may overlap in
+// `seq`. Kept as is -- confirm whether overlaps must be rejected.
+bool does_it_match(const std::string& seq, const std::string& seq_motif) {
+    // Split the motif into its '&'-separated components.
+    std::vector<std::string> components;
+    std::size_t start = 0;
+    for (;;) {
+        const std::size_t amp = seq_motif.find('&', start);
+        if (amp == std::string::npos) {
+            components.push_back(seq_motif.substr(start));
+            break;
+        }
+        components.push_back(seq_motif.substr(start, amp - start));
+        start = amp + 1;
+    }
+
+    // Locate the components one after the other; stop at the first miss.
+    std::size_t pos = seq.find(components.front());
+    for (std::size_t k = 1; k < components.size() && pos != std::string::npos; ++k) {
+        pos = seq.find(components[k], pos + 1);
+    }
+    return pos != std::string::npos;
+}
+
+// Returns the sorted, duplicate-free ids of the motifs that match no benchmark
+// sequence other than those of their own PDB entry.
+//
+// Every association returned by find_id() is used twice: as a target sequence
+// (seq_pdb) and as a motif (id, cmp). A motif is discarded as soon as
+// does_it_match() finds it in a target whose PDB code, without its last two
+// characters, differs from the motif's own. Each such hit is logged on stdout,
+// followed by the size of the final selection.
+//
+// Fix: the previous version removed the id attached to the TARGET sequence
+// (d.id) instead of the id of the motif that matched (d2.id), which contradicts
+// both the purpose above and the log line.
+std::vector<std::string> select_not_motif(const std::string& bibli, const std::string& benchmark) {
+    const std::vector<::data> association = find_id(bibli, benchmark);
+
+    // PDB code without its last two characters; shorter codes are kept whole
+    // (same result as the former `size() - 2` wrap-around).
+    const auto base_code = [](const std::string& pdb) {
+        return pdb.size() >= 2 ? pdb.substr(0, pdb.size() - 2) : pdb;
+    };
+
+    // Ids of the motifs found inside a sequence coming from another PDB entry.
+    std::set<std::string> matched_elsewhere;
+    for (const ::data& target : association) {
+        const std::string target_base = base_code(target.pdb);
+        for (const ::data& motif : association) {
+            if (target_base == base_code(motif.pdb)) {
+                continue;
+            }
+            if (!does_it_match(target.seq_pdb, motif.cmp)) {
+                continue;
+            }
+            std::cout << "pdb: " << target.pdb << " vs " << motif.pdb << " " << motif.cmp << " " << motif.id << std::endl;
+            matched_elsewhere.insert(motif.id);
+        }
+    }
+
+    // std::set keeps the remaining ids sorted and unique.
+    std::set<std::string> kept;
+    for (const ::data& entry : association) {
+        if (matched_elsewhere.count(entry.id) == 0) {
+            kept.insert(entry.id);
+        }
+    }
+    std::vector<std::string> selection(kept.begin(), kept.end());
+
+    std::cout << "size: " << selection.size() << std::endl;
+
+    return selection;
+}
+
+// Prints the ids returned by select_not_motif(), each followed by ", ", then a
+// newline.
+// Usage: <program> [motif_library.json [benchmark.dbn]]
+// Arguments are optional; when omitted, the original hard-coded paths are used.
+int main(int argc, char* argv[])
+{
+    std::string bibli = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json";
+    std::string benchmark = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/benchmark.dbn";
+    if (argc > 1) {
+        bibli = argv[1];
+    }
+    if (argc > 2) {
+        benchmark = argv[2];
+    }
+
+    const std::vector<std::string> selection = select_not_motif(bibli, benchmark);
+    for (const std::string& id : selection) {
+        std::cout << id << ", ";
+    }
+    std::cout << std::endl;
+
+    return 0;
+}
\ No newline at end of file
--- a/scripts/stats.py 0 → 100644
View file @3b2bbb9
+++ b/scripts/stats.py 0 → 100644
View file @3b2bbb9
--- a/scripts/temp/test.fa 0 → 100644
View file @3b2bbb9
+++ b/scripts/temp/test.fa 0 → 100644
View file @3b2bbb9
+>test
+CCGGGACCUCUAACCGGGUUCCCGGGCAGUCACUG