Merge branch 'stage_NBernard' into 'master'

Stage n bernard results See merge request !1

Merge branch 'stage_NBernard' into 'master'
Stage n bernard results See merge request !1
Louis BECQUEY
Commit 3b2bbb9956c579b13e4aeec1d96d490b97fbdd82 3b2bbb99 2 parents acf04330 97d4bf4f
Showing 75 changed files with 805 additions and 241 deletions
.gitignore
INSTALL.md
Isaure_benchmark.py
Makefile
Readme.md
benchmark.py
biorseo.py
cppsrc/MOIP.cpp
cppsrc/MOIP.h
cppsrc/Motif.cpp
cppsrc/Motif.h
cppsrc/Scripts/count_pattern.cpp
cppsrc/Scripts/delete_same_pdb.cpp
cppsrc/SecondaryStructure.cpp
cppsrc/SecondaryStructure.h
cppsrc/biorseo.cpp
cppsrc/program
cppsrc/rna.cpp
cppsrc/rna.h
data/fasta/applications.fa
--- a/.gitignore
View file @3b2bbb9
+++ b/.gitignore
View file @3b2bbb9
- .vscode/*
 .vscode
 
- # LaTeX temporary files
- doc/*.toc
- doc/*.bbl
- doc/*.gz
- doc/*.log
- doc/*.aux
- doc/*.blg
- doc/*.fls
- doc/*.fdb_latexmk
- 
 # Docker installation temporary files
 eigen-eigen-323c052e1731
 cplex_installer_12.8_Student.bin
@@ -20,7 +9,6 @@ ViennaRNA-2.4.13
 
 # Compiled Object files
 obj/*
- doc/*.pdf
 data/modules/RIN/__pycache__
 
 # Executables
@@ -44,4 +32,4 @@ data/modules/RIN
 data/modules/ISAURE
 data/sec_structs/bpRNA-1m_90.dbn
 data/sec_structs/pseudobase++.dbn
- 
+ data/fasta/contacts
--- a/INSTALL.md
View file @3b2bbb9
+++ b/INSTALL.md
View file @3b2bbb9
--- a/Isaure_benchmark.py 0 → 100644
View file @3b2bbb9
+++ b/Isaure_benchmark.py 0 → 100644
View file @3b2bbb9
--- a/Makefile 100644 → 100755
View file @3b2bbb9
+++ b/Makefile 100644 → 100755
View file @3b2bbb9
@@ -9,7 +9,7 @@ CC	   = g++
 CFLAGS   = -Icppsrc/ -I/usr/local/include -I$(CPLEX)/concert/include -I$(CPLEX)/cplex/include -g -O3
 CXXFLAGS = --std=c++17 -Wall -Wpedantic -Wextra -Wno-deprecated-copy -Wno-ignored-attributes
 LINKER   = g++
- LDFLAGS  = -L$(CPLEX)/concert/lib/x86-64_linux/static_pic/ -L$(CPLEX)/cplex/lib/x86-64_linux/static_pic/ -lboost_system -lboost_filesystem -lboost_program_options -lgomp -lconcert -lilocplex -lcplex -lpthread -ldl -lRNA -lm
+ LDFLAGS  = -Wno-free-nonheap-object -L$(CPLEX)/concert/lib/x86-64_linux/static_pic/ -L$(CPLEX)/cplex/lib/x86-64_linux/static_pic/ -lboost_system -lboost_filesystem -lboost_program_options -lgomp -lconcert -lilocplex -lcplex -lpthread -ldl -lRNA -lm
 
 # change these to proper directories where each file should be
 SRCDIR   = cppsrc
@@ -31,20 +31,8 @@ $(OBJECTS): $(OBJDIR)/%.o : $(SRCDIR)/%.cpp $(INCLUDES)
 	$(CC) -c $(CFLAGS) $(CXXFLAGS) $< -o $@
 	@echo -e "\033[00;32mCompiled "$<".\033[00m"
 
- doc: mainpdf supppdf
- 	@echo -e "\033[00;32mLaTeX documentation rendered.\033[00m"
- 
- mainpdf: doc/main_bioinformatics.tex doc/references.bib doc/bioinfo.cls doc/natbib.bst
- 	cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics
- 	cd doc; bibtex main_bioinformatics
- 	cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics
- 	cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics
- 
- supppdf: doc/supplementary_material.tex
- 	cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error supplementary_material
- 
 .PHONY: all
- all: $(BINDIR)/$(TARGET) doc
+ all: $(BINDIR)/$(TARGET)
 
 .PHONY: re
 re: remove clean all
--- a/Readme.md
View file @3b2bbb9
+++ b/Readme.md
View file @3b2bbb9
@@ -19,6 +19,7 @@ THEN
 OUTPUT:
 - A set of secondary structures from the Pareto front,
 - The list of known modules inserted inplace in the corresponding structures
+ - A set of positions of the nucleotides in contact with the protein represented by asterisks (only if the motifs_28-05-2021.json library is used!)
 
 2/ The different models
 ==================================
@@ -28,7 +29,8 @@ Biorseo can be used with two modules datasets (yet):
 * Rna3Dmotifs (from the work of *Djelloul & Denise, 2008*)
 * The RNA 3D Motif Atlas of BGSU's RNA lab (*Petrov et al, 2013*, see http://rna.bgsu.edu/rna3dhub/motifs/)
 * CaRNAval 1.0 (*Reinhartz et al, 2018*)
- * RNA-Bricks 2, RNAMC, CaRNAval 2.0, and others could theoretically be used, but are not supported (yet). You might write your own API.
+ * /data/modules/ISAURE/motifs_28-05-2021.json a library of motifs from RNA linked to a protein from Isaure Chauvot de Beauchêne of LORIA laboratory
+  (contact:isaure.chauvot-de-beauchene@loria.fr)
 
 PATTERN MATCHING STEP
 - Use **simple pattern matching**. Rna3Dmotifs modules are available with sequence information. We use regular expressions to find those known loops in your query. This is the approach of RNA-MoIP (*Reinharz et al, 2012*), we deal the same way with short components and wildcards.
@@ -43,6 +45,8 @@ OBJECTIVE FUNCTIONS FOR THE MODULE INSERTION CRITERIA
 * **Function B** : weights a module by its number of components (strands) and penalizes it by the log^(_2) of its nucleotide size.
 * **Function C** : weights a module by its insertion site score (JAR3D or BayesPairing score).
 * **Function D** : weights a module by its number of components (strands) and insertion site score (JAR3D or BayesPairing score), and penalizes it by the log^(_2) of its nucleotide size.
+ * **Function E** : weights a module by its nucleotides in contact with a protein, number of occurences and number of nucleotides in the module.
+ * **Function F** : weights a module by its nucleotides in contact with a protein, number of occurences and number of nucleotides along the entire length of the RNA.
 
 3/ Installation
 ==================================
@@ -55,22 +59,22 @@ Check the file [INSTALL.md](INSTALL.md) for installation instructions.
 
 - If you **might expect a pseudoknot, or don't know**:
     * The most promising method is the use of direct pattern matching with Rna3Dmotifs and function A. But this method is sometimes subject to combinatorial explosion issues. If you have a long RNA or a large number of loops, don't use it. Example:
-     `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --rna3dmotifs --patternmatch --func A`
+     `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --rna3dmotifs --patternmatch --func A --MEA`
     
     * The use of the RNA 3D Motif Atlas placed by JAR3D and scored with function A is not subject to combinatorial issues, but performs a bit worse. It also returns less solutions. Example:
-     `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --3dmotifatlas --jar3d --func A
+     `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --3dmotifatlas --jar3d --func A --MEA
 
 5/ List of Options
 ==================================
 ```
 Usage:  You must provide:
         1) a FASTA input file with -i,
-         2) a module type with --rna3dmotifs, --carnaval or --3dmotifatlas
+         2) a module type with --rna3dmotifs, --carnaval, --3dmotifatlas or --contacts
         3) one module placement method in { --patternmatch, --jar3d, --bayespairing }
-         4) one scoring function with --func A, B, C or D
- 
+         4) one scoring function with --func A, B, C, D, E ou F
+ 	5) one estimator betwenn --MEA or --MFE
         If you are not using the Docker image: 
-         5) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir)
+         6) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir)
 
 Options:
 -h [ --help ]                   Print this help message
@@ -79,16 +83,21 @@ Options:
 --rna3dmotifs                   Use DESC modules from Djelloul & Denise, 2008
 --carnaval                      Use RIN modules from Reinharz & al, 2018
 --3dmotifatlas                  Use the HL and IL loops from BGSU's 3D Motif Atlas (updated)
+ --contacts			Use the library of motifs, created from RNA sequences linked to proteins provided by I. Chauvot de Beauchene of LORIA laboratory
 -p [ --patternmatch ]           Use regular expressions to place modules in the sequence (requires --rna3dmotifs or --carnaval)
 -j [ --jar3d ]                  Use JAR3D to place modules in the sequence (requires --3dmotifatlas)
 -b [ --bayespairing ]           Use BayesPairing2 to place modules in the sequence (requires --rna3dmotifs or --3dmotifatlas)
 -o [ --output=… ]               File to summarize the results
 -O [ --outputf=… ]              Folder where to output result and temp files
- -f [ --func=… ]                 (A, B, C or D, default is B) Objective function to score module insertions:
+ -f [ --func=… ]                 (A, B, C, D, E or F default is B) Objective function to score module insertions:
                                   (A) insert big modules (B) insert light, high-order modules
-                                   (c) insert modules which score well with the sequence
+                                   (C) insert modules which score well with the sequence
                                   (D) insert light, high-order modules which score well with the sequence.
-                                   C and D require cannot be used with --patternmatch.
+                                   C and D cannot be used with --patternmatch.
+ 				  (E) and (F) insert modules with a lot of nucleotides and a lot of nucleotides in contact with a proteine, and a huge number of occurences. 
+ 				  (E) maximize the number of contact nucleotide inside the module, while (F) maximize the number of contact nucleotide along the entire length of the RNA.
+ --MEA				Use Maximum Expected Accuracy for the second objective
+ --MFE				Use Minimum Free Energy based on the formula of (*Legendre et al., 2018*) for the second objective
 -c [ --first-objective=… ]      (default 1) Objective to solve in the mono-objective portions of the algorithm.
                                   (1) is the module objective given by --func, (2) is the expected accuracy of the structure.
 -l [ --limit=… ]                (default 500) Number of solutions in the Pareto set from which
@@ -113,9 +122,9 @@ Options:
                                   BiORSEO from outside the docker image. Use the FULL path.
 
 Examples:
- biorseo.py -i myRNA.fa -O myResultsFolder/ --rna3dmotifs --patternmatch --func B
- biorseo.py -i myRNA.fa -O myResultsFolder/ --3dmotifatlas --jar3d --func B -l 800
- biorseo.py -i myRNA.fa -v --3dmotifatlas --bayespairing --func D
+ biorseo.py -i myRNA.fa -O myResultsFolder/ --rna3dmotifs --patternmatch --func B --MEA
+ biorseo.py -i myRNA.fa -O myResultsFolder/ --3dmotifatlas --jar3d --func B -l 800 --MEA
+ biorseo.py -i myRNA.fa -v --3dmotifatlas --bayespairing --func D --MEA
 
 The allowed module/placement-method/function combinations are:
 
@@ -123,5 +132,6 @@ The allowed module/placement-method/function combinations are:
 --rna3dmotifs     A. B.           A. B. C. D.
 --3dmotifatlas                    A. B. C. D.     A. B. C. D.
 --carnaval        A. B.
+ --contacts 	  E. F.
 
 ```
--- a/benchmark.py
View file @3b2bbb9
+++ b/benchmark.py
View file @3b2bbb9
@@ -29,11 +29,11 @@ import pickle
 # ================== DEFINITION OF THE PATHS ==============================
 
 biorseoDir = path.realpath(".")
- jar3dexec = "/home/persalteas/Software/jar3dbin/jar3d_2014-12-11.jar"
+ jar3dexec = "/local/local/localopt/jar3d_2014-12-11.jar"
 bypdir = biorseoDir + "/BayesPairing/bayespairing/src"
 byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src"
- moipdir = "/home/persalteas/Software/RNAMoIP/Src/RNAMoIP.py"
- biokopdir = "/home/persalteas/Software/biokop/biokop"
+ moipdir = "/local/local/localopt/RNAMoIP/Src/RNAMoIP.py"
+ biokopdir = "/local/local/localopt/biokop/biokop"
 runDir = path.dirname(path.realpath(__file__))
 bpRNAFile = argv[1]
 PseudobaseFile = argv[2]
@@ -1109,8 +1109,11 @@ def load_from_dbn(file, header_style=3):
 			if not '(' in struct:
 				continue # ignore linear structures
 			if is_canonical_nts(seq) and is_canonical_bps(struct):
+ 				# keeps what's inside brackets at the end as the filename
 				if header_style == 1: container.append(RNA(header.replace('/', '_').split('(')[-1][:-1], header, seq, struct))
+ 				# keeps what's inside square brackets at the end as the filename
 				if header_style == 2: container.append(RNA(header.replace('/', '_').split('[')[-1][:-41], header, seq, struct))
+ 				# keeps all the header as filename
 				if header_style == 3: container.append(RNA(header[1:], header, seq, struct))
 				if '[' in struct: counter += 1
 	db.close()
@@ -1475,8 +1478,8 @@ def print_StudyCase_results():
 if __name__ == '__main__':
 
 	print("> Loading files...", flush=True)
- 	bpRNAContainer, bpRNA_pk_counter = load_from_dbn(bpRNAFile)
- 	PseudobaseContainer, Pseudobase_pk_counter = load_from_dbn(PseudobaseFile)
+ 	bpRNAContainer, bpRNA_pk_counter = load_from_dbn(bpRNAFile, header_style=1)
+ 	PseudobaseContainer, Pseudobase_pk_counter = load_from_dbn(PseudobaseFile, header_style=3)
 	StudycaseContainer, StudyCase_pk_counter = load_from_dbn(StudyCaseFile, header_style=1)
 
 	for nt, number in ignored_nt_dict.items():
--- a/biorseo.py deleted 100755 → 0
View file @acf0433
+++ b/biorseo.py deleted 100755 → 0
View file @acf0433
--- a/cppsrc/MOIP.cpp
View file @3b2bbb9
+++ b/cppsrc/MOIP.cpp
View file @3b2bbb9
--- a/cppsrc/MOIP.h
View file @3b2bbb9
+++ b/cppsrc/MOIP.h
View file @3b2bbb9
@@ -37,6 +37,7 @@ class MOIP
 	void                      	forbid_solutions_between(double min, double max);
 	IloEnv&                   	get_env(void);
 	static char               	obj_function_nbr_;    // On what criteria do you want to insert motifs ?
+ 	static char					obj_function2_nbr_;  // Do you want to use MEA or MFE to determine the best energy score ?
 	static uint               	obj_to_solve_;  // What objective do you prefer to solve in mono-objective portions of the algorithm ?
 	static double             	precision_;   // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =(
 	static bool               	allow_pk_;      // Wether we forbid pseudoknots (false) or allow them (true)
@@ -47,8 +48,12 @@ class MOIP
 	void   						define_problem_constraints(string& source);
 	size_t 						get_yuv_index(size_t u, size_t v) const;
 	size_t 						get_Cpxi_index(size_t x_i, size_t i_on_j) const;
+ 	size_t 						get_xij_index(size_t u, size_t v) const;
+ 
 	IloNumExprArg& 				y(size_t u, size_t v);    // Direct reference to y^u_v in basepair_dv_
 	IloNumExprArg& 				C(size_t x, size_t i);    // Direct reference to C_p^xi in insertion_dv_
+ 	IloNumExprArg& 				x(size_t u, size_t v);    // Direct reference to x_i,j in stacks_dv_
+ 
 	bool   						exists_vertical_outdated_labels(const SecondaryStructure& s) const;
 	bool   						exists_horizontal_outdated_labels(const SecondaryStructure& s) const;
 	void   						allowed_motifs_from_desc(args_of_parallel_func arg_struct);
@@ -66,12 +71,16 @@ class MOIP
 	IloEnv                 env_;                         // environment CPLEX object
 	IloNumVarArray         basepair_dv_;                 // Decision variables
 	IloNumVarArray         insertion_dv_;                // Decision variables
+ 	IloNumVarArray         stacks_dv_;                    // Decision variables
+ 
 	IloModel               model_;                       // Solver for objective 1
 	IloExpr                obj1;                         // Objective function that counts inserted motifs
 	IloExpr                obj2;                         // Objective function of expected accuracy
 	vector<vector<size_t>> index_of_Cxip_;               // Stores the indexes of the Cxip in insertion_dv_
 	vector<size_t>         index_of_first_components;    // Stores the indexes of Cx1p in insertion_dv_
 	vector<vector<size_t>> index_of_yuv_;                // Stores the indexes of the y^u_v in basepair_dv_
+ 
+ 	vector<vector<size_t>>   index_of_xij_;		         //Stores the indexes of the xij variables (BioKop) in stacks_dv_
 };
 
 inline uint                      MOIP::get_n_solutions(void) const { return pareto_.size(); }
@@ -79,6 +88,8 @@ inline uint                      MOIP::get_n_candidates(void) const { return ins
 inline const SecondaryStructure& MOIP::solution(uint i) const { return pareto_[i]; }
 inline IloNumExprArg&            MOIP::y(size_t u, size_t v) { return basepair_dv_[get_yuv_index(u, v)]; }
 inline IloNumExprArg&            MOIP::C(size_t x, size_t i) { return insertion_dv_[get_Cpxi_index(x, i)]; }
+ inline IloNumExprArg&            MOIP::x(size_t u, size_t v) { return stacks_dv_[get_xij_index(u, v)]; }
+ 
 inline SecondaryStructure        MOIP::solve_objective(int o) { return solve_objective(o, 0, rna_.get_RNA_length()); }
 inline IloEnv&                   MOIP::get_env(void) { return env_; }
 
--- a/cppsrc/Motif.cpp
View file @3b2bbb9
+++ b/cppsrc/Motif.cpp
View file @3b2bbb9
--- a/cppsrc/Motif.h
View file @3b2bbb9
+++ b/cppsrc/Motif.h
View file @3b2bbb9
@@ -20,13 +20,7 @@ typedef struct Comp_ {
     pair<uint, uint> pos;
     size_t           k;
     string           seq_;
-     uint             nb_pairing;         
     Comp_(pair<int, int> p) : pos(p) { k = 1 + pos.second - pos.first; }
-     Comp_(pair<int, int> p, uint nb_pair) : pos(p) 
-     { 
-         k = 1 + pos.second - pos.first; 
-         nb_pairing = nb_pair;
-     }
     Comp_(uint start, uint length) : k(length)
     {
         pos.first  = start;
@@ -64,6 +58,7 @@ class Motif
     string            get_identifier(void) const;
     vector<Component> comp;
     vector<Link>      links_;
+     vector<uint>      pos_contacts;
 
     size_t            contact_;
     double            tx_occurrences_;
@@ -89,7 +84,19 @@ vector<Motif>               load_csv(const string& path);
 vector<Motif>               load_json_folder(const string& path, const string& rna, bool verbose);
 
 vector<vector<Component>>   find_next_ones_in(string rna, uint offset, vector<string>& vc);
- vector<vector<Component>>   json_find_next_ones_in(string rna, uint offset, vector<string>& vc, vector<string>& vs);
+ vector<vector<Component>>   json_find_next_ones_in(string rna, uint offset, vector<string>& vc);
+ 
+ // utilities for Json motifs
+ size_t count_nucleotide(string&);
+ size_t count_delimiter(string&);
+ size_t count_contacts(string&);
+ string check_motif_sequence(string);
+ bool checkSecondaryStructure(string);
+ vector<Link> build_motif_pairs(string&, vector<Component>&);
+ uint find_max_occurrences(string&);
+ uint find_max_sequence(string&);
+ vector<string> find_components(string&, string);
+ vector<uint> find_contacts(vector<string>&, vector<Component>&);
 
 // utilities to compare secondary structures:
 bool operator==(const Motif& m1, const Motif& m2);
--- a/cppsrc/Scripts/count_pattern.cpp deleted 100644 → 0
View file @acf0433
+++ b/cppsrc/Scripts/count_pattern.cpp deleted 100644 → 0
View file @acf0433
--- a/cppsrc/Scripts/delete_same_pdb.cpp deleted 100644 → 0
View file @acf0433
+++ b/cppsrc/Scripts/delete_same_pdb.cpp deleted 100644 → 0
View file @acf0433
- #include <iostream>
- #include <sstream>
- #include <fstream>
- #include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
- #include <typeinfo>
- #include <set>
- #include <algorithm>
- #include <cstdio>
- #include <vector>
- 
- using namespace std;
- using json = nlohmann::json;
- 
- void delete_redundant_pdb(const string& jsonfile, const string& jsontest, const string& jsonoutfile) {
-     std::ifstream lib(jsonfile);
-     std::ifstream lib2(jsontest);
-     
-     std::ofstream outfile (jsonoutfile);
-     json new_motif;
-     json new_id;
-     json js = json::parse(lib);
-     json js2 = json::parse(lib2);
-     
-     //the list of pfam lists of the motif we want to count the inclusion in other motif
-     for (auto it = js.begin(); it != js.end(); ++it) {
-         string id = it.key();
-         vector<string> list_pdbs;
-         vector<string> list_pdbs2;
-         bool is_added = true;
- 
-         //cout << "id: " << id << endl;
-         for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {      
-             string test = it2.key();   
- 
-             if (!test.compare("pdb")) {
-                 vector<string> tab = it2.value();
-                 list_pdbs = tab;
-                 /*set<set<string>>::iterator iit;
-                 set<string>::iterator iit2;
-                 for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) {
-                     for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) {
-                         cout << *iit2 << endl;
-                     }
-                     cout << endl << endl;
-                 }*/
-             } else {
-                 new_id[test] = it2.value();
-             }  
-         }
-         //cout << "-------begin---------" << endl;
-         
-         for (auto it3 = js2.begin(); it3 != js2.end(); ++it3) {
-             string id2 = it3.key();
- 
-             //cout << "id: " << id << " / id2: " << id2 << endl;
-             for (auto it4 = js[id2].begin(); it4 != js[id2].end(); ++it4) {
-                 string test = it4.key();
-                 
-                 if (!test.compare("pdb")) {
-                     vector<string> tab = it4.value();
-                     list_pdbs2 = tab;
- 
-                     //cout << id << " / " << id2 << endl;
-                     for (uint k = 0; k < list_pdbs2.size(); k++) {
-                         if (count(list_pdbs.begin(), list_pdbs.end(), list_pdbs2[k])) {
-                             is_added = false;
-                         }
-                         //cout << list_pdbs2[k] << endl;
-                     }
- 
-                 } 
-                 
-             }
-             //cout << endl;*/
-         }
-     
-        
-         /*for(uint ii = 0; ii < list_pfams.size(); ii++) {
-             for (uint jj = 0; jj < list_pfams[ii].size(); jj++) {
-                 cout << "[" << ii << "][" << jj << "]: " << list_pfams[ii][jj] << endl;
-             }
-         }*/
-         if (is_added) {      
-             new_id["pdb"] = list_pdbs;     
-             new_motif[id] = new_id;
-         }
-         new_id.clear();
-         //cout << "valeur: " << ite << endl;
-         /*for (uint i = 0; i < tab_struc.size() ; i++) {
-         cout << "tab_struc[" << i << "]: " << tab_struc[i] << endl << endl;
-         } */
-     }
-     outfile << new_motif.dump(4) << endl;
-     outfile.close(); 
- }
- 
- int main()
- {
-     string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/bibli_test2.json";
-     string jsontest = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark_test.json";
-     string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_derniere_version/motifs_final_test.json";
-     delete_redundant_pdb(jsonfile, jsontest, out);
-     return 0;
- }
-     
--- a/cppsrc/SecondaryStructure.cpp
View file @3b2bbb9
+++ b/cppsrc/SecondaryStructure.cpp
View file @3b2bbb9
@@ -3,11 +3,13 @@
 #include <algorithm>
 #include <boost/format.hpp>
 
+ #define RESET   "\033[0m"
+ #define RED     "\033[31m"      /* Red */
+ 
 using std::abs;
 using std::cout;
 using std::endl;
 
- 
 SecondaryStructure::SecondaryStructure() {}
 
 
@@ -98,6 +100,26 @@ string SecondaryStructure::to_DBN(void) const
     return res;
 }
 
+ string structure_with_contacts(const SecondaryStructure& ss) {
+     string sequence = ss.rna_.get_seq();
+     string construct = "";
+     bool flag;
+     for (uint i = 0; i < sequence.size(); i++) {
+         flag = false;
+         for (const Motif& m : ss.motif_info_) {
+             for (uint j = 0; j < m.pos_contacts.size(); j++) {
+                 if (m.pos_contacts[j] == i) flag = true;
+             }
+         }
+         if (flag) {   
+             construct += "*";
+         } else {
+             construct += ".";
+         }
+     }
+     return construct;
+ }
+ 
 string SecondaryStructure::to_string(void) const
 {
     string s;
@@ -119,13 +141,35 @@ void SecondaryStructure::set_basepair(uint i, uint j)
 
 void SecondaryStructure::insert_motif(const Motif& m) { motif_info_.push_back(m); }
 
- 
+ void colored_contacts(string sequence, vector<Motif> motif_info_) {
+     bool flag;
+     for (uint i = 0; i < sequence.size(); i++) {
+         flag = false;
+         for (const Motif& m : motif_info_) {
+             for (uint j = 0; j < m.pos_contacts.size(); j++) {
+                 if (m.pos_contacts[j] == i) flag = true;
+             }
+         }
+         if (flag) {   
+             cout << RED << sequence[i] << RESET;
+         } else {
+             cout << sequence[i];
+         }
+     }
+ }
 
 void SecondaryStructure::print(void) const
 {
     cout << endl;
-     cout << '\t' << rna_.get_seq() << endl;
-     cout << '\t' << to_string() << endl;
+     cout << '\t';
+     colored_contacts(rna_.get_seq(), motif_info_);
+     //rna_.get_seq() 
+     cout << endl;
+     string ss = to_string();
+     cout << '\t';
+     colored_contacts(ss, motif_info_);
+     //cout << ss;
+     cout << endl;
     for (const Motif& m : motif_info_) {
         uint i = 0;
         cout << '\t';
--- a/cppsrc/SecondaryStructure.h
View file @3b2bbb9
+++ b/cppsrc/SecondaryStructure.h
View file @3b2bbb9
@@ -30,7 +30,6 @@ class SecondaryStructure
     string to_DBN() const;
     string to_string() const;
 
- 
     vector<double> objective_scores_;       // values of the different objective functions for that SecondaryStructure
     vector<pair<uint, uint>> basepairs_;    // values of the decision variable of the integer program
     vector<Motif> motif_info_;    // information about known motives in this secondary structure and their positions
@@ -58,5 +57,7 @@ inline void   SecondaryStructure::set_objective_score(int i, double s) { objecti
 inline uint   SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); }
 inline uint   SecondaryStructure::get_n_bp(void) const { return nBP_; }
 
+ string structure_with_contacts(const SecondaryStructure& ss);
+ 
 
 #endif    //  SECONDARY_STRUCTURE_
\ No newline at end of file
--- a/cppsrc/biorseo.cpp
View file @3b2bbb9
+++ b/cppsrc/biorseo.cpp
View file @3b2bbb9
--- a/cppsrc/program 0 → 100644
View file @3b2bbb9
+++ b/cppsrc/program 0 → 100644
View file @3b2bbb9
--- a/cppsrc/rna.cpp
View file @3b2bbb9
+++ b/cppsrc/rna.cpp
View file @3b2bbb9
@@ -58,12 +58,49 @@ RNA::RNA(string name, string seq, bool verbose)
 			pij_(results->i-1,results->j-1) = results->p;
 			results++;
 		}
+ 
+ 		/*define type_*/
+ 		type_ = vector<vector<int>>(n_, vector<int>(n_));
+ 		for(uint i = 0; i < n_; i++){
+ 			for(uint j = 0; j < n_; j++){
+ 				if (i < j){
+ 					std::stringstream ss;
+ 					ss << seq_[i] << seq_[j];
+ 					std::string str = ss.str();
+ 					if(str.compare("AU") == 0 ){
+ 						type_[i][j] = 1;
+ 					}
+ 					else if(str.compare("CG") == 0 ){
+ 						type_[i][j] = 2;
+ 					
+ 					}
+ 					else if(str.compare("GC") == 0 ){
+ 						type_[i][j] = 3;
+ 					}
+ 					else if(str.compare("GU") == 0 ){
+ 						type_[i][j] = 4;
+ 					}
+ 					else if(str.compare("UG") == 0 ){
+ 						type_[i][j] = 5;
+ 					}
+ 					else if(str.compare("UA") == 0 ){
+ 						type_[i][j] = 6;
+ 					}
+ 					else{
+ 						type_[i][j] = 0;
+ 					}
+ 				}
+ 				else{
+ 					type_[i][j] = 0;
+ 				}
+ 			}
+ 		}
+ 
 	}
 
 	else cerr << "NULL result returned by vrna_pfl_fold" << endl;
 }
 
- 
 void RNA::print_basepair_p_matrix(float theta) const
 {
 	cout << endl;
--- a/cppsrc/rna.h
View file @3b2bbb9
+++ b/cppsrc/rna.h
View file @3b2bbb9
@@ -32,6 +32,8 @@ class RNA
     uint   get_RNA_length(void) const;
     void   print_basepair_p_matrix(float theta) const;
 
+     vector<vector<int>> get_type();
+ 
     bool verbose_;    // Should we print things ?
 
     private:
@@ -41,10 +43,15 @@ class RNA
     string   seq_;     // sequence of the rna with chars
     uint     n_;       // length of the rna
     MatrixXf pij_;     // matrix of basepair probabilities
+ 
+     vector<vector<int>> type_;  //vector of base pair types
 };
 
 inline float  RNA::get_pij(int i, int j) { return pij_(i, j); }
 inline uint   RNA::get_RNA_length() const { return n_; }
 inline string RNA::get_seq(void) const { return seq_; }
 
+ inline vector<vector<int>>  RNA::get_type() { return type_; }
+ 
+ 
 #endif
--- a/data/fasta/applications.fa 100644 → 100755
View file @3b2bbb9
+++ b/data/fasta/applications.fa 100644 → 100755
View file @3b2bbb9
- >__'CRYSTAL_STRUCTURE_OF_A_TIGHT-BINDING_GLUTAMINE_TRNA_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE_'_(PDB_00376)
- GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA
- >__'GUANINE_RIBOSWITCH_U22C,_A52G_MUTANT_BOUND_TO_HYPOXANTHINE_'_(PDB_01023)
- GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa
- >__'SOLUTION_STRUCTURE_OF_THE_P2B-P3_PSEUDOKNOT_FROM_HUMAN_TELOMERASE_RNA_'_(PDB_00857)
- GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA
+ >test_CRYSTAL_STRUCTURE_OF_A_TIGHT-BINDING_GLUTAMINE_TRNA_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE__PDB_00376
+ GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA
+ >test_GUANINE_RIBOSWITCH_U22C,_A52G_MUTANT_BOUND_TO_HYPOXANTHINE__PDB_01023
+ GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa
+ >test_SOLUTION_STRUCTURE_OF_THE_P2B-P3_PSEUDOKNOT_FROM_HUMAN_TELOMERASE_RNA__PDB_00857
+ GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA
\ No newline at end of file
--- a/data/fasta/example.fa 100644 → 100755
View file @3b2bbb9
+++ b/data/fasta/example.fa 100644 → 100755
View file @3b2bbb9
--- a/data/fasta/motif_test.fa deleted 100644 → 0
View file @acf0433
+++ b/data/fasta/motif_test.fa deleted 100644 → 0
View file @acf0433
- > JSON1000_extended
- AAUAUCCGGGCGUUUAAUCCCGGGAUAAA
\ No newline at end of file
--- a/data/modules/ISAURE/Readme.md 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/Readme.md 0 → 100644
View file @3b2bbb9
+ The motif library used with --contacts is particular. It was provided by Isaure Chauvot de Beauchêne from the LORIA 
+ laboratory. These motifs are made up of RNA fragments linked to proteins.
+ ==================================================================================================================
+ 
+ Several versions of these designs have been provided, but the most complete is the latest:'motifs_06-06-2021.json'
+ The current scripts were created based on this file, and doesn't work with the other older libraries.
+ 
+ There is also 2 benchmarks files also in json format : 'benchmark_16-06-2021.json' and 'benchmark_16-07-2021.json'.
+ It contains complete RNA sequences that bind to a protein, the first one contains only 33 RNA, and the second one 
+ contains 130 RNA.
+ 
+ The benchmark.dbn and benchmark.txt were created based on the 'benchmark_16-07-2021.json'. 
+ They are mostly used for the Isaure_benchmark.py script and scripts from the 'scripts' directory.
+ 
+ The motifs_final.json it obtains after executing the count_pattern.cpp script in 'script' directory on
+ the 'motifs_06-06-2021.json' motifs file.
+ This script count the number of "occurrences" of the motif. So we consider that if the sequence of motif A 
+ is included in motif B, then for each inclusion of B we also have an inclusion of A. And vice versa.
+ 
+ The motif library used by BiORSEO is the one in the 'bibliotheque_a_lire' directory. There should only be
+ the json file we wish to be used by BiORSEO for it's prediction. That's why you shouldn't put other type of file!
+ 
+ 
+ 
+ 
+ 
+ 
--- a/data/modules/ISAURE/benchmark.dbn 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/benchmark.dbn 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/benchmark.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/benchmark.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/benchmark.txt 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/benchmark.txt 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/benchmark_16-06-2021.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/benchmark_16-06-2021.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/benchmark_16-07-2021.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/benchmark_16-07-2021.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/motifs_01-06-2021.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/motifs_01-06-2021.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/motifs_06-06-2021.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/motifs_06-06-2021.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/motifs_28-05-2021.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/motifs_28-05-2021.json 0 → 100644
View file @3b2bbb9
--- a/data/modules/ISAURE/motifs_final.json 0 → 100644
View file @3b2bbb9
+++ b/data/modules/ISAURE/motifs_final.json 0 → 100644
View file @3b2bbb9
--- a/data/sec_structs/RNAMoIP_dataset.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/RNAMoIP_dataset.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/applications.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/applications.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/bpRNA-1m_90_short.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/bpRNA-1m_90_short.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/example.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/example.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/nothing.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/nothing.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/pseudobase++_short.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/pseudobase++_short.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/pseudoknots.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/pseudoknots.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/secondary_structures_database.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/secondary_structures_database.dbn 100644 → 100755
View file @3b2bbb9
--- a/data/sec_structs/verified_secondary_structures_database.dbn 100644 → 100755
View file @3b2bbb9
+++ b/data/sec_structs/verified_secondary_structures_database.dbn 100644 → 100755
View file @3b2bbb9
--- a/doc/Benchmark_unconstrained.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/Benchmark_unconstrained.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/Nmotifs.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/Nmotifs.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/Nsol.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/Nsol.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/OUP_First_SBk_Bot_8401.eps deleted 100644 → 0
View file @acf0433
+++ b/doc/OUP_First_SBk_Bot_8401.eps deleted 100644 → 0
View file @acf0433
--- a/doc/algorithm2e.sty deleted 100644 → 0
View file @acf0433
+++ b/doc/algorithm2e.sty deleted 100644 → 0
View file @acf0433
--- a/doc/bioinfo.cls deleted 100644 → 0
View file @acf0433
+++ b/doc/bioinfo.cls deleted 100644 → 0
View file @acf0433
--- a/doc/fig/Benchmark_avg.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/Benchmark_avg.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/fig/MOIP_subopt.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/MOIP_subopt.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/fig/RNA_SSE.png deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/RNA_SSE.png deleted 100644 → 0
View file @acf0433
--- a/doc/fig/kernels_B.png deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/kernels_B.png deleted 100644 → 0
View file @acf0433
--- a/doc/fig/kernels_C.png deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/kernels_C.png deleted 100644 → 0
View file @acf0433
--- a/doc/fig/kernels_D.png deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/kernels_D.png deleted 100644 → 0
View file @acf0433
--- a/doc/fig/pseudoknots.png deleted 100644 → 0
View file @acf0433
+++ b/doc/fig/pseudoknots.png deleted 100644 → 0
View file @acf0433
--- a/doc/graph_abstract.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/graph_abstract.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/kernels_A.jpg deleted 100644 → 0
View file @acf0433
+++ b/doc/kernels_A.jpg deleted 100644 → 0
View file @acf0433
--- a/doc/main_bioinformatics.tex deleted 100644 → 0
View file @acf0433
+++ b/doc/main_bioinformatics.tex deleted 100644 → 0
View file @acf0433
--- a/doc/supplementary_materials.tex deleted 100644 → 0
View file @acf0433
+++ b/doc/supplementary_materials.tex deleted 100644 → 0
View file @acf0433
--- a/scripts/Compare_energy_results.py 0 → 100644
View file @3b2bbb9
+++ b/scripts/Compare_energy_results.py 0 → 100644
View file @3b2bbb9
+ from math import sqrt, ceil
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import re
+ import seaborn as sns
+ import pandas as pd
+ import matplotlib.pylab as plt
+ 
+ # Retrieve for each rna the best value for MEA and compare this energy value with the one obtains with
+ # RNAeval and RNAfold from the ViennaRNA Package 2.0 (Ronny Lorentz et al., 2011)
+ # After getting those values, it will creates a figure.
+ def get_result_MEA(filename):
+     ext = "json_pmE"
+     file2 = open( "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/results/" + filename + ext, "r")
+ 
+     name = file2.readline()
+     rna = file2.readline()
+     twod = file2.readline()
+     pred = re.findall(r'\S+', twod)
+ 
+     score = '-' + pred[len(pred)-1]
+     min = float(score)
+     contacts = file2.readline()
+     while twod:
+         twod = file2.readline()
+         pred = re.findall(r'\S+', twod)
+         if len(pred) > 0:
+             score = '-' + pred[len(pred) - 1]
+             if float(score) < min:
+                 min = float(score)
+         contacts = file2.readline()
+     file2.close()
+     return min
+ 
+ fileMFE = open( "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/RNAfold_bm.log", "r")
+ lineRna = fileMFE.readline()
+ lineStruct = fileMFE.readline()
+ 
+ fileEval = open( "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/RNAeval_bm.log", "r")
+ lineRna2 = fileEval.readline()
+ lineStruct2 = fileEval.readline()
+ 
+ file = open("/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn", "r")
+ name = file.readline().strip()
+ rna = file.readline()
+ twod = file.readline()
+ contacts = file.readline()
+ list_name = []
+ list_score = []
+ list_type = []
+ print(np)
+ while name:
+     #print(name)
+     if lineRna != rna:
+         while lineRna != rna:
+             lineRna = fileMFE.readline()
+             lineStruct = fileMFE.readline()
+     MFE = float(lineStruct[len(lineStruct)-8:len(lineStruct)-2])
+     list_name.append(name[5:len(name)-1])
+     list_score.append(MFE)
+     list_type.append('MFE')
+     #print("MFE:" + str(MFE))
+     lineRna = fileMFE.readline()
+     lineStruct = fileMFE.readline()
+ 
+     if lineRna2 != rna:
+         while lineRna2 != rna:
+             lineRna2 = fileEval.readline()
+             lineStruct2 = fileEval.readline()
+     eval = float(lineStruct2[len(lineStruct2)-8:len(lineStruct2)-2])
+     list_name.append(name[5:len(name) - 1])
+     list_score.append(eval)
+     list_type.append('eval')
+     #print("Eval:" + str(eval))
+     lineRna2 = fileEval.readline()
+     lineStruct2 = fileEval.readline()
+ 
+     best_mea = get_result_MEA(name)
+     #print("MEA: " + str(best_mea) + "\n")
+     list_name.append(name[5:len(name) - 1])
+     list_score.append(best_mea)
+     list_type.append('MEA')
+     name = file.readline().strip()
+     rna = file.readline()
+     twod = file.readline()
+     contacts = file.readline()
+ 
+ file.close()
+ fileMFE.close()
+ fileEval.close()
+ 
+ '''print(list_MFE)
+ print(list_MEA)
+ print(list_eval)'''
+ 
+ #np = [["rna", "type_score", "score"]]
+ d = {'rna':list_name,'score':list_score, 'type_score':list_type}
+ df = pd.DataFrame(d, columns=['rna','type_score','score'])
+ 
+ sns.stripplot(x="rna",y="score",data=df,jitter=True,hue='type_score',palette='Set1')
+ plt.xticks(rotation=90)
+ plt.savefig("compare_BiORSEOMEA_RNAeval_RNAfold.png")
+ 
+ 
--- a/scripts/add_delimiter.cpp 0 → 100644
View file @3b2bbb9
+++ b/scripts/add_delimiter.cpp 0 → 100644
View file @3b2bbb9
+ #include <iostream>
+ #include <sstream>
+ #include <fstream>
+ #include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
+ #include <typeinfo>
+ #include <set>
+ #include <algorithm>
+ #include <cstdio>
+ #include <vector>
+ 
+ using namespace std;
+ using json = nlohmann::json;
+ 
+ //Count the number of '&' in the motif sequence
+ size_t count_delimiter(string& seq) {
+     size_t count = 0;
+     for(uint i = 0; i < seq.size(); i++) {
+         char c = seq.at(i);
+         if (c == '&') {
+             count++;
+         }
+     }
+     return count;
+ }
+ 
+ /*
+ If there is a '&' in the motif sequence in the field 'sequence' but not in the field 'contacts', 
+ th script put a '&' in the same position in the field 'contacts' than in the field 'sequence'.
+ */
+ void add_delimiter(const string& jsonfile, const string& jsonoutfile) {
+     std::ifstream lib(jsonfile);
+     
+     std::ofstream outfile (jsonoutfile);
+     json new_motif;
+     json new_id;
+ 
+     json js = json::parse(lib);
+     
+     //the list of pfam lists of the motif we want to count the inclusion in other motif
+     for (auto it = js.begin(); it != js.end(); ++it) {
+         string id = it.key();
+         string test;
+         string sequence;
+         string contacts;
+         bool is_change = false;
+ 
+         //cout << "id: " << id << endl;
+         for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {      
+             test = it2.key();
+ 
+             if (!test.compare("sequence")) {
+                 //cout << "sequence: " << it2.value() << endl;
+                 sequence = it2.value();
+                 new_id[test] = it2.value();
+   
+             } else if (!test.compare("contacts") ) {
+                 contacts = it2.value();
+             } else {
+                 new_id[test] = it2.value();
+             }  
+         }
+         string tmp = "";
+         if (count_delimiter(contacts) != count_delimiter(sequence) && contacts.size() == sequence.size()) {
+             for (uint i = 0; i < sequence.size(); i++) {
+                 if (sequence.at(i) == '&') {
+                     tmp += "&";
+                 } else {
+                     tmp += contacts.at(i);
+                 }
+             }
+         } else {
+             tmp = contacts;
+         }
+         new_id["contacts"] = tmp;
+         new_motif[id] = new_id;
+         new_id.clear();
+     }
+     outfile << new_motif.dump(4) << endl;
+     outfile.close();
+     
+ }
+ 
+ int main()
+ {
+     string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json";
+     string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_tmp.json";
+     add_delimiter(jsonfile, out);
+     return 0;
+ }
+     
--- a/scripts/benchmark.py
View file @3b2bbb9
+++ b/scripts/benchmark.py
View file @3b2bbb9
@@ -29,7 +29,7 @@ import pickle
 # ================== DEFINITION OF THE PATHS ==============================
 
 biorseoDir = path.realpath(".")
- jar3dexec = "/home/persalteas/Software/jar3dbin/jar3d_2014-12-11.jar"
+ jar3dexec = "/local/local/localopt/jar3d_2014-12-11.jar"
 bypdir = biorseoDir + "/BayesPairing/bayespairing/src"
 byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src"
 moipdir = "/home/persalteas/Software/RNAMoIP/Src/RNAMoIP.py"
@@ -803,7 +803,7 @@ class Method:
 			else:
 				results_file = outputDir+f"{'' if self.allow_pk else 'no'}PK/"+basename+f".biorseo_{self.data_source.lower()}_{self.placement_method.lower()}_{self.func}"
 				c += ["--bayespaircsv", outputDir+basename+f".{self.data_source.lower()}_{self.placement_method.lower()}.csv"]
- 			c += ["-o", results_file, "--func", self.func]
+ 			c += ["-o", results_file, "--func", self.func, "--MFE"]
 			if not self.allow_pk:
 				c += ["-n"]
 			self.joblist.append(Job(command=c, priority=4, timeout=3600, 
--- a/cppsrc/Comptage des occurences/count_pattern.cpp → scripts/count_pattern.cpp
View file @3b2bbb9
+++ b/cppsrc/Comptage des occurences/count_pattern.cpp → scripts/count_pattern.cpp
View file @3b2bbb9
@@ -11,6 +11,12 @@
 using namespace std;
 using json = nlohmann::json;
 
+ /*
+ This script count the number of "occurrences" of the motif.
+ So we consider that if the sequence of pattern A is included in pattern B,
+ then for each inclusion of B we also have an inclusion of A. And vice versa.
+ */
+ 
 //Return true if the first sequence seq1 is included in the second sequence seq2
 //if not return false
 int is_contains(string& seq1, string& seq2) {
@@ -38,6 +44,8 @@ int is_contains(string& seq1, string& seq2) {
 
 //If we find the sequence and structure of pattern A in pattern B, we have to concatenate the pfam lists of A and B,
 //remove the duplicates, assign this new list of pfam lists to A, and assign as occurrence to A the size of this list.
+ //The pattern A is counted only once in every other pattern, i.e. even if the sequence of A is found several times in B,
+ // it will be added only once in the occurrences of A.
 void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
     std::ifstream lib(jsonfile);
     std::ifstream lib2(jsonfile);
@@ -73,14 +81,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
             if (!test.compare("pfam")) {
                 vector<vector<string>> tab = it2.value();
                 list_pfams = tab;
-                 /*set<set<string>>::iterator iit;
-                 set<string>::iterator iit2;
-                 for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) {
-                     for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) {
-                         cout << *iit2 << endl;
-                     }
-                     cout << endl << endl;
-                 }*/
             } else if (!test.compare("sequence")) {
                 //cout << "sequence: " << it2.value() << endl;
                 sequence = it2.value();
@@ -124,7 +124,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
                 new_id[test] = it2.value();
             }  
         }
-         //cout << "-------begin---------" << endl;
         
         for (auto it3 = js2.begin(); it3 != js2.end(); ++it3) {
             string id2 = it3.key();
@@ -142,22 +141,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
                     if (!test.compare("pfam")) {
                         vector<vector<string>> tab = it4.value();
                         list_pfams2 = tab;
-                         /*for (uint k = 0; k < tab2.size(); k++) {
-                             for (uint l = 0; l < tab2[k].size(); l++) {
-                                 pfams2.insert(tab2[k][l]);
-                             }
-                             list_pfams2.insert(pfams);
-                             pfams2.clear();
-                         }*/
-             
-                         /*set<set<string>>::iterator iit;
-                         set<string>::iterator iit2;
-                         for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) {
-                             for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) {
-                                 cout << *iit2 << endl;
-                             }
-                             cout << endl << endl;
-                         }*/
                     } else if (!test.compare("occurences")) {
                         occurences2 = it4.value();
                         //cout << "occurences2: "<< occurences2 << endl;
@@ -216,7 +199,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
                                                 
                                         }    
                                     }
-                                     //cout << "----end----" << endl;
                                 //}
                             }
                             if(flag) {
@@ -242,23 +224,12 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
             //cout << endl;*/
         }
     
-        
-         /*for(uint ii = 0; ii < list_pfams.size(); ii++) {
-             for (uint jj = 0; jj < list_pfams[ii].size(); jj++) {
-                 cout << "[" << ii << "][" << jj << "]: " << list_pfams[ii][jj] << endl;
-             }
-         }*/
         
         new_id["occurences"] = list_pfams.size();
-         new_id["pfam"] = list_pfams;
-                         
-         //cout << "-------ending---------" << endl;
+         new_id["pfam"] = list_pfams;        
         new_motif[id] = new_id;
         new_id.clear();
-         //cout << "valeur: " << ite << endl;
-         /*for (uint i = 0; i < tab_struc.size() ; i++) {
-         cout << "tab_struc[" << i << "]: " << tab_struc[i] << endl << endl;
-         } */
+ 
     }
     outfile << new_motif.dump(4) << endl;
     outfile.close();
@@ -267,13 +238,11 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
 
 int main()
 {
-     //183
-     //cout << "------------------BEGIN-----------------" << endl;
-     string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/motifs_06-06-2021.json";
-     string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_derniere_version/motifs_final.json";
+ 
+     string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json";
+     string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json";
     counting_occurences(jsonfile, out);
 
-     //cout << "------------------END-----------------" << endl;
     return 0;
 }
     
--- a/scripts/create_files.cpp 0 → 100644
View file @3b2bbb9
+++ b/scripts/create_files.cpp 0 → 100644
View file @3b2bbb9
+ #include <iostream>
+ #include <sstream>
+ #include <fstream>
+ #include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
+ #include <typeinfo>
+ #include <set>
+ #include <algorithm>
+ #include <cstdio>
+ #include <vector>
+ 
+ using namespace std;
+ using json = nlohmann::json;
+ 
+ /*
+ Create a .fasta file for each of the sequence inside the benchmark in json format.
+ Also create a .dbn and .txt file that list the name, sequence, 2d structure and contacts for all sequence in the benchmark file.
+ Those files are useful for the Isaure_benchmark.py script.
+ */
+ void create_files(const string& jsonmotifs) {
+     std::ifstream lib(jsonmotifs);
+     string fasta = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/fasta/";
+     string list = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.txt";
+     string dbn = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn";
+     std::ofstream outlist (list);
+     std::ofstream outdbn (dbn);
+     json js = json::parse(lib);
+     uint count = 0;
+ 
+     for (auto it = js.begin(); it != js.end(); ++it) {    
+         string id = it.key();
+         string name, seq, contacts, structure;
+         for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {      
+             string chain = it2.key();
+             if (chain.compare("pfams") != 0) {
+                 string name = id + "_" + chain;
+                 string filename = fasta + name + ".fa";
+                 std::ofstream outfasta (filename);
+                 outfasta << ">test_" << name << endl;
+                 for (auto it3 = js[id][chain].begin(); it3 != js[id][chain].end(); ++it3) {     
+                     string field = it3.key();
+                     if (!field.compare("sequence")) {
+                         seq = it3.value();
+                         outfasta << seq.substr(0,seq.size()) << endl;
+                         outfasta.close();
+ 
+                     } else if (!field.compare("contacts")) {
+                         contacts = it3.value();
+ 
+                     } else if (!field.compare("struct2d")) {
+                         structure = it3.value();
+                     }
+                 }
+                 if(seq.find('&') == string::npos) {
+                     outlist << ">test_" << name << endl;
+                     outdbn << "test_" << name << "." << endl;
+                     outlist << contacts << endl;
+                     outdbn << seq << endl;
+                     outdbn << structure << endl;
+                     outdbn << contacts << endl;
+                     outlist << seq << endl;
+                     outlist << structure << endl;      
+                     count++;       
+                 }
+             }
+         }
+     }
+     cout << count << " sequences en tout" << endl;
+     lib.close();
+     outlist.close();
+     outdbn.close();
+ }
+ 
+ int main()
+ {
+     string path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/";
+     string jsonbm = path + "modules/ISAURE/benchmark_16-07-2021.json"; 
+     create_files(jsonbm);
+ 
+     return 0;
+ }
+     
--- a/scripts/delete_same_pdb.cpp 0 → 100644
View file @3b2bbb9
+++ b/scripts/delete_same_pdb.cpp 0 → 100644
View file @3b2bbb9
+ #include <iostream>
+ #include <sstream>
+ #include <fstream>
+ #include "/local/local/BiorseoNath/cppsrc/json.hpp"
+ #include <typeinfo>
+ #include <set>
+ #include <algorithm>
+ #include <cstdio>
+ #include <vector>
+ #include <string>
+ 
+ using namespace std;
+ using json = nlohmann::json;
+ 
+ /*
+ This script is use to create a new motif library without a motif that contains the same pdb as the sequence used in input for prediction
+ with BiORSEO.
+ */
+ void delete_redundant_pdb(const string& jsonlibrary, const string& name, const string& jsonoutfile) {
+     std::ifstream lib(jsonlibrary);
+     
+     std::ofstream outfile (jsonoutfile);
+     json new_motif;
+     json new_id;
+     json js = json::parse(lib);
+     
+     for (auto it = js.begin(); it != js.end(); ++it) {
+         string id = it.key();
+         vector<string> list_pdbs;
+         bool is_added = true;
+ 
+         for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {      
+             string field = it2.key();   
+ 
+             if (!field.compare("pdb")) {
+                 vector<string> tab = it2.value();
+                 list_pdbs = tab;
+             } else {
+                 new_id[field] = it2.value();
+             }  
+         }
+ 
+         if (count(list_pdbs.begin(), list_pdbs.end(), name.substr(0, name.size()-2))) {
+             is_added = false;
+         }
+         if (is_added) {      
+             new_id["pdb"] = list_pdbs;     
+             new_motif[id] = new_id;
+         }
+         new_id.clear();
+     }
+     outfile << new_motif.dump(4) << endl;
+     outfile.close(); 
+ }
+ 
+ int main(int argc, char** argv)
+ {
+     string jsonlibrary = "/local/local/BiorseoNath/data/modules/ISAURE/motifs_final.json";
+     string out = "/local/local/BiorseoNath/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json";
+     string name = argv[1];
+     delete_redundant_pdb(jsonlibrary, name, out);
+     return 0;
+ }
+     
--- a/scripts/pareto_visualizer.py
View file @3b2bbb9
+++ b/scripts/pareto_visualizer.py
View file @3b2bbb9
@@ -28,17 +28,18 @@
 from math import sqrt
 import numpy as np
 import matplotlib.pyplot as plt
- from matplotlib import cm 
+ from matplotlib import cm
 import scipy.stats as st
 import sys
 import os
 import subprocess
 import getopt
 
+ 
 class SecStruct:
     def __init__(self, dot_bracket, obj1_value, obj2_value):
         self.dbn = dot_bracket
-         self.objectives = [ obj1_value, obj2_value ]
+         self.objectives = [obj1_value, obj2_value]
         self.basepair_list = self.get_basepairs()
         self.length = len(dot_bracket)
 
@@ -96,9 +97,9 @@ class SecStruct:
         tn = reference_structure.length * (reference_structure.length - 1) * 0.5 - fp - fn - tp
 
         # Compute MCC
-         if (tp+fp == 0):
+         if (tp + fp == 0):
             print("We have an issue : no positives detected ! (linear structure)")
-         return (tp*tn-fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
+         return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
 
 
 class Pareto:
@@ -106,16 +107,16 @@ class Pareto:
         self.predictions = list_of_structs
         self.true_structure = reference
         self.n_pred = len(list_of_structs)
-         self.max_obj1 = max([ s.objectives[0] for s in self.predictions ])
-         self.max_obj2 = max([ s.objectives[1] for s in self.predictions ])
+         self.max_obj1 = max([s.objectives[0] for s in self.predictions])
+         self.max_obj2 = max([s.objectives[1] for s in self.predictions])
         self.index_of_best = self.find_best_solution()
-         
+ 
     def find_best_solution(self):
         # returns the index of the solution of the Pareto set which is the closest
         # to the real 2D structure (the one with the max MCC)
         max_i = -1
         max_mcc = -1
-         for i,s in enumerate(self.predictions):
+         for i, s in enumerate(self.predictions):
             mcc = s.get_MCC_with(self.true_structure)
             if mcc > max_mcc:
                 max_mcc = mcc
@@ -125,15 +126,15 @@ class Pareto:
     def get_normalized_coords(self):
         # retrieves the objective values of the best solution and normlizes them
         coords = self.predictions[self.index_of_best].objectives
-         if self.max_obj1: # avoid divide by zero if all solutions are 0
-             x = coords[0]/self.max_obj1
+         if self.max_obj1:  # avoid divide by zero if all solutions are 0
+             x = coords[0] / self.max_obj1
         else:
             x = 0.5
-         if self.max_obj2: # avoid divide by zero if all solutions are 0
-             y = coords[1]/self.max_obj2
+         if self.max_obj2:  # avoid divide by zero if all solutions are 0
+             y = coords[1] / self.max_obj2
         else:
             y = 0.5
-         return ( x, y )
+         return (x, y)
 
 
 class RNA:
@@ -145,6 +146,8 @@ class RNA:
 
 
 ignored_nt_dict = {}
+ 
+ 
 def is_canonical_nts(seq):
     for c in seq[:-1]:
         if c not in "ACGU":
@@ -155,6 +158,7 @@ def is_canonical_nts(seq):
             return False
     return True
 
+ 
 def is_canonical_bps(struct):
     if "()" in struct:
         return False
@@ -203,6 +207,7 @@ def load_from_dbn(file, header_style=3):
     db.close()
     return container, pkcounter
 
+ 
 def parse_biokop(folder, basename, ext=".biok"):
     solutions = []
     err = 0
@@ -243,6 +248,7 @@ def parse_biokop(folder, basename, ext=".biok"):
             err = 1
     return None, err
 
+ 
 def parse_biorseo(folder, basename, ext):
     solutions = []
     err = 0
@@ -266,6 +272,7 @@ def parse_biorseo(folder, basename, ext):
             err = 1
     return None, err
 
+ 
 def prettify_biorseo(code):
     name = ""
     if "bgsu" in code:
@@ -301,8 +308,8 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf
         print("[%s] Loaded %d solutions in a Pareto set, max(obj1)=%f, max(obj2)=%f" % (rna.basename_, pset.n_pred, pset.max_obj1, pset.max_obj2))
     print("Loaded %d points on %d." % (len(points), len(RNAcontainer)-skipped))
 
-     x = np.array([ p[0] for p in points ])
-     y = np.array([ p[1] for p in points ])
+     x = np.array([p[0] for p in points])
+     y = np.array([p[1] for p in points])
     xmin, xmax = 0, 1
     ymin, ymax = 0, 1
     xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
@@ -316,19 +323,21 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf
     ax[pos].axvline(x=1, alpha=0.2, color='black')
     ax[pos].contourf(xx, yy, f, cmap=cm.Blues, alpha=0.5)
     ax[pos].scatter(x, y, s=25, alpha=0.1)
-     ax[pos].set_xlim((-0.1,1.1))
-     ax[pos].set_ylim((-0.1,1.1))
-     ax[pos].annotate("("+str(len(points))+'/'+str(len(RNAcontainer)-skipped)+" RNAs)", (0.08,0.15))
+     ax[pos].set_xlim((-0.1, 1.1))
+     ax[pos].set_ylim((-0.1, 1.1))
+     ax[pos].set_title(prettify_biorseo(ext[1:]), fontsize=10)
+     ax[pos].annotate("(" + str(len(points)) + '/' + str(len(RNAcontainer)-skipped) + " RNAs)", (0.08, 0.15))
     ax[pos].set_xlabel(xlabel)
     ax[pos].set_ylabel(ylabel)
 
     if nsolutions:
-         ax[pos+1].hist(sizes, bins=range(0, max(sizes)+1, 2), histtype='bar')
-         ax[pos+1].set_xlim((0,max(sizes)+2))
-         ax[pos+1].set_xticks(range(0, max(sizes), 10))
-         ax[pos+1].set_xticklabels(range(0, max(sizes), 10), rotation=90)
-         ax[pos+1].set_xlabel("# solutions")
-         ax[pos+1].set_ylabel("# RNAs")
+         ax[pos + 1].hist(sizes, bins=range(0, max(sizes) + 1, 2), histtype='bar')
+         ax[pos + 1].set_xlim((0, max(sizes) + 2))
+         ax[pos + 1].set_xticks(range(0, max(sizes), 10))
+         ax[pos + 1].set_xticklabels(range(0, max(sizes), 10), rotation=90)
+         ax[pos + 1].set_xlabel("# solutions")
+         ax[pos + 1].set_ylabel("# RNAs")
+ 
 
 if __name__ == "__main__":
     try:
--- a/scripts/pareto_visualizer_json.png 0 → 100644
View file @3b2bbb9
+++ b/scripts/pareto_visualizer_json.png 0 → 100644
View file @3b2bbb9
--- a/scripts/pareto_visualizer_json.py 0 → 100644
View file @3b2bbb9
+++ b/scripts/pareto_visualizer_json.py 0 → 100644
View file @3b2bbb9
--- a/scripts/pareto_visualizer_json_1.png 0 → 100644
View file @3b2bbb9
+++ b/scripts/pareto_visualizer_json_1.png 0 → 100644
View file @3b2bbb9
--- a/scripts/pareto_visualizer_json_MEA_functionE.png 0 → 100644
View file @3b2bbb9
+++ b/scripts/pareto_visualizer_json_MEA_functionE.png 0 → 100644
View file @3b2bbb9
--- a/scripts/pareto_visualizer_json_MFE_MEA_functionE.png 0 → 100644
View file @3b2bbb9
+++ b/scripts/pareto_visualizer_json_MFE_MEA_functionE.png 0 → 100644
View file @3b2bbb9
--- a/scripts/pareto_visualizer_json_MFE_functionE.png 0 → 100644
View file @3b2bbb9
+++ b/scripts/pareto_visualizer_json_MFE_functionE.png 0 → 100644
View file @3b2bbb9
--- a/scripts/selecting_id.cpp 0 → 100644
View file @3b2bbb9
+++ b/scripts/selecting_id.cpp 0 → 100644
View file @3b2bbb9
+ #include <iostream>
+ #include <sstream>
+ #include <fstream>
+ #include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
+ #include <typeinfo>
+ #include <set>
+ #include <algorithm>
+ #include <cstdio>
+ #include <vector>
+ 
+ using namespace std;
+ using json = nlohmann::json;
+ 
+ /*
+ That script will remove from the library all the pattern that match ONLY with the sequence from which it comes from (with the same pdb).
+ */
+ 
+ //To store the pdb and the sequence in the benchmark file. Also stor the corresponding motif id and components based on this sequence.
+ struct data { 
+     //the pdb code (in the name of the sequence)
+     string pdb;
+     //the complete sequence with this pdb code
+     string seq_pdb;
+     //the id of the motif corresponding to this pdb in the library
+     string id;
+     //the module sequence with the components of this motif with the above id
+     string cmp;
+ };
+ typedef struct data data;
+ 
+ //returns the list of pdb codes and the corresponding information from the benchmark file.
+ vector<data> get_list_pdb_benchmark(const string& benchmark) {
+ 
+     fstream bm(benchmark);
+     vector<data> list_pdb_seq;
+     if (bm.is_open()) {
+         string name;
+         string sequence;
+         string structure;
+         string contacts;
+ 
+         while (getline(bm, name)) {
+             data d;
+             int size = name.size();
+             name = name.substr(5,size-6); 
+             getline(bm, sequence);
+             d.pdb = name;
+             d.seq_pdb = sequence;
+             list_pdb_seq.push_back(d);
+ 
+             getline(bm, structure);
+             getline(bm, contacts);
+         }
+         bm.close();
+     }
+     return list_pdb_seq;
+ }
+ 
+ string trim(string str) {
+     int size = str.size();
+     str = str.substr(1, size-2);
+     return str;
+ }
+ 
+ //store the corresponding id and motif to the sequence from the benchmark file
+ data find_id_pattern(string& pdb_pattern, const string& benchmark) {
+     vector<data> l = get_list_pdb_benchmark(benchmark);
+     int size = l.size();
+ 
+     for (data d : l) {
+         string cmp = d.pdb;
+         cmp = cmp.substr(0, d.pdb.size()-2);
+         if (!cmp.compare(pdb_pattern)) {
+             return d;
+         }
+     }
+     return data();
+ }
+ 
+ //Create an array of data ('association'), which consists of each pdb of the benchmark file
+ // with the associated pattern from this sequence.
+ vector<data> find_id(const string& bibli, const string& benchmark) {
+     ifstream lib(bibli);
+     json js = json::parse(lib);
+ 
+     //nam seq_bm et id seq_id
+     vector<data> association;
+     
+     for (auto it = js.begin(); it != js.end(); ++it) {  
+         string id = it.key();
+         data d;
+ 
+         for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) { 
+             string field = it2.key();
+             string seq;
+             if (!field.compare("pdb")) {
+                 int n = js[id][field].size();
+                 for (int i = 0; i < n ; i++) {
+                     ostringstream stream;
+                     stream << js[id][field][i];
+                     string pdb = trim(stream.str());
+                     
+                     d = find_id_pattern(pdb, benchmark);
+                 }
+             }
+ 
+             if (!field.compare("sequence")) {
+                 seq = it2.value();
+ 
+                 if (!(d.pdb.empty())) {                    
+                     d.id = id;
+                     d.cmp = seq;
+                     association.push_back(d);
+                 }
+             }
+         }
+     }
+     lib.close();
+     cout << association.size() << endl;
+     return association;
+ }
+ 
+ //check if the motif is found matching with a complete sequence from a benchmark file.
+ bool does_it_match(const string& seq, const string& seq_motif) {
+     size_t found = seq_motif.find("&");
+     size_t size = seq_motif.size();
+     vector<string> list_cmp;
+     if (found != std::string::npos) {
+         int count = 1;
+         
+         string cmp = seq_motif.substr(0, found);
+         list_cmp.push_back(cmp);
+         while(found != std::string::npos) {
+             size_t begin = found;
+             found = seq_motif.find("&", found + 1);
+             cmp = seq_motif.substr(begin+1, found-begin-1);
+             list_cmp.push_back(cmp);
+             count++;
+         }
+ 
+         found = seq.find(list_cmp[0]);
+         int count2 = 1;
+         while((found != std::string::npos) && (count2 < count)) {
+             size_t begin = found;
+             found = seq.find(list_cmp[count2], found + 1);
+             count2++;
+         }
+ 
+         if(count == count2) {
+             return true;
+         }
+ 
+     } else {
+         found = seq.find(seq_motif);
+         if (found != std::string::npos) {
+             return true;
+         }
+     }
+     return false;
+ }
+ 
+ //return the list of motif id that didn't match with any other complete sequence than the one which it came from.
+ vector<string> select_not_motif(const string& bibli, const string& benchmark) {
+     vector<string> selection;
+     vector<data> association = find_id(bibli, benchmark);
+ 
+     for (data d : association) {
+         selection.push_back(d.id);
+     }
+ 
+     for (data d : association) {
+         for (data d2 : association) {
+             string seq = d.seq_pdb;
+             string seq2 = d2.cmp;
+             bool test = false;
+ 
+             if(d.pdb.substr(0, d.pdb.size()-2) != d2.pdb.substr(0, d2.pdb.size()-2)) {
+                 test = does_it_match(seq, seq2);
+                 if (test) {
+                     cout << "pdb: " << d.pdb << " vs " << d2.pdb << " " << d2.cmp << " " << d2.id << endl;
+                     auto position = find(selection.begin(), selection.end(), d.id);
+                     if (position != selection.end()) {
+                         int index = position - selection.begin();
+                         selection.erase(selection.begin() + index);
+                     }
+                 }
+             }
+         }
+     }
+     sort(selection.begin(), selection.end() );
+     selection.erase(unique(selection.begin(), selection.end() ), selection.end() );
+ 
+     cout << "size: " << selection.size() << endl;
+ 
+     return selection;
+ }
+ 
+ int main()
+ {
+     string bibli = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json";
+     string benchmark = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/benchmark.dbn";
+ 
+     /*vector<data> v = get_list_pdb_benchmark(benchmark);
+     for (data d : v) {
+         cout << d.pdb << ", " << d.seq_pdb << endl;
+     }*/
+ 
+     /*string name = "1U6P_B";
+     data d = find_id_pattern(name, benchmark);
+     cout << "name: " << d.pdb << ", seq: " << d.seq_pdb << endl;*/
+ 
+     /*vector<data> association = find_id(bibli, benchmark);
+     for (data d : association) {
+         cout << "<" << d.pdb << ", " << d.seq_pdb << ">, " << "<" << d.id << ", " << d.cmp << ">" << endl;
+     }*/
+ 
+     /*string seq = "UGCGCUUGGCGUUUUAGAGCUAGAAAUAGCAAGUUAAAAUAAGGCUAGUCCGUUAUCAACUUGAAAAAGUGGCACCGAGUCGGUGCUU";
+     string seq_motif = "UGCGCUUGGCGUUUUAGAGC&GCAAGUUAAAAUAAGGCUAGUCCGUUAUCAA&UGGCACCGAGUCG&U";
+     bool test = does_it_match(seq, seq_motif);
+     cout << test << endl;*/
+ 
+     vector<string> selection = select_not_motif(bibli, benchmark);
+     for (string str : selection) {
+         cout << str << ", ";
+     }
+     cout << endl;
+ 
+     return 0;
+ }
\ No newline at end of file
--- a/scripts/stats.py 0 → 100644
View file @3b2bbb9
+++ b/scripts/stats.py 0 → 100644
View file @3b2bbb9
--- a/scripts/temp/test.fa 0 → 100644
View file @3b2bbb9
+++ b/scripts/temp/test.fa 0 → 100644
View file @3b2bbb9
+ >test
+ CCGGGACCUCUAACCGGGUUCCCGGGCAGUCACUG