Louis BECQUEY

Merge branch 'stage_NBernard' into 'master'

Stage n bernard results



See merge request !1
Showing 75 changed files with 805 additions and 241 deletions
1 -.vscode/*
2 .vscode 1 .vscode
3 2
4 -# LaTeX temporary files
5 -doc/*.toc
6 -doc/*.bbl
7 -doc/*.gz
8 -doc/*.log
9 -doc/*.aux
10 -doc/*.blg
11 -doc/*.fls
12 -doc/*.fdb_latexmk
13 -
14 # Docker installation temporary files 3 # Docker installation temporary files
15 eigen-eigen-323c052e1731 4 eigen-eigen-323c052e1731
16 cplex_installer_12.8_Student.bin 5 cplex_installer_12.8_Student.bin
...@@ -20,7 +9,6 @@ ViennaRNA-2.4.13 ...@@ -20,7 +9,6 @@ ViennaRNA-2.4.13
20 9
21 # Compiled Object files 10 # Compiled Object files
22 obj/* 11 obj/*
23 -doc/*.pdf
24 data/modules/RIN/__pycache__ 12 data/modules/RIN/__pycache__
25 13
26 # Executables 14 # Executables
...@@ -44,4 +32,4 @@ data/modules/RIN ...@@ -44,4 +32,4 @@ data/modules/RIN
44 data/modules/ISAURE 32 data/modules/ISAURE
45 data/sec_structs/bpRNA-1m_90.dbn 33 data/sec_structs/bpRNA-1m_90.dbn
46 data/sec_structs/pseudobase++.dbn 34 data/sec_structs/pseudobase++.dbn
47 - 35 +data/fasta/contacts
......
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
...@@ -9,7 +9,7 @@ CC = g++ ...@@ -9,7 +9,7 @@ CC = g++
9 CFLAGS = -Icppsrc/ -I/usr/local/include -I$(CPLEX)/concert/include -I$(CPLEX)/cplex/include -g -O3 9 CFLAGS = -Icppsrc/ -I/usr/local/include -I$(CPLEX)/concert/include -I$(CPLEX)/cplex/include -g -O3
10 CXXFLAGS = --std=c++17 -Wall -Wpedantic -Wextra -Wno-deprecated-copy -Wno-ignored-attributes 10 CXXFLAGS = --std=c++17 -Wall -Wpedantic -Wextra -Wno-deprecated-copy -Wno-ignored-attributes
11 LINKER = g++ 11 LINKER = g++
12 -LDFLAGS = -L$(CPLEX)/concert/lib/x86-64_linux/static_pic/ -L$(CPLEX)/cplex/lib/x86-64_linux/static_pic/ -lboost_system -lboost_filesystem -lboost_program_options -lgomp -lconcert -lilocplex -lcplex -lpthread -ldl -lRNA -lm 12 +LDFLAGS = -Wno-free-nonheap-object -L$(CPLEX)/concert/lib/x86-64_linux/static_pic/ -L$(CPLEX)/cplex/lib/x86-64_linux/static_pic/ -lboost_system -lboost_filesystem -lboost_program_options -lgomp -lconcert -lilocplex -lcplex -lpthread -ldl -lRNA -lm
13 13
14 # change these to proper directories where each file should be 14 # change these to proper directories where each file should be
15 SRCDIR = cppsrc 15 SRCDIR = cppsrc
...@@ -31,20 +31,8 @@ $(OBJECTS): $(OBJDIR)/%.o : $(SRCDIR)/%.cpp $(INCLUDES) ...@@ -31,20 +31,8 @@ $(OBJECTS): $(OBJDIR)/%.o : $(SRCDIR)/%.cpp $(INCLUDES)
31 $(CC) -c $(CFLAGS) $(CXXFLAGS) $< -o $@ 31 $(CC) -c $(CFLAGS) $(CXXFLAGS) $< -o $@
32 @echo -e "\033[00;32mCompiled "$<".\033[00m" 32 @echo -e "\033[00;32mCompiled "$<".\033[00m"
33 33
34 -doc: mainpdf supppdf
35 - @echo -e "\033[00;32mLaTeX documentation rendered.\033[00m"
36 -
37 -mainpdf: doc/main_bioinformatics.tex doc/references.bib doc/bioinfo.cls doc/natbib.bst
38 - cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics
39 - cd doc; bibtex main_bioinformatics
40 - cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics
41 - cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics
42 -
43 -supppdf: doc/supplementary_material.tex
44 - cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error supplementary_material
45 -
46 .PHONY: all 34 .PHONY: all
47 -all: $(BINDIR)/$(TARGET) doc 35 +all: $(BINDIR)/$(TARGET)
48 36
49 .PHONY: re 37 .PHONY: re
50 re: remove clean all 38 re: remove clean all
......
...@@ -19,6 +19,7 @@ THEN ...@@ -19,6 +19,7 @@ THEN
19 OUTPUT: 19 OUTPUT:
20 - A set of secondary structures from the Pareto front, 20 - A set of secondary structures from the Pareto front,
21 - The list of known modules inserted inplace in the corresponding structures 21 - The list of known modules inserted inplace in the corresponding structures
22 +- A set of positions of the nucleotides in contact with the protein represented by asterisks (only if the motifs_28-05-2021.json library is used!)
22 23
23 2/ The different models 24 2/ The different models
24 ================================== 25 ==================================
...@@ -28,7 +29,8 @@ Biorseo can be used with two modules datasets (yet): ...@@ -28,7 +29,8 @@ Biorseo can be used with two modules datasets (yet):
28 * Rna3Dmotifs (from the work of *Djelloul & Denise, 2008*) 29 * Rna3Dmotifs (from the work of *Djelloul & Denise, 2008*)
29 * The RNA 3D Motif Atlas of BGSU's RNA lab (*Petrov et al, 2013*, see http://rna.bgsu.edu/rna3dhub/motifs/) 30 * The RNA 3D Motif Atlas of BGSU's RNA lab (*Petrov et al, 2013*, see http://rna.bgsu.edu/rna3dhub/motifs/)
30 * CaRNAval 1.0 (*Reinhartz et al, 2018*) 31 * CaRNAval 1.0 (*Reinhartz et al, 2018*)
31 -* RNA-Bricks 2, RNAMC, CaRNAval 2.0, and others could theoretically be used, but are not supported (yet). You might write your own API. 32 +* /data/modules/ISAURE/motifs_28-05-2021.json, a library of motifs from RNAs bound to a protein, provided by Isaure Chauvot de Beauchêne of the LORIA laboratory
33 + (contact:isaure.chauvot-de-beauchene@loria.fr)
32 34
33 PATTERN MATCHING STEP 35 PATTERN MATCHING STEP
34 - Use **simple pattern matching**. Rna3Dmotifs modules are available with sequence information. We use regular expressions to find those known loops in your query. This is the approach of RNA-MoIP (*Reinharz et al, 2012*), we deal the same way with short components and wildcards. 36 - Use **simple pattern matching**. Rna3Dmotifs modules are available with sequence information. We use regular expressions to find those known loops in your query. This is the approach of RNA-MoIP (*Reinharz et al, 2012*), we deal the same way with short components and wildcards.
...@@ -43,6 +45,8 @@ OBJECTIVE FUNCTIONS FOR THE MODULE INSERTION CRITERIA ...@@ -43,6 +45,8 @@ OBJECTIVE FUNCTIONS FOR THE MODULE INSERTION CRITERIA
43 * **Function B** : weights a module by its number of components (strands) and penalizes it by the log^(_2) of its nucleotide size. 45 * **Function B** : weights a module by its number of components (strands) and penalizes it by the log^(_2) of its nucleotide size.
44 * **Function C** : weights a module by its insertion site score (JAR3D or BayesPairing score). 46 * **Function C** : weights a module by its insertion site score (JAR3D or BayesPairing score).
45 * **Function D** : weights a module by its number of components (strands) and insertion site score (JAR3D or BayesPairing score), and penalizes it by the log^(_2) of its nucleotide size. 47 * **Function D** : weights a module by its number of components (strands) and insertion site score (JAR3D or BayesPairing score), and penalizes it by the log^(_2) of its nucleotide size.
48 +* **Function E** : weights a module by its nucleotides in contact with a protein, number of occurrences and number of nucleotides in the module.
49 +* **Function F** : weights a module by its nucleotides in contact with a protein, number of occurrences and number of nucleotides along the entire length of the RNA.
46 50
47 3/ Installation 51 3/ Installation
48 ================================== 52 ==================================
...@@ -55,22 +59,22 @@ Check the file [INSTALL.md](INSTALL.md) for installation instructions. ...@@ -55,22 +59,22 @@ Check the file [INSTALL.md](INSTALL.md) for installation instructions.
55 59
56 - If you **might expect a pseudoknot, or don't know**: 60 - If you **might expect a pseudoknot, or don't know**:
57 * The most promising method is the use of direct pattern matching with Rna3Dmotifs and function A. But this method is sometimes subject to combinatorial explosion issues. If you have a long RNA or a large number of loops, don't use it. Example: 61 * The most promising method is the use of direct pattern matching with Rna3Dmotifs and function A. But this method is sometimes subject to combinatorial explosion issues. If you have a long RNA or a large number of loops, don't use it. Example:
58 - `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --rna3dmotifs --patternmatch --func A` 62 + `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --rna3dmotifs --patternmatch --func A --MEA`
59 63
60 * The use of the RNA 3D Motif Atlas placed by JAR3D and scored with function A is not subject to combinatorial issues, but performs a bit worse. It also returns less solutions. Example: 64 * The use of the RNA 3D Motif Atlas placed by JAR3D and scored with function A is not subject to combinatorial issues, but performs a bit worse. It also returns less solutions. Example:
61 - `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --3dmotifatlas --jar3d --func A 65 + `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --3dmotifatlas --jar3d --func A --MEA`
62 66
63 5/ List of Options 67 5/ List of Options
64 ================================== 68 ==================================
65 ``` 69 ```
66 Usage: You must provide: 70 Usage: You must provide:
67 1) a FASTA input file with -i, 71 1) a FASTA input file with -i,
68 - 2) a module type with --rna3dmotifs, --carnaval or --3dmotifatlas 72 + 2) a module type with --rna3dmotifs, --carnaval, --3dmotifatlas or --contacts
69 3) one module placement method in { --patternmatch, --jar3d, --bayespairing } 73 3) one module placement method in { --patternmatch, --jar3d, --bayespairing }
70 - 4) one scoring function with --func A, B, C or D 74 + 4) one scoring function with --func A, B, C, D, E or F
71 - 75 + 5) one estimator between --MEA and --MFE
72 If you are not using the Docker image: 76 If you are not using the Docker image:
73 - 5) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir) 77 + 6) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir)
74 78
75 Options: 79 Options:
76 -h [ --help ] Print this help message 80 -h [ --help ] Print this help message
...@@ -79,16 +83,21 @@ Options: ...@@ -79,16 +83,21 @@ Options:
79 --rna3dmotifs Use DESC modules from Djelloul & Denise, 2008 83 --rna3dmotifs Use DESC modules from Djelloul & Denise, 2008
80 --carnaval Use RIN modules from Reinharz & al, 2018 84 --carnaval Use RIN modules from Reinharz & al, 2018
81 --3dmotifatlas Use the HL and IL loops from BGSU's 3D Motif Atlas (updated) 85 --3dmotifatlas Use the HL and IL loops from BGSU's 3D Motif Atlas (updated)
86 +--contacts Use the library of motifs, created from RNA sequences linked to proteins provided by I. Chauvot de Beauchene of LORIA laboratory
82 -p [ --patternmatch ] Use regular expressions to place modules in the sequence (requires --rna3dmotifs or --carnaval) 87 -p [ --patternmatch ] Use regular expressions to place modules in the sequence (requires --rna3dmotifs or --carnaval)
83 -j [ --jar3d ] Use JAR3D to place modules in the sequence (requires --3dmotifatlas) 88 -j [ --jar3d ] Use JAR3D to place modules in the sequence (requires --3dmotifatlas)
84 -b [ --bayespairing ] Use BayesPairing2 to place modules in the sequence (requires --rna3dmotifs or --3dmotifatlas) 89 -b [ --bayespairing ] Use BayesPairing2 to place modules in the sequence (requires --rna3dmotifs or --3dmotifatlas)
85 -o [ --output=… ] File to summarize the results 90 -o [ --output=… ] File to summarize the results
86 -O [ --outputf=… ] Folder where to output result and temp files 91 -O [ --outputf=… ] Folder where to output result and temp files
87 --f [ --func=… ] (A, B, C or D, default is B) Objective function to score module insertions: 92 +-f [ --func=… ] (A, B, C, D, E or F, default is B) Objective function to score module insertions:
88 (A) insert big modules (B) insert light, high-order modules 93 (A) insert big modules (B) insert light, high-order modules
89 - (c) insert modules which score well with the sequence 94 + (C) insert modules which score well with the sequence
90 (D) insert light, high-order modules which score well with the sequence. 95 (D) insert light, high-order modules which score well with the sequence.
91 - C and D require cannot be used with --patternmatch. 96 + C and D cannot be used with --patternmatch.
97 + (E) and (F) insert modules with a lot of nucleotides and a lot of nucleotides in contact with a protein, and a huge number of occurrences.
98 + (E) maximizes the number of contact nucleotides inside the module, while (F) maximizes the number of contact nucleotides along the entire length of the RNA.
99 +--MEA Use Maximum Expected Accuracy for the second objective
100 +--MFE Use Minimum Free Energy based on the formula of (*Legendre et al., 2018*) for the second objective
92 -c [ --first-objective=… ] (default 1) Objective to solve in the mono-objective portions of the algorithm. 101 -c [ --first-objective=… ] (default 1) Objective to solve in the mono-objective portions of the algorithm.
93 (1) is the module objective given by --func, (2) is the expected accuracy of the structure. 102 (1) is the module objective given by --func, (2) is the expected accuracy of the structure.
94 -l [ --limit=… ] (default 500) Number of solutions in the Pareto set from which 103 -l [ --limit=… ] (default 500) Number of solutions in the Pareto set from which
...@@ -113,9 +122,9 @@ Options: ...@@ -113,9 +122,9 @@ Options:
113 BiORSEO from outside the docker image. Use the FULL path. 122 BiORSEO from outside the docker image. Use the FULL path.
114 123
115 Examples: 124 Examples:
116 -biorseo.py -i myRNA.fa -O myResultsFolder/ --rna3dmotifs --patternmatch --func B 125 +biorseo.py -i myRNA.fa -O myResultsFolder/ --rna3dmotifs --patternmatch --func B --MEA
117 -biorseo.py -i myRNA.fa -O myResultsFolder/ --3dmotifatlas --jar3d --func B -l 800 126 +biorseo.py -i myRNA.fa -O myResultsFolder/ --3dmotifatlas --jar3d --func B -l 800 --MEA
118 -biorseo.py -i myRNA.fa -v --3dmotifatlas --bayespairing --func D 127 +biorseo.py -i myRNA.fa -v --3dmotifatlas --bayespairing --func D --MEA
119 128
120 The allowed module/placement-method/function combinations are: 129 The allowed module/placement-method/function combinations are:
121 130
...@@ -123,5 +132,6 @@ The allowed module/placement-method/function combinations are: ...@@ -123,5 +132,6 @@ The allowed module/placement-method/function combinations are:
123 --rna3dmotifs A. B. A. B. C. D. 132 --rna3dmotifs A. B. A. B. C. D.
124 --3dmotifatlas A. B. C. D. A. B. C. D. 133 --3dmotifatlas A. B. C. D. A. B. C. D.
125 --carnaval A. B. 134 --carnaval A. B.
135 +--contacts E. F.
126 136
127 ``` 137 ```
......
...@@ -29,11 +29,11 @@ import pickle ...@@ -29,11 +29,11 @@ import pickle
29 # ================== DEFINITION OF THE PATHS ============================== 29 # ================== DEFINITION OF THE PATHS ==============================
30 30
31 biorseoDir = path.realpath(".") 31 biorseoDir = path.realpath(".")
32 -jar3dexec = "/home/persalteas/Software/jar3dbin/jar3d_2014-12-11.jar" 32 +jar3dexec = "/local/local/localopt/jar3d_2014-12-11.jar"
33 bypdir = biorseoDir + "/BayesPairing/bayespairing/src" 33 bypdir = biorseoDir + "/BayesPairing/bayespairing/src"
34 byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src" 34 byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src"
35 -moipdir = "/home/persalteas/Software/RNAMoIP/Src/RNAMoIP.py" 35 +moipdir = "/local/local/localopt/RNAMoIP/Src/RNAMoIP.py"
36 -biokopdir = "/home/persalteas/Software/biokop/biokop" 36 +biokopdir = "/local/local/localopt/biokop/biokop"
37 runDir = path.dirname(path.realpath(__file__)) 37 runDir = path.dirname(path.realpath(__file__))
38 bpRNAFile = argv[1] 38 bpRNAFile = argv[1]
39 PseudobaseFile = argv[2] 39 PseudobaseFile = argv[2]
...@@ -1109,8 +1109,11 @@ def load_from_dbn(file, header_style=3): ...@@ -1109,8 +1109,11 @@ def load_from_dbn(file, header_style=3):
1109 if not '(' in struct: 1109 if not '(' in struct:
1110 continue # ignore linear structures 1110 continue # ignore linear structures
1111 if is_canonical_nts(seq) and is_canonical_bps(struct): 1111 if is_canonical_nts(seq) and is_canonical_bps(struct):
1112 + # keeps what's inside brackets at the end as the filename
1112 if header_style == 1: container.append(RNA(header.replace('/', '_').split('(')[-1][:-1], header, seq, struct)) 1113 if header_style == 1: container.append(RNA(header.replace('/', '_').split('(')[-1][:-1], header, seq, struct))
1114 + # keeps what's inside square brackets at the end as the filename
1113 if header_style == 2: container.append(RNA(header.replace('/', '_').split('[')[-1][:-41], header, seq, struct)) 1115 if header_style == 2: container.append(RNA(header.replace('/', '_').split('[')[-1][:-41], header, seq, struct))
1116 + # keeps all the header as filename
1114 if header_style == 3: container.append(RNA(header[1:], header, seq, struct)) 1117 if header_style == 3: container.append(RNA(header[1:], header, seq, struct))
1115 if '[' in struct: counter += 1 1118 if '[' in struct: counter += 1
1116 db.close() 1119 db.close()
...@@ -1475,8 +1478,8 @@ def print_StudyCase_results(): ...@@ -1475,8 +1478,8 @@ def print_StudyCase_results():
1475 if __name__ == '__main__': 1478 if __name__ == '__main__':
1476 1479
1477 print("> Loading files...", flush=True) 1480 print("> Loading files...", flush=True)
1478 - bpRNAContainer, bpRNA_pk_counter = load_from_dbn(bpRNAFile) 1481 + bpRNAContainer, bpRNA_pk_counter = load_from_dbn(bpRNAFile, header_style=1)
1479 - PseudobaseContainer, Pseudobase_pk_counter = load_from_dbn(PseudobaseFile) 1482 + PseudobaseContainer, Pseudobase_pk_counter = load_from_dbn(PseudobaseFile, header_style=3)
1480 StudycaseContainer, StudyCase_pk_counter = load_from_dbn(StudyCaseFile, header_style=1) 1483 StudycaseContainer, StudyCase_pk_counter = load_from_dbn(StudyCaseFile, header_style=1)
1481 1484
1482 for nt, number in ignored_nt_dict.items(): 1485 for nt, number in ignored_nt_dict.items():
......
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
...@@ -37,6 +37,7 @@ class MOIP ...@@ -37,6 +37,7 @@ class MOIP
37 void forbid_solutions_between(double min, double max); 37 void forbid_solutions_between(double min, double max);
38 IloEnv& get_env(void); 38 IloEnv& get_env(void);
39 static char obj_function_nbr_; // On what criteria do you want to insert motifs ? 39 static char obj_function_nbr_; // On what criteria do you want to insert motifs ?
40 + static char obj_function2_nbr_; // Do you want to use MEA or MFE to determine the best energy score ?
40 static uint obj_to_solve_; // What objective do you prefer to solve in mono-objective portions of the algorithm ? 41 static uint obj_to_solve_; // What objective do you prefer to solve in mono-objective portions of the algorithm ?
41 static double precision_; // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =( 42 static double precision_; // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =(
42 static bool allow_pk_; // Wether we forbid pseudoknots (false) or allow them (true) 43 static bool allow_pk_; // Wether we forbid pseudoknots (false) or allow them (true)
...@@ -47,8 +48,12 @@ class MOIP ...@@ -47,8 +48,12 @@ class MOIP
47 void define_problem_constraints(string& source); 48 void define_problem_constraints(string& source);
48 size_t get_yuv_index(size_t u, size_t v) const; 49 size_t get_yuv_index(size_t u, size_t v) const;
49 size_t get_Cpxi_index(size_t x_i, size_t i_on_j) const; 50 size_t get_Cpxi_index(size_t x_i, size_t i_on_j) const;
51 + size_t get_xij_index(size_t u, size_t v) const;
52 +
50 IloNumExprArg& y(size_t u, size_t v); // Direct reference to y^u_v in basepair_dv_ 53 IloNumExprArg& y(size_t u, size_t v); // Direct reference to y^u_v in basepair_dv_
51 IloNumExprArg& C(size_t x, size_t i); // Direct reference to C_p^xi in insertion_dv_ 54 IloNumExprArg& C(size_t x, size_t i); // Direct reference to C_p^xi in insertion_dv_
55 + IloNumExprArg& x(size_t u, size_t v); // Direct reference to x_i,j in stacks_dv_
56 +
52 bool exists_vertical_outdated_labels(const SecondaryStructure& s) const; 57 bool exists_vertical_outdated_labels(const SecondaryStructure& s) const;
53 bool exists_horizontal_outdated_labels(const SecondaryStructure& s) const; 58 bool exists_horizontal_outdated_labels(const SecondaryStructure& s) const;
54 void allowed_motifs_from_desc(args_of_parallel_func arg_struct); 59 void allowed_motifs_from_desc(args_of_parallel_func arg_struct);
...@@ -66,12 +71,16 @@ class MOIP ...@@ -66,12 +71,16 @@ class MOIP
66 IloEnv env_; // environment CPLEX object 71 IloEnv env_; // environment CPLEX object
67 IloNumVarArray basepair_dv_; // Decision variables 72 IloNumVarArray basepair_dv_; // Decision variables
68 IloNumVarArray insertion_dv_; // Decision variables 73 IloNumVarArray insertion_dv_; // Decision variables
74 + IloNumVarArray stacks_dv_; // Decision variables
75 +
69 IloModel model_; // Solver for objective 1 76 IloModel model_; // Solver for objective 1
70 IloExpr obj1; // Objective function that counts inserted motifs 77 IloExpr obj1; // Objective function that counts inserted motifs
71 IloExpr obj2; // Objective function of expected accuracy 78 IloExpr obj2; // Objective function of expected accuracy
72 vector<vector<size_t>> index_of_Cxip_; // Stores the indexes of the Cxip in insertion_dv_ 79 vector<vector<size_t>> index_of_Cxip_; // Stores the indexes of the Cxip in insertion_dv_
73 vector<size_t> index_of_first_components; // Stores the indexes of Cx1p in insertion_dv_ 80 vector<size_t> index_of_first_components; // Stores the indexes of Cx1p in insertion_dv_
74 vector<vector<size_t>> index_of_yuv_; // Stores the indexes of the y^u_v in basepair_dv_ 81 vector<vector<size_t>> index_of_yuv_; // Stores the indexes of the y^u_v in basepair_dv_
82 +
83 + vector<vector<size_t>> index_of_xij_; //Stores the indexes of the xij variables (BioKop) in stacks_dv_
75 }; 84 };
76 85
77 inline uint MOIP::get_n_solutions(void) const { return pareto_.size(); } 86 inline uint MOIP::get_n_solutions(void) const { return pareto_.size(); }
...@@ -79,6 +88,8 @@ inline uint MOIP::get_n_candidates(void) const { return ins ...@@ -79,6 +88,8 @@ inline uint MOIP::get_n_candidates(void) const { return ins
79 inline const SecondaryStructure& MOIP::solution(uint i) const { return pareto_[i]; } 88 inline const SecondaryStructure& MOIP::solution(uint i) const { return pareto_[i]; }
80 inline IloNumExprArg& MOIP::y(size_t u, size_t v) { return basepair_dv_[get_yuv_index(u, v)]; } 89 inline IloNumExprArg& MOIP::y(size_t u, size_t v) { return basepair_dv_[get_yuv_index(u, v)]; }
81 inline IloNumExprArg& MOIP::C(size_t x, size_t i) { return insertion_dv_[get_Cpxi_index(x, i)]; } 90 inline IloNumExprArg& MOIP::C(size_t x, size_t i) { return insertion_dv_[get_Cpxi_index(x, i)]; }
91 +inline IloNumExprArg& MOIP::x(size_t u, size_t v) { return stacks_dv_[get_xij_index(u, v)]; }
92 +
82 inline SecondaryStructure MOIP::solve_objective(int o) { return solve_objective(o, 0, rna_.get_RNA_length()); } 93 inline SecondaryStructure MOIP::solve_objective(int o) { return solve_objective(o, 0, rna_.get_RNA_length()); }
83 inline IloEnv& MOIP::get_env(void) { return env_; } 94 inline IloEnv& MOIP::get_env(void) { return env_; }
84 95
......
This diff is collapsed. Click to expand it.
...@@ -20,13 +20,7 @@ typedef struct Comp_ { ...@@ -20,13 +20,7 @@ typedef struct Comp_ {
20 pair<uint, uint> pos; 20 pair<uint, uint> pos;
21 size_t k; 21 size_t k;
22 string seq_; 22 string seq_;
23 - uint nb_pairing;
24 Comp_(pair<int, int> p) : pos(p) { k = 1 + pos.second - pos.first; } 23 Comp_(pair<int, int> p) : pos(p) { k = 1 + pos.second - pos.first; }
25 - Comp_(pair<int, int> p, uint nb_pair) : pos(p)
26 - {
27 - k = 1 + pos.second - pos.first;
28 - nb_pairing = nb_pair;
29 - }
30 Comp_(uint start, uint length) : k(length) 24 Comp_(uint start, uint length) : k(length)
31 { 25 {
32 pos.first = start; 26 pos.first = start;
...@@ -64,6 +58,7 @@ class Motif ...@@ -64,6 +58,7 @@ class Motif
64 string get_identifier(void) const; 58 string get_identifier(void) const;
65 vector<Component> comp; 59 vector<Component> comp;
66 vector<Link> links_; 60 vector<Link> links_;
61 + vector<uint> pos_contacts;
67 62
68 size_t contact_; 63 size_t contact_;
69 double tx_occurrences_; 64 double tx_occurrences_;
...@@ -89,7 +84,19 @@ vector<Motif> load_csv(const string& path); ...@@ -89,7 +84,19 @@ vector<Motif> load_csv(const string& path);
89 vector<Motif> load_json_folder(const string& path, const string& rna, bool verbose); 84 vector<Motif> load_json_folder(const string& path, const string& rna, bool verbose);
90 85
91 vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string>& vc); 86 vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string>& vc);
92 -vector<vector<Component>> json_find_next_ones_in(string rna, uint offset, vector<string>& vc, vector<string>& vs); 87 +vector<vector<Component>> json_find_next_ones_in(string rna, uint offset, vector<string>& vc);
88 +
89 +// utilities for Json motifs
90 +size_t count_nucleotide(string&);
91 +size_t count_delimiter(string&);
92 +size_t count_contacts(string&);
93 +string check_motif_sequence(string);
94 +bool checkSecondaryStructure(string);
95 +vector<Link> build_motif_pairs(string&, vector<Component>&);
96 +uint find_max_occurrences(string&);
97 +uint find_max_sequence(string&);
98 +vector<string> find_components(string&, string);
99 +vector<uint> find_contacts(vector<string>&, vector<Component>&);
93 100
94 // utilities to compare secondary structures: 101 // utilities to compare secondary structures:
95 bool operator==(const Motif& m1, const Motif& m2); 102 bool operator==(const Motif& m1, const Motif& m2);
......
This diff is collapsed. Click to expand it.
1 -#include <iostream>
2 -#include <sstream>
3 -#include <fstream>
4 -#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
5 -#include <typeinfo>
6 -#include <set>
7 -#include <algorithm>
8 -#include <cstdio>
9 -#include <vector>
10 -
11 -using namespace std;
12 -using json = nlohmann::json;
13 -
14 -void delete_redundant_pdb(const string& jsonfile, const string& jsontest, const string& jsonoutfile) {
15 - std::ifstream lib(jsonfile);
16 - std::ifstream lib2(jsontest);
17 -
18 - std::ofstream outfile (jsonoutfile);
19 - json new_motif;
20 - json new_id;
21 - json js = json::parse(lib);
22 - json js2 = json::parse(lib2);
23 -
24 - //the list of pfam lists of the motif we want to count the inclusion in other motif
25 - for (auto it = js.begin(); it != js.end(); ++it) {
26 - string id = it.key();
27 - vector<string> list_pdbs;
28 - vector<string> list_pdbs2;
29 - bool is_added = true;
30 -
31 - //cout << "id: " << id << endl;
32 - for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
33 - string test = it2.key();
34 -
35 - if (!test.compare("pdb")) {
36 - vector<string> tab = it2.value();
37 - list_pdbs = tab;
38 - /*set<set<string>>::iterator iit;
39 - set<string>::iterator iit2;
40 - for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) {
41 - for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) {
42 - cout << *iit2 << endl;
43 - }
44 - cout << endl << endl;
45 - }*/
46 - } else {
47 - new_id[test] = it2.value();
48 - }
49 - }
50 - //cout << "-------begin---------" << endl;
51 -
52 - for (auto it3 = js2.begin(); it3 != js2.end(); ++it3) {
53 - string id2 = it3.key();
54 -
55 - //cout << "id: " << id << " / id2: " << id2 << endl;
56 - for (auto it4 = js[id2].begin(); it4 != js[id2].end(); ++it4) {
57 - string test = it4.key();
58 -
59 - if (!test.compare("pdb")) {
60 - vector<string> tab = it4.value();
61 - list_pdbs2 = tab;
62 -
63 - //cout << id << " / " << id2 << endl;
64 - for (uint k = 0; k < list_pdbs2.size(); k++) {
65 - if (count(list_pdbs.begin(), list_pdbs.end(), list_pdbs2[k])) {
66 - is_added = false;
67 - }
68 - //cout << list_pdbs2[k] << endl;
69 - }
70 -
71 - }
72 -
73 - }
74 - //cout << endl;*/
75 - }
76 -
77 -
78 - /*for(uint ii = 0; ii < list_pfams.size(); ii++) {
79 - for (uint jj = 0; jj < list_pfams[ii].size(); jj++) {
80 - cout << "[" << ii << "][" << jj << "]: " << list_pfams[ii][jj] << endl;
81 - }
82 - }*/
83 - if (is_added) {
84 - new_id["pdb"] = list_pdbs;
85 - new_motif[id] = new_id;
86 - }
87 - new_id.clear();
88 - //cout << "valeur: " << ite << endl;
89 - /*for (uint i = 0; i < tab_struc.size() ; i++) {
90 - cout << "tab_struc[" << i << "]: " << tab_struc[i] << endl << endl;
91 - } */
92 - }
93 - outfile << new_motif.dump(4) << endl;
94 - outfile.close();
95 -}
96 -
97 -int main()
98 -{
99 - string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/bibli_test2.json";
100 - string jsontest = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark_test.json";
101 - string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_derniere_version/motifs_final_test.json";
102 - delete_redundant_pdb(jsonfile, jsontest, out);
103 - return 0;
104 -}
105 -
...@@ -3,11 +3,13 @@ ...@@ -3,11 +3,13 @@
3 #include <algorithm> 3 #include <algorithm>
4 #include <boost/format.hpp> 4 #include <boost/format.hpp>
5 5
6 +#define RESET "\033[0m"
7 +#define RED "\033[31m" /* Red */
8 +
6 using std::abs; 9 using std::abs;
7 using std::cout; 10 using std::cout;
8 using std::endl; 11 using std::endl;
9 12
10 -
11 SecondaryStructure::SecondaryStructure() {} 13 SecondaryStructure::SecondaryStructure() {}
12 14
13 15
...@@ -98,6 +100,26 @@ string SecondaryStructure::to_DBN(void) const ...@@ -98,6 +100,26 @@ string SecondaryStructure::to_DBN(void) const
98 return res; 100 return res;
99 } 101 }
100 102
103 +string structure_with_contacts(const SecondaryStructure& ss) {
104 + string sequence = ss.rna_.get_seq();
105 + string construct = "";
106 + bool flag;
107 + for (uint i = 0; i < sequence.size(); i++) {
108 + flag = false;
109 + for (const Motif& m : ss.motif_info_) {
110 + for (uint j = 0; j < m.pos_contacts.size(); j++) {
111 + if (m.pos_contacts[j] == i) flag = true;
112 + }
113 + }
114 + if (flag) {
115 + construct += "*";
116 + } else {
117 + construct += ".";
118 + }
119 + }
120 + return construct;
121 +}
122 +
101 string SecondaryStructure::to_string(void) const 123 string SecondaryStructure::to_string(void) const
102 { 124 {
103 string s; 125 string s;
...@@ -119,13 +141,35 @@ void SecondaryStructure::set_basepair(uint i, uint j) ...@@ -119,13 +141,35 @@ void SecondaryStructure::set_basepair(uint i, uint j)
119 141
120 void SecondaryStructure::insert_motif(const Motif& m) { motif_info_.push_back(m); } 142 void SecondaryStructure::insert_motif(const Motif& m) { motif_info_.push_back(m); }
121 143
122 - 144 +void colored_contacts(string sequence, vector<Motif> motif_info_) {
145 + bool flag;
146 + for (uint i = 0; i < sequence.size(); i++) {
147 + flag = false;
148 + for (const Motif& m : motif_info_) {
149 + for (uint j = 0; j < m.pos_contacts.size(); j++) {
150 + if (m.pos_contacts[j] == i) flag = true;
151 + }
152 + }
153 + if (flag) {
154 + cout << RED << sequence[i] << RESET;
155 + } else {
156 + cout << sequence[i];
157 + }
158 + }
159 +}
123 160
124 void SecondaryStructure::print(void) const 161 void SecondaryStructure::print(void) const
125 { 162 {
126 cout << endl; 163 cout << endl;
127 - cout << '\t' << rna_.get_seq() << endl; 164 + cout << '\t';
128 - cout << '\t' << to_string() << endl; 165 + colored_contacts(rna_.get_seq(), motif_info_);
166 + //rna_.get_seq()
167 + cout << endl;
168 + string ss = to_string();
169 + cout << '\t';
170 + colored_contacts(ss, motif_info_);
171 + //cout << ss;
172 + cout << endl;
129 for (const Motif& m : motif_info_) { 173 for (const Motif& m : motif_info_) {
130 uint i = 0; 174 uint i = 0;
131 cout << '\t'; 175 cout << '\t';
......
...@@ -30,7 +30,6 @@ class SecondaryStructure ...@@ -30,7 +30,6 @@ class SecondaryStructure
30 string to_DBN() const; 30 string to_DBN() const;
31 string to_string() const; 31 string to_string() const;
32 32
33 -
34 vector<double> objective_scores_; // values of the different objective functions for that SecondaryStructure 33 vector<double> objective_scores_; // values of the different objective functions for that SecondaryStructure
35 vector<pair<uint, uint>> basepairs_; // values of the decision variable of the integer program 34 vector<pair<uint, uint>> basepairs_; // values of the decision variable of the integer program
36 vector<Motif> motif_info_; // information about known motives in this secondary structure and their positions 35 vector<Motif> motif_info_; // information about known motives in this secondary structure and their positions
...@@ -58,5 +57,7 @@ inline void SecondaryStructure::set_objective_score(int i, double s) { objecti ...@@ -58,5 +57,7 @@ inline void SecondaryStructure::set_objective_score(int i, double s) { objecti
58 inline uint SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); } 57 inline uint SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); }
59 inline uint SecondaryStructure::get_n_bp(void) const { return nBP_; } 58 inline uint SecondaryStructure::get_n_bp(void) const { return nBP_; }
60 59
60 +string structure_with_contacts(const SecondaryStructure& ss);
61 +
61 62
62 #endif // SECONDARY_STRUCTURE_ 63 #endif // SECONDARY_STRUCTURE_
...\ No newline at end of file ...\ No newline at end of file
......
This diff is collapsed. Click to expand it.
No preview for this file type
...@@ -58,12 +58,49 @@ RNA::RNA(string name, string seq, bool verbose) ...@@ -58,12 +58,49 @@ RNA::RNA(string name, string seq, bool verbose)
58 pij_(results->i-1,results->j-1) = results->p; 58 pij_(results->i-1,results->j-1) = results->p;
59 results++; 59 results++;
60 } 60 }
61 +
62 + /*define type_*/
63 + type_ = vector<vector<int>>(n_, vector<int>(n_));
64 + for(uint i = 0; i < n_; i++){
65 + for(uint j = 0; j < n_; j++){
66 + if (i < j){
67 + std::stringstream ss;
68 + ss << seq_[i] << seq_[j];
69 + std::string str = ss.str();
70 + if(str.compare("AU") == 0 ){
71 + type_[i][j] = 1;
72 + }
73 + else if(str.compare("CG") == 0 ){
74 + type_[i][j] = 2;
75 +
76 + }
77 + else if(str.compare("GC") == 0 ){
78 + type_[i][j] = 3;
79 + }
80 + else if(str.compare("GU") == 0 ){
81 + type_[i][j] = 4;
82 + }
83 + else if(str.compare("UG") == 0 ){
84 + type_[i][j] = 5;
85 + }
86 + else if(str.compare("UA") == 0 ){
87 + type_[i][j] = 6;
88 + }
89 + else{
90 + type_[i][j] = 0;
91 + }
92 + }
93 + else{
94 + type_[i][j] = 0;
95 + }
96 + }
97 + }
98 +
61 } 99 }
62 100
63 else cerr << "NULL result returned by vrna_pfl_fold" << endl; 101 else cerr << "NULL result returned by vrna_pfl_fold" << endl;
64 } 102 }
65 103
66 -
67 void RNA::print_basepair_p_matrix(float theta) const 104 void RNA::print_basepair_p_matrix(float theta) const
68 { 105 {
69 cout << endl; 106 cout << endl;
......
...@@ -32,6 +32,8 @@ class RNA ...@@ -32,6 +32,8 @@ class RNA
32 uint get_RNA_length(void) const; 32 uint get_RNA_length(void) const;
33 void print_basepair_p_matrix(float theta) const; 33 void print_basepair_p_matrix(float theta) const;
34 34
35 + vector<vector<int>> get_type();
36 +
35 bool verbose_; // Should we print things ? 37 bool verbose_; // Should we print things ?
36 38
37 private: 39 private:
...@@ -41,10 +43,15 @@ class RNA ...@@ -41,10 +43,15 @@ class RNA
41 string seq_; // sequence of the rna with chars 43 string seq_; // sequence of the rna with chars
42 uint n_; // length of the rna 44 uint n_; // length of the rna
43 MatrixXf pij_; // matrix of basepair probabilities 45 MatrixXf pij_; // matrix of basepair probabilities
46 +
47 + vector<vector<int>> type_; //vector of base pair types
44 }; 48 };
45 49
46 inline float RNA::get_pij(int i, int j) { return pij_(i, j); } 50 inline float RNA::get_pij(int i, int j) { return pij_(i, j); }
47 inline uint RNA::get_RNA_length() const { return n_; } 51 inline uint RNA::get_RNA_length() const { return n_; }
48 inline string RNA::get_seq(void) const { return seq_; } 52 inline string RNA::get_seq(void) const { return seq_; }
49 53
54 +inline vector<vector<int>> RNA::get_type() { return type_; }
55 +
56 +
50 #endif 57 #endif
......
1 ->__'CRYSTAL_STRUCTURE_OF_A_TIGHT-BINDING_GLUTAMINE_TRNA_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE_'_(PDB_00376) 1 +>test_CRYSTAL_STRUCTURE_OF_A_TIGHT-BINDING_GLUTAMINE_TRNA_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE__PDB_00376
2 -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA 2 +GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA
3 ->__'GUANINE_RIBOSWITCH_U22C,_A52G_MUTANT_BOUND_TO_HYPOXANTHINE_'_(PDB_01023) 3 +>test_GUANINE_RIBOSWITCH_U22C,_A52G_MUTANT_BOUND_TO_HYPOXANTHINE__PDB_01023
4 -GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa 4 +GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa
5 ->__'SOLUTION_STRUCTURE_OF_THE_P2B-P3_PSEUDOKNOT_FROM_HUMAN_TELOMERASE_RNA_'_(PDB_00857) 5 +>test_SOLUTION_STRUCTURE_OF_THE_P2B-P3_PSEUDOKNOT_FROM_HUMAN_TELOMERASE_RNA__PDB_00857
6 -GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA 6 +GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA
...\ No newline at end of file ...\ No newline at end of file
......
File mode changed
1 -> JSON1000_extended
2 -AAUAUCCGGGCGUUUAAUCCCGGGAUAAA
...\ No newline at end of file ...\ No newline at end of file
1 +The motif library used with --contacts is a special one. It was provided by Isaure Chauvot de Beauchêne from the LORIA
2 +laboratory. Its motifs are made up of RNA fragments bound to proteins.
3 +==================================================================================================================
4 +
5 +Several versions of these motifs have been provided, but the most complete is the latest one: 'motifs_06-06-2021.json'.
6 +The current scripts were written for this file and do not work with the older libraries.
7 +
8 +There are also 2 benchmark files in json format: 'benchmark_16-06-2021.json' and 'benchmark_16-07-2021.json'.
9 +They contain complete RNA sequences that bind to a protein: the first one contains only 33 RNAs, and the second one
10 +contains 130 RNAs.
11 +
12 +The benchmark.dbn and benchmark.txt were created based on the 'benchmark_16-07-2021.json'.
13 +They are mostly used for the Isaure_benchmark.py script and scripts from the 'scripts' directory.
14 +
15 +The motifs_final.json file is obtained by running the count_pattern.cpp script from the 'scripts' directory on
16 +the 'motifs_06-06-2021.json' motifs file.
17 +This script counts the number of "occurrences" of each motif. We consider that if the sequence of motif A
18 +is included in motif B, then each inclusion of B also counts as an inclusion of A, and vice versa.
19 +
20 +The motif library used by BiORSEO is the one in the 'bibliotheque_a_lire' directory. That directory should contain only
21 +the json file we want BiORSEO to use for its predictions. That's why you shouldn't put any other kind of file there!
22 +
23 +
24 +
25 +
26 +
27 +
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
File mode changed
File mode changed
File mode changed
File mode changed
No preview for this file type
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
1 +from math import sqrt, ceil
2 +import numpy as np
3 +import matplotlib.pyplot as plt
4 +import re
5 +import seaborn as sns
6 +import pandas as pd
7 +import matplotlib.pylab as plt
8 +
# Retrieve, for one RNA, the best (lowest) value found in its BiORSEO result file. The caller
# compares it with the energies reported by RNAeval and RNAfold from the ViennaRNA Package 2.0
# (Ronny Lorentz et al., 2011) and then draws a figure.
def get_result_MEA(filename,
                   results_dir="/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/results/",
                   ext="json_pmE"):
    """Return the lowest negated score found in the result file of one RNA.

    The file read is ``results_dir + filename + ext``. Its first two lines are skipped
    (the original code named them ``name`` and ``rna``). The remaining lines are read in
    pairs: the last whitespace-separated token of the first line of each pair is
    prefixed with '-' and converted to float, and the second line of each pair (named
    ``contacts`` originally) is ignored. Pairs whose first line is blank are skipped.

    Parameters:
        filename: base name of the result file, without the extension.
        results_dir: directory prefix, concatenated as is (must end with a separator).
        ext: extension appended to ``filename``.

    Returns:
        The smallest of the negated scores, as a float.

    Raises:
        ValueError: if the file holds no score at all, or a score token is not numeric.
        OSError: if the file cannot be opened.
    """
    best = None
    # 'with' guarantees the handle is closed even if a score fails to parse.
    with open(results_dir + filename + ext, "r") as results:
        results.readline()  # header line
        results.readline()  # sequence line
        while True:
            structure_line = results.readline()
            if not structure_line:  # readline() returns '' only at end of file
                break
            tokens = structure_line.split()
            if tokens:
                score = float('-' + tokens[-1])
                if best is None or score < best:
                    best = score
            results.readline()  # contacts line paired with the structure line
    if best is None:
        raise ValueError("no score found in " + results_dir + filename + ext)
    return best
34 +
# Inputs of the comparison. Each log is read as (sequence line, structure line) pairs and the
# benchmark as (name, sequence, structure, contacts) records, mirroring the reads below.
RNAFOLD_LOG = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/RNAfold_bm.log"
RNAEVAL_LOG = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/RNAeval_bm.log"
BENCHMARK_DBN = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn"


def _seek_record(log, header, body, target, log_path):
    """Advance `log` two lines at a time until the first line of a pair equals `target`.

    `header` and `body` are the pair read most recently. Returns the second line of the
    matching pair. Raises ValueError when the end of the log is reached first; the
    previous code looped forever in that case because readline() keeps returning ''.
    """
    while header != target:
        if not header:
            raise ValueError("sequence %r not found in %s" % (target.strip(), log_path))
        header = log.readline()
        body = log.readline()
    return body


def _trailing_energy(line):
    """Parse the fixed-width slice of `line` that ends two characters before its end."""
    return float(line[len(line)-8:len(line)-2])


list_name = []
list_score = []
list_type = []

with open(RNAFOLD_LOG, "r") as fold_log, \
        open(RNAEVAL_LOG, "r") as eval_log, \
        open(BENCHMARK_DBN, "r") as benchmark:
    fold_seq = fold_log.readline()
    fold_struct = fold_log.readline()
    eval_seq = eval_log.readline()
    eval_struct = eval_log.readline()

    name = benchmark.readline().strip()
    rna = benchmark.readline()
    benchmark.readline()  # 2D structure of the record, not used here
    benchmark.readline()  # contacts of the record, not used here

    while name:
        label = name[5:-1]  # drop the first 5 characters and the last one of the record name

        # Value taken from the RNAfold log for this sequence
        fold_struct = _seek_record(fold_log, fold_seq, fold_struct, rna, RNAFOLD_LOG)
        list_name.append(label)
        list_score.append(_trailing_energy(fold_struct))
        list_type.append('MFE')
        fold_seq = fold_log.readline()
        fold_struct = fold_log.readline()

        # Value taken from the RNAeval log for this sequence
        eval_struct = _seek_record(eval_log, eval_seq, eval_struct, rna, RNAEVAL_LOG)
        list_name.append(label)
        list_score.append(_trailing_energy(eval_struct))
        list_type.append('eval')
        eval_seq = eval_log.readline()
        eval_struct = eval_log.readline()

        # Best value found in the BiORSEO result file of this RNA
        list_name.append(label)
        list_score.append(get_result_MEA(name))
        list_type.append('MEA')

        name = benchmark.readline().strip()
        rna = benchmark.readline()
        benchmark.readline()
        benchmark.readline()

# One row per (rna, kind of score); the three kinds are distinguished by colour in the plot.
d = {'rna':list_name,'score':list_score, 'type_score':list_type}
df = pd.DataFrame(d, columns=['rna','type_score','score'])

sns.stripplot(x="rna",y="score",data=df,jitter=True,hue='type_score',palette='Set1')
plt.xticks(rotation=90)
plt.savefig("compare_BiORSEOMEA_RNAeval_RNAfold.png")
103 +
104 +
1 +#include <iostream>
2 +#include <sstream>
3 +#include <fstream>
4 +#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
5 +#include <typeinfo>
6 +#include <set>
7 +#include <algorithm>
8 +#include <cstdio>
9 +#include <vector>
10 +
11 +using namespace std;
12 +using json = nlohmann::json;
13 +
// Count the number of '&' in the motif sequence.
// Takes the string by const reference so that const strings and temporaries can be passed too.
size_t count_delimiter(const std::string& seq) {
    return static_cast<size_t>(std::count(seq.begin(), seq.end(), '&'));
}
25 +
26 +/*
27 +If there is a '&' in the motif sequence in the field 'sequence' but not in the field 'contacts',
28 +th script put a '&' in the same position in the field 'contacts' than in the field 'sequence'.
29 +*/
30 +void add_delimiter(const string& jsonfile, const string& jsonoutfile) {
31 + std::ifstream lib(jsonfile);
32 +
33 + std::ofstream outfile (jsonoutfile);
34 + json new_motif;
35 + json new_id;
36 +
37 + json js = json::parse(lib);
38 +
39 + //the list of pfam lists of the motif we want to count the inclusion in other motif
40 + for (auto it = js.begin(); it != js.end(); ++it) {
41 + string id = it.key();
42 + string test;
43 + string sequence;
44 + string contacts;
45 + bool is_change = false;
46 +
47 + //cout << "id: " << id << endl;
48 + for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
49 + test = it2.key();
50 +
51 + if (!test.compare("sequence")) {
52 + //cout << "sequence: " << it2.value() << endl;
53 + sequence = it2.value();
54 + new_id[test] = it2.value();
55 +
56 + } else if (!test.compare("contacts") ) {
57 + contacts = it2.value();
58 + } else {
59 + new_id[test] = it2.value();
60 + }
61 + }
62 + string tmp = "";
63 + if (count_delimiter(contacts) != count_delimiter(sequence) && contacts.size() == sequence.size()) {
64 + for (uint i = 0; i < sequence.size(); i++) {
65 + if (sequence.at(i) == '&') {
66 + tmp += "&";
67 + } else {
68 + tmp += contacts.at(i);
69 + }
70 + }
71 + } else {
72 + tmp = contacts;
73 + }
74 + new_id["contacts"] = tmp;
75 + new_motif[id] = new_id;
76 + new_id.clear();
77 + }
78 + outfile << new_motif.dump(4) << endl;
79 + outfile.close();
80 +
81 +}
82 +
83 +int main()
84 +{
85 + string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json";
86 + string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_tmp.json";
87 + add_delimiter(jsonfile, out);
88 + return 0;
89 +}
90 +
...@@ -29,7 +29,7 @@ import pickle ...@@ -29,7 +29,7 @@ import pickle
29 # ================== DEFINITION OF THE PATHS ============================== 29 # ================== DEFINITION OF THE PATHS ==============================
30 30
31 biorseoDir = path.realpath(".") 31 biorseoDir = path.realpath(".")
32 -jar3dexec = "/home/persalteas/Software/jar3dbin/jar3d_2014-12-11.jar" 32 +jar3dexec = "/local/local/localopt/jar3d_2014-12-11.jar"
33 bypdir = biorseoDir + "/BayesPairing/bayespairing/src" 33 bypdir = biorseoDir + "/BayesPairing/bayespairing/src"
34 byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src" 34 byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src"
35 moipdir = "/home/persalteas/Software/RNAMoIP/Src/RNAMoIP.py" 35 moipdir = "/home/persalteas/Software/RNAMoIP/Src/RNAMoIP.py"
...@@ -803,7 +803,7 @@ class Method: ...@@ -803,7 +803,7 @@ class Method:
803 else: 803 else:
804 results_file = outputDir+f"{'' if self.allow_pk else 'no'}PK/"+basename+f".biorseo_{self.data_source.lower()}_{self.placement_method.lower()}_{self.func}" 804 results_file = outputDir+f"{'' if self.allow_pk else 'no'}PK/"+basename+f".biorseo_{self.data_source.lower()}_{self.placement_method.lower()}_{self.func}"
805 c += ["--bayespaircsv", outputDir+basename+f".{self.data_source.lower()}_{self.placement_method.lower()}.csv"] 805 c += ["--bayespaircsv", outputDir+basename+f".{self.data_source.lower()}_{self.placement_method.lower()}.csv"]
806 - c += ["-o", results_file, "--func", self.func] 806 + c += ["-o", results_file, "--func", self.func, "--MFE"]
807 if not self.allow_pk: 807 if not self.allow_pk:
808 c += ["-n"] 808 c += ["-n"]
809 self.joblist.append(Job(command=c, priority=4, timeout=3600, 809 self.joblist.append(Job(command=c, priority=4, timeout=3600,
......
...@@ -11,6 +11,12 @@ ...@@ -11,6 +11,12 @@
11 using namespace std; 11 using namespace std;
12 using json = nlohmann::json; 12 using json = nlohmann::json;
13 13
14 +/*
15 +This script count the number of "occurrences" of the motif.
16 +So we consider that if the sequence of pattern A is included in pattern B,
17 +then for each inclusion of B we also have an inclusion of A. And vice versa.
18 +*/
19 +
14 //Return true if the first sequence seq1 is included in the second sequence seq2 20 //Return true if the first sequence seq1 is included in the second sequence seq2
15 //if not return false 21 //if not return false
16 int is_contains(string& seq1, string& seq2) { 22 int is_contains(string& seq1, string& seq2) {
...@@ -38,6 +44,8 @@ int is_contains(string& seq1, string& seq2) { ...@@ -38,6 +44,8 @@ int is_contains(string& seq1, string& seq2) {
38 44
39 //If we find the sequence and structure of pattern A in pattern B, we have to concatenate the pfam lists of A and B, 45 //If we find the sequence and structure of pattern A in pattern B, we have to concatenate the pfam lists of A and B,
40 //remove the duplicates, assign this new list of pfam lists to A, and assign as occurrence to A the size of this list. 46 //remove the duplicates, assign this new list of pfam lists to A, and assign as occurrence to A the size of this list.
47 +//The pattern A is counted only once in every other pattern, i.e. even if the sequence of A is found several times in B,
48 +// it will be added only once in the occurrences of A.
41 void counting_occurences(const string& jsonfile, const string& jsonoutfile) { 49 void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
42 std::ifstream lib(jsonfile); 50 std::ifstream lib(jsonfile);
43 std::ifstream lib2(jsonfile); 51 std::ifstream lib2(jsonfile);
...@@ -73,14 +81,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { ...@@ -73,14 +81,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
73 if (!test.compare("pfam")) { 81 if (!test.compare("pfam")) {
74 vector<vector<string>> tab = it2.value(); 82 vector<vector<string>> tab = it2.value();
75 list_pfams = tab; 83 list_pfams = tab;
76 - /*set<set<string>>::iterator iit;
77 - set<string>::iterator iit2;
78 - for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) {
79 - for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) {
80 - cout << *iit2 << endl;
81 - }
82 - cout << endl << endl;
83 - }*/
84 } else if (!test.compare("sequence")) { 84 } else if (!test.compare("sequence")) {
85 //cout << "sequence: " << it2.value() << endl; 85 //cout << "sequence: " << it2.value() << endl;
86 sequence = it2.value(); 86 sequence = it2.value();
...@@ -124,7 +124,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { ...@@ -124,7 +124,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
124 new_id[test] = it2.value(); 124 new_id[test] = it2.value();
125 } 125 }
126 } 126 }
127 - //cout << "-------begin---------" << endl;
128 127
129 for (auto it3 = js2.begin(); it3 != js2.end(); ++it3) { 128 for (auto it3 = js2.begin(); it3 != js2.end(); ++it3) {
130 string id2 = it3.key(); 129 string id2 = it3.key();
...@@ -142,22 +141,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { ...@@ -142,22 +141,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
142 if (!test.compare("pfam")) { 141 if (!test.compare("pfam")) {
143 vector<vector<string>> tab = it4.value(); 142 vector<vector<string>> tab = it4.value();
144 list_pfams2 = tab; 143 list_pfams2 = tab;
145 - /*for (uint k = 0; k < tab2.size(); k++) {
146 - for (uint l = 0; l < tab2[k].size(); l++) {
147 - pfams2.insert(tab2[k][l]);
148 - }
149 - list_pfams2.insert(pfams);
150 - pfams2.clear();
151 - }*/
152 -
153 - /*set<set<string>>::iterator iit;
154 - set<string>::iterator iit2;
155 - for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) {
156 - for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) {
157 - cout << *iit2 << endl;
158 - }
159 - cout << endl << endl;
160 - }*/
161 } else if (!test.compare("occurences")) { 144 } else if (!test.compare("occurences")) {
162 occurences2 = it4.value(); 145 occurences2 = it4.value();
163 //cout << "occurences2: "<< occurences2 << endl; 146 //cout << "occurences2: "<< occurences2 << endl;
...@@ -216,7 +199,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { ...@@ -216,7 +199,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
216 199
217 } 200 }
218 } 201 }
219 - //cout << "----end----" << endl;
220 //} 202 //}
221 } 203 }
222 if(flag) { 204 if(flag) {
...@@ -242,23 +224,12 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { ...@@ -242,23 +224,12 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
242 //cout << endl;*/ 224 //cout << endl;*/
243 } 225 }
244 226
245 -
246 - /*for(uint ii = 0; ii < list_pfams.size(); ii++) {
247 - for (uint jj = 0; jj < list_pfams[ii].size(); jj++) {
248 - cout << "[" << ii << "][" << jj << "]: " << list_pfams[ii][jj] << endl;
249 - }
250 - }*/
251 227
252 new_id["occurences"] = list_pfams.size(); 228 new_id["occurences"] = list_pfams.size();
253 - new_id["pfam"] = list_pfams; 229 + new_id["pfam"] = list_pfams;
254 -
255 - //cout << "-------ending---------" << endl;
256 new_motif[id] = new_id; 230 new_motif[id] = new_id;
257 new_id.clear(); 231 new_id.clear();
258 - //cout << "valeur: " << ite << endl; 232 +
259 - /*for (uint i = 0; i < tab_struc.size() ; i++) {
260 - cout << "tab_struc[" << i << "]: " << tab_struc[i] << endl << endl;
261 - } */
262 } 233 }
263 outfile << new_motif.dump(4) << endl; 234 outfile << new_motif.dump(4) << endl;
264 outfile.close(); 235 outfile.close();
...@@ -267,13 +238,11 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { ...@@ -267,13 +238,11 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
267 238
268 int main() 239 int main()
269 { 240 {
270 - //183 241 +
271 - //cout << "------------------BEGIN-----------------" << endl; 242 + string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json";
272 - string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/motifs_06-06-2021.json"; 243 + string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json";
273 - string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_derniere_version/motifs_final.json";
274 counting_occurences(jsonfile, out); 244 counting_occurences(jsonfile, out);
275 245
276 - //cout << "------------------END-----------------" << endl;
277 return 0; 246 return 0;
278 } 247 }
279 248
......
1 +#include <iostream>
2 +#include <sstream>
3 +#include <fstream>
4 +#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
5 +#include <typeinfo>
6 +#include <set>
7 +#include <algorithm>
8 +#include <cstdio>
9 +#include <vector>
10 +
11 +using namespace std;
12 +using json = nlohmann::json;
13 +
14 +/*
15 +Create a .fasta file for each of the sequence inside the benchmark in json format.
16 +Also create a .dbn and .txt file that list the name, sequence, 2d structure and contacts for all sequence in the benchmark file.
17 +Those files are useful for the Isaure_benchmark.py script.
18 +*/
19 +void create_files(const string& jsonmotifs) {
20 + std::ifstream lib(jsonmotifs);
21 + string fasta = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/fasta/";
22 + string list = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.txt";
23 + string dbn = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn";
24 + std::ofstream outlist (list);
25 + std::ofstream outdbn (dbn);
26 + json js = json::parse(lib);
27 + uint count = 0;
28 +
29 + for (auto it = js.begin(); it != js.end(); ++it) {
30 + string id = it.key();
31 + string name, seq, contacts, structure;
32 + for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
33 + string chain = it2.key();
34 + if (chain.compare("pfams") != 0) {
35 + string name = id + "_" + chain;
36 + string filename = fasta + name + ".fa";
37 + std::ofstream outfasta (filename);
38 + outfasta << ">test_" << name << endl;
39 + for (auto it3 = js[id][chain].begin(); it3 != js[id][chain].end(); ++it3) {
40 + string field = it3.key();
41 + if (!field.compare("sequence")) {
42 + seq = it3.value();
43 + outfasta << seq.substr(0,seq.size()) << endl;
44 + outfasta.close();
45 +
46 + } else if (!field.compare("contacts")) {
47 + contacts = it3.value();
48 +
49 + } else if (!field.compare("struct2d")) {
50 + structure = it3.value();
51 + }
52 + }
53 + if(seq.find('&') == string::npos) {
54 + outlist << ">test_" << name << endl;
55 + outdbn << "test_" << name << "." << endl;
56 + outlist << contacts << endl;
57 + outdbn << seq << endl;
58 + outdbn << structure << endl;
59 + outdbn << contacts << endl;
60 + outlist << seq << endl;
61 + outlist << structure << endl;
62 + count++;
63 + }
64 + }
65 + }
66 + }
67 + cout << count << " sequences en tout" << endl;
68 + lib.close();
69 + outlist.close();
70 + outdbn.close();
71 +}
72 +
73 +int main()
74 +{
75 + string path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/";
76 + string jsonbm = path + "modules/ISAURE/benchmark_16-07-2021.json";
77 + create_files(jsonbm);
78 +
79 + return 0;
80 +}
81 +
1 +#include <iostream>
2 +#include <sstream>
3 +#include <fstream>
4 +#include "/local/local/BiorseoNath/cppsrc/json.hpp"
5 +#include <typeinfo>
6 +#include <set>
7 +#include <algorithm>
8 +#include <cstdio>
9 +#include <vector>
10 +#include <string>
11 +
12 +using namespace std;
13 +using json = nlohmann::json;
14 +
15 +/*
16 +This script is use to create a new motif library without a motif that contains the same pdb as the sequence used in input for prediction
17 +with BiORSEO.
18 +*/
19 +void delete_redundant_pdb(const string& jsonlibrary, const string& name, const string& jsonoutfile) {
20 + std::ifstream lib(jsonlibrary);
21 +
22 + std::ofstream outfile (jsonoutfile);
23 + json new_motif;
24 + json new_id;
25 + json js = json::parse(lib);
26 +
27 + for (auto it = js.begin(); it != js.end(); ++it) {
28 + string id = it.key();
29 + vector<string> list_pdbs;
30 + bool is_added = true;
31 +
32 + for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
33 + string field = it2.key();
34 +
35 + if (!field.compare("pdb")) {
36 + vector<string> tab = it2.value();
37 + list_pdbs = tab;
38 + } else {
39 + new_id[field] = it2.value();
40 + }
41 + }
42 +
43 + if (count(list_pdbs.begin(), list_pdbs.end(), name.substr(0, name.size()-2))) {
44 + is_added = false;
45 + }
46 + if (is_added) {
47 + new_id["pdb"] = list_pdbs;
48 + new_motif[id] = new_id;
49 + }
50 + new_id.clear();
51 + }
52 + outfile << new_motif.dump(4) << endl;
53 + outfile.close();
54 +}
55 +
56 +int main(int argc, char** argv)
57 +{
58 + string jsonlibrary = "/local/local/BiorseoNath/data/modules/ISAURE/motifs_final.json";
59 + string out = "/local/local/BiorseoNath/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json";
60 + string name = argv[1];
61 + delete_redundant_pdb(jsonlibrary, name, out);
62 + return 0;
63 +}
64 +
...@@ -28,17 +28,18 @@ ...@@ -28,17 +28,18 @@
28 from math import sqrt 28 from math import sqrt
29 import numpy as np 29 import numpy as np
30 import matplotlib.pyplot as plt 30 import matplotlib.pyplot as plt
31 -from matplotlib import cm 31 +from matplotlib import cm
32 import scipy.stats as st 32 import scipy.stats as st
33 import sys 33 import sys
34 import os 34 import os
35 import subprocess 35 import subprocess
36 import getopt 36 import getopt
37 37
38 +
38 class SecStruct: 39 class SecStruct:
39 def __init__(self, dot_bracket, obj1_value, obj2_value): 40 def __init__(self, dot_bracket, obj1_value, obj2_value):
40 self.dbn = dot_bracket 41 self.dbn = dot_bracket
41 - self.objectives = [ obj1_value, obj2_value ] 42 + self.objectives = [obj1_value, obj2_value]
42 self.basepair_list = self.get_basepairs() 43 self.basepair_list = self.get_basepairs()
43 self.length = len(dot_bracket) 44 self.length = len(dot_bracket)
44 45
...@@ -96,9 +97,9 @@ class SecStruct: ...@@ -96,9 +97,9 @@ class SecStruct:
96 tn = reference_structure.length * (reference_structure.length - 1) * 0.5 - fp - fn - tp 97 tn = reference_structure.length * (reference_structure.length - 1) * 0.5 - fp - fn - tp
97 98
98 # Compute MCC 99 # Compute MCC
99 - if (tp+fp == 0): 100 + if (tp + fp == 0):
100 print("We have an issue : no positives detected ! (linear structure)") 101 print("We have an issue : no positives detected ! (linear structure)")
101 - return (tp*tn-fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)) 102 + return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
102 103
103 104
104 class Pareto: 105 class Pareto:
...@@ -106,16 +107,16 @@ class Pareto: ...@@ -106,16 +107,16 @@ class Pareto:
106 self.predictions = list_of_structs 107 self.predictions = list_of_structs
107 self.true_structure = reference 108 self.true_structure = reference
108 self.n_pred = len(list_of_structs) 109 self.n_pred = len(list_of_structs)
109 - self.max_obj1 = max([ s.objectives[0] for s in self.predictions ]) 110 + self.max_obj1 = max([s.objectives[0] for s in self.predictions])
110 - self.max_obj2 = max([ s.objectives[1] for s in self.predictions ]) 111 + self.max_obj2 = max([s.objectives[1] for s in self.predictions])
111 self.index_of_best = self.find_best_solution() 112 self.index_of_best = self.find_best_solution()
112 - 113 +
113 def find_best_solution(self): 114 def find_best_solution(self):
114 # returns the index of the solution of the Pareto set which is the closest 115 # returns the index of the solution of the Pareto set which is the closest
115 # to the real 2D structure (the one with the max MCC) 116 # to the real 2D structure (the one with the max MCC)
116 max_i = -1 117 max_i = -1
117 max_mcc = -1 118 max_mcc = -1
118 - for i,s in enumerate(self.predictions): 119 + for i, s in enumerate(self.predictions):
119 mcc = s.get_MCC_with(self.true_structure) 120 mcc = s.get_MCC_with(self.true_structure)
120 if mcc > max_mcc: 121 if mcc > max_mcc:
121 max_mcc = mcc 122 max_mcc = mcc
...@@ -125,15 +126,15 @@ class Pareto: ...@@ -125,15 +126,15 @@ class Pareto:
125 def get_normalized_coords(self): 126 def get_normalized_coords(self):
126 # retrieves the objective values of the best solution and normlizes them 127 # retrieves the objective values of the best solution and normlizes them
127 coords = self.predictions[self.index_of_best].objectives 128 coords = self.predictions[self.index_of_best].objectives
128 - if self.max_obj1: # avoid divide by zero if all solutions are 0 129 + if self.max_obj1: # avoid divide by zero if all solutions are 0
129 - x = coords[0]/self.max_obj1 130 + x = coords[0] / self.max_obj1
130 else: 131 else:
131 x = 0.5 132 x = 0.5
132 - if self.max_obj2: # avoid divide by zero if all solutions are 0 133 + if self.max_obj2: # avoid divide by zero if all solutions are 0
133 - y = coords[1]/self.max_obj2 134 + y = coords[1] / self.max_obj2
134 else: 135 else:
135 y = 0.5 136 y = 0.5
136 - return ( x, y ) 137 + return (x, y)
137 138
138 139
139 class RNA: 140 class RNA:
...@@ -145,6 +146,8 @@ class RNA: ...@@ -145,6 +146,8 @@ class RNA:
145 146
146 147
147 ignored_nt_dict = {} 148 ignored_nt_dict = {}
149 +
150 +
148 def is_canonical_nts(seq): 151 def is_canonical_nts(seq):
149 for c in seq[:-1]: 152 for c in seq[:-1]:
150 if c not in "ACGU": 153 if c not in "ACGU":
...@@ -155,6 +158,7 @@ def is_canonical_nts(seq): ...@@ -155,6 +158,7 @@ def is_canonical_nts(seq):
155 return False 158 return False
156 return True 159 return True
157 160
161 +
158 def is_canonical_bps(struct): 162 def is_canonical_bps(struct):
159 if "()" in struct: 163 if "()" in struct:
160 return False 164 return False
...@@ -203,6 +207,7 @@ def load_from_dbn(file, header_style=3): ...@@ -203,6 +207,7 @@ def load_from_dbn(file, header_style=3):
203 db.close() 207 db.close()
204 return container, pkcounter 208 return container, pkcounter
205 209
210 +
206 def parse_biokop(folder, basename, ext=".biok"): 211 def parse_biokop(folder, basename, ext=".biok"):
207 solutions = [] 212 solutions = []
208 err = 0 213 err = 0
...@@ -243,6 +248,7 @@ def parse_biokop(folder, basename, ext=".biok"): ...@@ -243,6 +248,7 @@ def parse_biokop(folder, basename, ext=".biok"):
243 err = 1 248 err = 1
244 return None, err 249 return None, err
245 250
251 +
246 def parse_biorseo(folder, basename, ext): 252 def parse_biorseo(folder, basename, ext):
247 solutions = [] 253 solutions = []
248 err = 0 254 err = 0
...@@ -266,6 +272,7 @@ def parse_biorseo(folder, basename, ext): ...@@ -266,6 +272,7 @@ def parse_biorseo(folder, basename, ext):
266 err = 1 272 err = 1
267 return None, err 273 return None, err
268 274
275 +
269 def prettify_biorseo(code): 276 def prettify_biorseo(code):
270 name = "" 277 name = ""
271 if "bgsu" in code: 278 if "bgsu" in code:
...@@ -301,8 +308,8 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf ...@@ -301,8 +308,8 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf
301 print("[%s] Loaded %d solutions in a Pareto set, max(obj1)=%f, max(obj2)=%f" % (rna.basename_, pset.n_pred, pset.max_obj1, pset.max_obj2)) 308 print("[%s] Loaded %d solutions in a Pareto set, max(obj1)=%f, max(obj2)=%f" % (rna.basename_, pset.n_pred, pset.max_obj1, pset.max_obj2))
302 print("Loaded %d points on %d." % (len(points), len(RNAcontainer)-skipped)) 309 print("Loaded %d points on %d." % (len(points), len(RNAcontainer)-skipped))
303 310
304 - x = np.array([ p[0] for p in points ]) 311 + x = np.array([p[0] for p in points])
305 - y = np.array([ p[1] for p in points ]) 312 + y = np.array([p[1] for p in points])
306 xmin, xmax = 0, 1 313 xmin, xmax = 0, 1
307 ymin, ymax = 0, 1 314 ymin, ymax = 0, 1
308 xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] 315 xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
...@@ -316,19 +323,21 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf ...@@ -316,19 +323,21 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf
316 ax[pos].axvline(x=1, alpha=0.2, color='black') 323 ax[pos].axvline(x=1, alpha=0.2, color='black')
317 ax[pos].contourf(xx, yy, f, cmap=cm.Blues, alpha=0.5) 324 ax[pos].contourf(xx, yy, f, cmap=cm.Blues, alpha=0.5)
318 ax[pos].scatter(x, y, s=25, alpha=0.1) 325 ax[pos].scatter(x, y, s=25, alpha=0.1)
319 - ax[pos].set_xlim((-0.1,1.1)) 326 + ax[pos].set_xlim((-0.1, 1.1))
320 - ax[pos].set_ylim((-0.1,1.1)) 327 + ax[pos].set_ylim((-0.1, 1.1))
321 - ax[pos].annotate("("+str(len(points))+'/'+str(len(RNAcontainer)-skipped)+" RNAs)", (0.08,0.15)) 328 + ax[pos].set_title(prettify_biorseo(ext[1:]), fontsize=10)
329 + ax[pos].annotate("(" + str(len(points)) + '/' + str(len(RNAcontainer)-skipped) + " RNAs)", (0.08, 0.15))
322 ax[pos].set_xlabel(xlabel) 330 ax[pos].set_xlabel(xlabel)
323 ax[pos].set_ylabel(ylabel) 331 ax[pos].set_ylabel(ylabel)
324 332
325 if nsolutions: 333 if nsolutions:
326 - ax[pos+1].hist(sizes, bins=range(0, max(sizes)+1, 2), histtype='bar') 334 + ax[pos + 1].hist(sizes, bins=range(0, max(sizes) + 1, 2), histtype='bar')
327 - ax[pos+1].set_xlim((0,max(sizes)+2)) 335 + ax[pos + 1].set_xlim((0, max(sizes) + 2))
328 - ax[pos+1].set_xticks(range(0, max(sizes), 10)) 336 + ax[pos + 1].set_xticks(range(0, max(sizes), 10))
329 - ax[pos+1].set_xticklabels(range(0, max(sizes), 10), rotation=90) 337 + ax[pos + 1].set_xticklabels(range(0, max(sizes), 10), rotation=90)
330 - ax[pos+1].set_xlabel("# solutions") 338 + ax[pos + 1].set_xlabel("# solutions")
331 - ax[pos+1].set_ylabel("# RNAs") 339 + ax[pos + 1].set_ylabel("# RNAs")
340 +
332 341
333 if __name__ == "__main__": 342 if __name__ == "__main__":
334 try: 343 try:
......
This diff is collapsed. Click to expand it.
1 +#include <iostream>
2 +#include <sstream>
3 +#include <fstream>
4 +#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
5 +#include <typeinfo>
6 +#include <set>
7 +#include <algorithm>
8 +#include <cstdio>
9 +#include <vector>
10 +
11 +using namespace std;
12 +using json = nlohmann::json;
13 +
14 +/*
15 +This script removes from the library every pattern that matches ONLY the sequence it comes from (i.e. the sequence with the same PDB code).
16 +*/
17 +
// One benchmark entry (PDB code and full sequence), optionally paired with the
// library motif associated with that PDB code.
//
// NOTE(review): this file does `using namespace std;` and C++17 declares
// `std::data`, so an unqualified `data` may be reported as ambiguous by the
// compiler. Writing `::data` at the use sites avoids the clash; renaming the
// struct would be the cleaner long-term fix.
struct data {
    // PDB code, extracted from the header line of the benchmark entry.
    std::string pdb;
    // Complete sequence of the benchmark entry carrying this PDB code.
    std::string seq_pdb;
    // Identifier (key in the JSON library) of the motif associated with this PDB code.
    std::string id;
    // Sequence of that motif; its components are separated by '&'.
    std::string cmp;
};
30 +
31 +//returns the list of pdb codes and the corresponding information from the benchmark file.
32 +vector<data> get_list_pdb_benchmark(const string& benchmark) {
33 +
34 + fstream bm(benchmark);
35 + vector<data> list_pdb_seq;
36 + if (bm.is_open()) {
37 + string name;
38 + string sequence;
39 + string structure;
40 + string contacts;
41 +
42 + while (getline(bm, name)) {
43 + data d;
44 + int size = name.size();
45 + name = name.substr(5,size-6);
46 + getline(bm, sequence);
47 + d.pdb = name;
48 + d.seq_pdb = sequence;
49 + list_pdb_seq.push_back(d);
50 +
51 + getline(bm, structure);
52 + getline(bm, contacts);
53 + }
54 + bm.close();
55 + }
56 + return list_pdb_seq;
57 +}
58 +
// Returns `str` without its first and last character (used in this file to
// drop the quotes around a serialized JSON string).
// Strings shorter than two characters yield an empty string.
std::string trim(std::string str) {
    if (str.size() < 2) {
        // Nothing left once both ends are removed; substr(1, ...) would throw
        // std::out_of_range on an empty string.
        return std::string();
    }
    return str.substr(1, str.size() - 2);
}
64 +
65 +//store the corresponding id and motif to the sequence from the benchmark file
66 +data find_id_pattern(string& pdb_pattern, const string& benchmark) {
67 + vector<data> l = get_list_pdb_benchmark(benchmark);
68 + int size = l.size();
69 +
70 + for (data d : l) {
71 + string cmp = d.pdb;
72 + cmp = cmp.substr(0, d.pdb.size()-2);
73 + if (!cmp.compare(pdb_pattern)) {
74 + return d;
75 + }
76 + }
77 + return data();
78 +}
79 +
80 +//Create an array of data ('association'), which consists of each pdb of the benchmark file
81 +// with the associated pattern from this sequence.
82 +vector<data> find_id(const string& bibli, const string& benchmark) {
83 + ifstream lib(bibli);
84 + json js = json::parse(lib);
85 +
86 + //nam seq_bm et id seq_id
87 + vector<data> association;
88 +
89 + for (auto it = js.begin(); it != js.end(); ++it) {
90 + string id = it.key();
91 + data d;
92 +
93 + for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
94 + string field = it2.key();
95 + string seq;
96 + if (!field.compare("pdb")) {
97 + int n = js[id][field].size();
98 + for (int i = 0; i < n ; i++) {
99 + ostringstream stream;
100 + stream << js[id][field][i];
101 + string pdb = trim(stream.str());
102 +
103 + d = find_id_pattern(pdb, benchmark);
104 + }
105 + }
106 +
107 + if (!field.compare("sequence")) {
108 + seq = it2.value();
109 +
110 + if (!(d.pdb.empty())) {
111 + d.id = id;
112 + d.cmp = seq;
113 + association.push_back(d);
114 + }
115 + }
116 + }
117 + }
118 + lib.close();
119 + cout << association.size() << endl;
120 + return association;
121 +}
122 +
// Tells whether the motif `seq_motif` can be found in the complete sequence
// `seq`.
//
// `seq_motif` is made of one or more components separated by '&'. The motif
// matches when every component occurs in `seq`, in the given order and without
// overlapping the previous one. A motif without '&' is a plain substring
// search.
//
// Each component is placed at its leftmost possible position after the
// previous one, which is sufficient to decide whether an ordered placement
// exists.
bool does_it_match(const std::string& seq, const std::string& seq_motif) {
    std::size_t search_from = 0;      // first position of seq still available
    std::size_t component_begin = 0;  // start of the current component in seq_motif

    while (true) {
        const std::size_t separator = seq_motif.find('&', component_begin);
        const std::size_t length = (separator == std::string::npos)
                                       ? std::string::npos
                                       : separator - component_begin;
        const std::string component = seq_motif.substr(component_begin, length);

        const std::size_t hit = seq.find(component, search_from);
        if (hit == std::string::npos) {
            // Every component, including the last one, has to be found.
            return false;
        }
        // The next component must start after the end of this one.
        search_from = hit + component.size();

        if (separator == std::string::npos) {
            return true;
        }
        component_begin = separator + 1;
    }
}
161 +
162 +//return the list of motif id that didn't match with any other complete sequence than the one which it came from.
163 +vector<string> select_not_motif(const string& bibli, const string& benchmark) {
164 + vector<string> selection;
165 + vector<data> association = find_id(bibli, benchmark);
166 +
167 + for (data d : association) {
168 + selection.push_back(d.id);
169 + }
170 +
171 + for (data d : association) {
172 + for (data d2 : association) {
173 + string seq = d.seq_pdb;
174 + string seq2 = d2.cmp;
175 + bool test = false;
176 +
177 + if(d.pdb.substr(0, d.pdb.size()-2) != d2.pdb.substr(0, d2.pdb.size()-2)) {
178 + test = does_it_match(seq, seq2);
179 + if (test) {
180 + cout << "pdb: " << d.pdb << " vs " << d2.pdb << " " << d2.cmp << " " << d2.id << endl;
181 + auto position = find(selection.begin(), selection.end(), d.id);
182 + if (position != selection.end()) {
183 + int index = position - selection.begin();
184 + selection.erase(selection.begin() + index);
185 + }
186 + }
187 + }
188 + }
189 + }
190 + sort(selection.begin(), selection.end() );
191 + selection.erase(unique(selection.begin(), selection.end() ), selection.end() );
192 +
193 + cout << "size: " << selection.size() << endl;
194 +
195 + return selection;
196 +}
197 +
198 +int main()
199 +{
200 + string bibli = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json";
201 + string benchmark = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/benchmark.dbn";
202 +
203 + /*vector<data> v = get_list_pdb_benchmark(benchmark);
204 + for (data d : v) {
205 + cout << d.pdb << ", " << d.seq_pdb << endl;
206 + }*/
207 +
208 + /*string name = "1U6P_B";
209 + data d = find_id_pattern(name, benchmark);
210 + cout << "name: " << d.pdb << ", seq: " << d.seq_pdb << endl;*/
211 +
212 + /*vector<data> association = find_id(bibli, benchmark);
213 + for (data d : association) {
214 + cout << "<" << d.pdb << ", " << d.seq_pdb << ">, " << "<" << d.id << ", " << d.cmp << ">" << endl;
215 + }*/
216 +
217 + /*string seq = "UGCGCUUGGCGUUUUAGAGCUAGAAAUAGCAAGUUAAAAUAAGGCUAGUCCGUUAUCAACUUGAAAAAGUGGCACCGAGUCGGUGCUU";
218 + string seq_motif = "UGCGCUUGGCGUUUUAGAGC&GCAAGUUAAAAUAAGGCUAGUCCGUUAUCAA&UGGCACCGAGUCG&U";
219 + bool test = does_it_match(seq, seq_motif);
220 + cout << test << endl;*/
221 +
222 + vector<string> selection = select_not_motif(bibli, benchmark);
223 + for (string str : selection) {
224 + cout << str << ", ";
225 + }
226 + cout << endl;
227 +
228 + return 0;
229 +}
...\ No newline at end of file ...\ No newline at end of file
This diff is collapsed. Click to expand it.
1 +>test
2 +CCGGGACCUCUAACCGGGUUCCCGGGCAGUCACUG