Merge branch 'stage_NBernard' into 'master'
Stage n bernard results See merge request !1
Showing
75 changed files
with
805 additions
and
241 deletions
1 | -.vscode/* | ||
2 | .vscode | 1 | .vscode |
3 | 2 | ||
4 | -# LaTeX temporary files | ||
5 | -doc/*.toc | ||
6 | -doc/*.bbl | ||
7 | -doc/*.gz | ||
8 | -doc/*.log | ||
9 | -doc/*.aux | ||
10 | -doc/*.blg | ||
11 | -doc/*.fls | ||
12 | -doc/*.fdb_latexmk | ||
13 | - | ||
14 | # Docker installation temporary files | 3 | # Docker installation temporary files |
15 | eigen-eigen-323c052e1731 | 4 | eigen-eigen-323c052e1731 |
16 | cplex_installer_12.8_Student.bin | 5 | cplex_installer_12.8_Student.bin |
... | @@ -20,7 +9,6 @@ ViennaRNA-2.4.13 | ... | @@ -20,7 +9,6 @@ ViennaRNA-2.4.13 |
20 | 9 | ||
21 | # Compiled Object files | 10 | # Compiled Object files |
22 | obj/* | 11 | obj/* |
23 | -doc/*.pdf | ||
24 | data/modules/RIN/__pycache__ | 12 | data/modules/RIN/__pycache__ |
25 | 13 | ||
26 | # Executables | 14 | # Executables |
... | @@ -44,4 +32,4 @@ data/modules/RIN | ... | @@ -44,4 +32,4 @@ data/modules/RIN |
44 | data/modules/ISAURE | 32 | data/modules/ISAURE |
45 | data/sec_structs/bpRNA-1m_90.dbn | 33 | data/sec_structs/bpRNA-1m_90.dbn |
46 | data/sec_structs/pseudobase++.dbn | 34 | data/sec_structs/pseudobase++.dbn |
47 | - | 35 | +data/fasta/contacts | ... | ... |
This diff is collapsed. Click to expand it.
Isaure_benchmark.py
0 → 100644
This diff is collapsed. Click to expand it.
Makefile
100644 → 100755
... | @@ -9,7 +9,7 @@ CC = g++ | ... | @@ -9,7 +9,7 @@ CC = g++ |
9 | CFLAGS = -Icppsrc/ -I/usr/local/include -I$(CPLEX)/concert/include -I$(CPLEX)/cplex/include -g -O3 | 9 | CFLAGS = -Icppsrc/ -I/usr/local/include -I$(CPLEX)/concert/include -I$(CPLEX)/cplex/include -g -O3 |
10 | CXXFLAGS = --std=c++17 -Wall -Wpedantic -Wextra -Wno-deprecated-copy -Wno-ignored-attributes | 10 | CXXFLAGS = --std=c++17 -Wall -Wpedantic -Wextra -Wno-deprecated-copy -Wno-ignored-attributes |
11 | LINKER = g++ | 11 | LINKER = g++ |
12 | -LDFLAGS = -L$(CPLEX)/concert/lib/x86-64_linux/static_pic/ -L$(CPLEX)/cplex/lib/x86-64_linux/static_pic/ -lboost_system -lboost_filesystem -lboost_program_options -lgomp -lconcert -lilocplex -lcplex -lpthread -ldl -lRNA -lm | 12 | +LDFLAGS = -Wno-free-nonheap-object -L$(CPLEX)/concert/lib/x86-64_linux/static_pic/ -L$(CPLEX)/cplex/lib/x86-64_linux/static_pic/ -lboost_system -lboost_filesystem -lboost_program_options -lgomp -lconcert -lilocplex -lcplex -lpthread -ldl -lRNA -lm |
13 | 13 | ||
14 | # change these to proper directories where each file should be | 14 | # change these to proper directories where each file should be |
15 | SRCDIR = cppsrc | 15 | SRCDIR = cppsrc |
... | @@ -31,20 +31,8 @@ $(OBJECTS): $(OBJDIR)/%.o : $(SRCDIR)/%.cpp $(INCLUDES) | ... | @@ -31,20 +31,8 @@ $(OBJECTS): $(OBJDIR)/%.o : $(SRCDIR)/%.cpp $(INCLUDES) |
31 | $(CC) -c $(CFLAGS) $(CXXFLAGS) $< -o $@ | 31 | $(CC) -c $(CFLAGS) $(CXXFLAGS) $< -o $@ |
32 | @echo -e "\033[00;32mCompiled "$<".\033[00m" | 32 | @echo -e "\033[00;32mCompiled "$<".\033[00m" |
33 | 33 | ||
34 | -doc: mainpdf supppdf | ||
35 | - @echo -e "\033[00;32mLaTeX documentation rendered.\033[00m" | ||
36 | - | ||
37 | -mainpdf: doc/main_bioinformatics.tex doc/references.bib doc/bioinfo.cls doc/natbib.bst | ||
38 | - cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics | ||
39 | - cd doc; bibtex main_bioinformatics | ||
40 | - cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics | ||
41 | - cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics | ||
42 | - | ||
43 | -supppdf: doc/supplementary_material.tex | ||
44 | - cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error supplementary_material | ||
45 | - | ||
46 | .PHONY: all | 34 | .PHONY: all |
47 | -all: $(BINDIR)/$(TARGET) doc | 35 | +all: $(BINDIR)/$(TARGET) |
48 | 36 | ||
49 | .PHONY: re | 37 | .PHONY: re |
50 | re: remove clean all | 38 | re: remove clean all | ... | ... |
... | @@ -19,6 +19,7 @@ THEN | ... | @@ -19,6 +19,7 @@ THEN |
19 | OUTPUT: | 19 | OUTPUT: |
20 | - A set of secondary structures from the Pareto front, | 20 | - A set of secondary structures from the Pareto front, |
21 | - The list of known modules inserted inplace in the corresponding structures | 21 | - The list of known modules inserted inplace in the corresponding structures |
22 | +- A set of positions of the nucleotides in contact with the protein represented by asterisks (only if the motifs_28-05-2021.json library is used!) | ||
22 | 23 | ||
23 | 2/ The different models | 24 | 2/ The different models |
24 | ================================== | 25 | ================================== |
... | @@ -28,7 +29,8 @@ Biorseo can be used with two modules datasets (yet): | ... | @@ -28,7 +29,8 @@ Biorseo can be used with two modules datasets (yet): |
28 | * Rna3Dmotifs (from the work of *Djelloul & Denise, 2008*) | 29 | * Rna3Dmotifs (from the work of *Djelloul & Denise, 2008*) |
29 | * The RNA 3D Motif Atlas of BGSU's RNA lab (*Petrov et al, 2013*, see http://rna.bgsu.edu/rna3dhub/motifs/) | 30 | * The RNA 3D Motif Atlas of BGSU's RNA lab (*Petrov et al, 2013*, see http://rna.bgsu.edu/rna3dhub/motifs/) |
30 | * CaRNAval 1.0 (*Reinhartz et al, 2018*) | 31 | * CaRNAval 1.0 (*Reinhartz et al, 2018*) |
31 | -* RNA-Bricks 2, RNAMC, CaRNAval 2.0, and others could theoretically be used, but are not supported (yet). You might write your own API. | 32 | +* /data/modules/ISAURE/motifs_28-05-2021.json a library of motifs from RNA linked to a protein from Isaure Chauvot de Beauchêne of LORIA laboratory |
33 | + (contact:isaure.chauvot-de-beauchene@loria.fr) | ||
32 | 34 | ||
33 | PATTERN MATCHING STEP | 35 | PATTERN MATCHING STEP |
34 | - Use **simple pattern matching**. Rna3Dmotifs modules are available with sequence information. We use regular expressions to find those known loops in your query. This is the approach of RNA-MoIP (*Reinharz et al, 2012*), we deal the same way with short components and wildcards. | 36 | - Use **simple pattern matching**. Rna3Dmotifs modules are available with sequence information. We use regular expressions to find those known loops in your query. This is the approach of RNA-MoIP (*Reinharz et al, 2012*), we deal the same way with short components and wildcards. |
... | @@ -43,6 +45,8 @@ OBJECTIVE FUNCTIONS FOR THE MODULE INSERTION CRITERIA | ... | @@ -43,6 +45,8 @@ OBJECTIVE FUNCTIONS FOR THE MODULE INSERTION CRITERIA |
43 | * **Function B** : weights a module by its number of components (strands) and penalizes it by the log^(_2) of its nucleotide size. | 45 | * **Function B** : weights a module by its number of components (strands) and penalizes it by the log^(_2) of its nucleotide size. |
44 | * **Function C** : weights a module by its insertion site score (JAR3D or BayesPairing score). | 46 | * **Function C** : weights a module by its insertion site score (JAR3D or BayesPairing score). |
45 | * **Function D** : weights a module by its number of components (strands) and insertion site score (JAR3D or BayesPairing score), and penalizes it by the log^(_2) of its nucleotide size. | 47 | * **Function D** : weights a module by its number of components (strands) and insertion site score (JAR3D or BayesPairing score), and penalizes it by the log^(_2) of its nucleotide size. |
48 | +* **Function E** : weights a module by its nucleotides in contact with a protein, number of occurrences and number of nucleotides in the module. | ||
49 | +* **Function F** : weights a module by its nucleotides in contact with a protein, number of occurrences and number of nucleotides along the entire length of the RNA. | ||
46 | 50 | ||
47 | 3/ Installation | 51 | 3/ Installation |
48 | ================================== | 52 | ================================== |
... | @@ -55,22 +59,22 @@ Check the file [INSTALL.md](INSTALL.md) for installation instructions. | ... | @@ -55,22 +59,22 @@ Check the file [INSTALL.md](INSTALL.md) for installation instructions. |
55 | 59 | ||
56 | - If you **might expect a pseudoknot, or don't know**: | 60 | - If you **might expect a pseudoknot, or don't know**: |
57 | * The most promising method is the use of direct pattern matching with Rna3Dmotifs and function A. But this method is sometimes subject to combinatorial explosion issues. If you have a long RNA or a large number of loops, don't use it. Example: | 61 | * The most promising method is the use of direct pattern matching with Rna3Dmotifs and function A. But this method is sometimes subject to combinatorial explosion issues. If you have a long RNA or a large number of loops, don't use it. Example: |
58 | - `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --rna3dmotifs --patternmatch --func A` | 62 | + `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --rna3dmotifs --patternmatch --func A --MEA` |
59 | 63 | ||
60 | * The use of the RNA 3D Motif Atlas placed by JAR3D and scored with function A is not subject to combinatorial issues, but performs a bit worse. It also returns less solutions. Example: | 64 | * The use of the RNA 3D Motif Atlas placed by JAR3D and scored with function A is not subject to combinatorial issues, but performs a bit worse. It also returns less solutions. Example: |
61 | - `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --3dmotifatlas --jar3d --func A | 65 | + `./biorseo.py -i PDB_00304.fa -O resultsFolder/ --3dmotifatlas --jar3d --func A --MEA |
62 | 66 | ||
63 | 5/ List of Options | 67 | 5/ List of Options |
64 | ================================== | 68 | ================================== |
65 | ``` | 69 | ``` |
66 | Usage: You must provide: | 70 | Usage: You must provide: |
67 | 1) a FASTA input file with -i, | 71 | 1) a FASTA input file with -i, |
68 | - 2) a module type with --rna3dmotifs, --carnaval or --3dmotifatlas | 72 | + 2) a module type with --rna3dmotifs, --carnaval, --3dmotifatlas or --contacts |
69 | 3) one module placement method in { --patternmatch, --jar3d, --bayespairing } | 73 | 3) one module placement method in { --patternmatch, --jar3d, --bayespairing } |
70 | - 4) one scoring function with --func A, B, C or D | 74 | + 4) one scoring function with --func A, B, C, D, E or F |
71 | - | 75 | + 5) one estimator between --MEA and --MFE |
72 | If you are not using the Docker image: | 76 | If you are not using the Docker image: |
73 | - 5) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir) | 77 | + 6) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir) |
74 | 78 | ||
75 | Options: | 79 | Options: |
76 | -h [ --help ] Print this help message | 80 | -h [ --help ] Print this help message |
... | @@ -79,16 +83,21 @@ Options: | ... | @@ -79,16 +83,21 @@ Options: |
79 | --rna3dmotifs Use DESC modules from Djelloul & Denise, 2008 | 83 | --rna3dmotifs Use DESC modules from Djelloul & Denise, 2008 |
80 | --carnaval Use RIN modules from Reinharz & al, 2018 | 84 | --carnaval Use RIN modules from Reinharz & al, 2018 |
81 | --3dmotifatlas Use the HL and IL loops from BGSU's 3D Motif Atlas (updated) | 85 | --3dmotifatlas Use the HL and IL loops from BGSU's 3D Motif Atlas (updated) |
86 | +--contacts Use the library of motifs, created from RNA sequences linked to proteins provided by I. Chauvot de Beauchene of LORIA laboratory | ||
82 | -p [ --patternmatch ] Use regular expressions to place modules in the sequence (requires --rna3dmotifs or --carnaval) | 87 | -p [ --patternmatch ] Use regular expressions to place modules in the sequence (requires --rna3dmotifs or --carnaval) |
83 | -j [ --jar3d ] Use JAR3D to place modules in the sequence (requires --3dmotifatlas) | 88 | -j [ --jar3d ] Use JAR3D to place modules in the sequence (requires --3dmotifatlas) |
84 | -b [ --bayespairing ] Use BayesPairing2 to place modules in the sequence (requires --rna3dmotifs or --3dmotifatlas) | 89 | -b [ --bayespairing ] Use BayesPairing2 to place modules in the sequence (requires --rna3dmotifs or --3dmotifatlas) |
85 | -o [ --output=… ] File to summarize the results | 90 | -o [ --output=… ] File to summarize the results |
86 | -O [ --outputf=… ] Folder where to output result and temp files | 91 | -O [ --outputf=… ] Folder where to output result and temp files |
87 | --f [ --func=… ] (A, B, C or D, default is B) Objective function to score module insertions: | 92 | +-f [ --func=… ] (A, B, C, D, E or F, default is B) Objective function to score module insertions: |
88 | (A) insert big modules (B) insert light, high-order modules | 93 | (A) insert big modules (B) insert light, high-order modules |
89 | - (c) insert modules which score well with the sequence | 94 | + (C) insert modules which score well with the sequence |
90 | (D) insert light, high-order modules which score well with the sequence. | 95 | (D) insert light, high-order modules which score well with the sequence. |
91 | - C and D require cannot be used with --patternmatch. | 96 | + C and D cannot be used with --patternmatch. |
97 | + (E) and (F) insert modules with a lot of nucleotides and a lot of nucleotides in contact with a protein, and a huge number of occurrences. | ||
98 | + (E) maximizes the number of contact nucleotides inside the module, while (F) maximizes the number of contact nucleotides along the entire length of the RNA. | ||
99 | +--MEA Use Maximum Expected Accuracy for the second objective | ||
100 | +--MFE Use Minimum Free Energy based on the formula of (*Legendre et al., 2018*) for the second objective | ||
92 | -c [ --first-objective=… ] (default 1) Objective to solve in the mono-objective portions of the algorithm. | 101 | -c [ --first-objective=… ] (default 1) Objective to solve in the mono-objective portions of the algorithm. |
93 | (1) is the module objective given by --func, (2) is the expected accuracy of the structure. | 102 | (1) is the module objective given by --func, (2) is the expected accuracy of the structure. |
94 | -l [ --limit=… ] (default 500) Number of solutions in the Pareto set from which | 103 | -l [ --limit=… ] (default 500) Number of solutions in the Pareto set from which |
... | @@ -113,9 +122,9 @@ Options: | ... | @@ -113,9 +122,9 @@ Options: |
113 | BiORSEO from outside the docker image. Use the FULL path. | 122 | BiORSEO from outside the docker image. Use the FULL path. |
114 | 123 | ||
115 | Examples: | 124 | Examples: |
116 | -biorseo.py -i myRNA.fa -O myResultsFolder/ --rna3dmotifs --patternmatch --func B | 125 | +biorseo.py -i myRNA.fa -O myResultsFolder/ --rna3dmotifs --patternmatch --func B --MEA |
117 | -biorseo.py -i myRNA.fa -O myResultsFolder/ --3dmotifatlas --jar3d --func B -l 800 | 126 | +biorseo.py -i myRNA.fa -O myResultsFolder/ --3dmotifatlas --jar3d --func B -l 800 --MEA |
118 | -biorseo.py -i myRNA.fa -v --3dmotifatlas --bayespairing --func D | 127 | +biorseo.py -i myRNA.fa -v --3dmotifatlas --bayespairing --func D --MEA |
119 | 128 | ||
120 | The allowed module/placement-method/function combinations are: | 129 | The allowed module/placement-method/function combinations are: |
121 | 130 | ||
... | @@ -123,5 +132,6 @@ The allowed module/placement-method/function combinations are: | ... | @@ -123,5 +132,6 @@ The allowed module/placement-method/function combinations are: |
123 | --rna3dmotifs A. B. A. B. C. D. | 132 | --rna3dmotifs A. B. A. B. C. D. |
124 | --3dmotifatlas A. B. C. D. A. B. C. D. | 133 | --3dmotifatlas A. B. C. D. A. B. C. D. |
125 | --carnaval A. B. | 134 | --carnaval A. B. |
135 | +--contacts E. F. | ||
126 | 136 | ||
127 | ``` | 137 | ``` | ... | ... |
... | @@ -29,11 +29,11 @@ import pickle | ... | @@ -29,11 +29,11 @@ import pickle |
29 | # ================== DEFINITION OF THE PATHS ============================== | 29 | # ================== DEFINITION OF THE PATHS ============================== |
30 | 30 | ||
31 | biorseoDir = path.realpath(".") | 31 | biorseoDir = path.realpath(".") |
32 | -jar3dexec = "/home/persalteas/Software/jar3dbin/jar3d_2014-12-11.jar" | 32 | +jar3dexec = "/local/local/localopt/jar3d_2014-12-11.jar" |
33 | bypdir = biorseoDir + "/BayesPairing/bayespairing/src" | 33 | bypdir = biorseoDir + "/BayesPairing/bayespairing/src" |
34 | byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src" | 34 | byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src" |
35 | -moipdir = "/home/persalteas/Software/RNAMoIP/Src/RNAMoIP.py" | 35 | +moipdir = "/local/local/localopt/RNAMoIP/Src/RNAMoIP.py" |
36 | -biokopdir = "/home/persalteas/Software/biokop/biokop" | 36 | +biokopdir = "/local/local/localopt/biokop/biokop" |
37 | runDir = path.dirname(path.realpath(__file__)) | 37 | runDir = path.dirname(path.realpath(__file__)) |
38 | bpRNAFile = argv[1] | 38 | bpRNAFile = argv[1] |
39 | PseudobaseFile = argv[2] | 39 | PseudobaseFile = argv[2] |
... | @@ -1109,8 +1109,11 @@ def load_from_dbn(file, header_style=3): | ... | @@ -1109,8 +1109,11 @@ def load_from_dbn(file, header_style=3): |
1109 | if not '(' in struct: | 1109 | if not '(' in struct: |
1110 | continue # ignore linear structures | 1110 | continue # ignore linear structures |
1111 | if is_canonical_nts(seq) and is_canonical_bps(struct): | 1111 | if is_canonical_nts(seq) and is_canonical_bps(struct): |
1112 | + # keeps what's inside brackets at the end as the filename | ||
1112 | if header_style == 1: container.append(RNA(header.replace('/', '_').split('(')[-1][:-1], header, seq, struct)) | 1113 | if header_style == 1: container.append(RNA(header.replace('/', '_').split('(')[-1][:-1], header, seq, struct)) |
1114 | + # keeps what's inside square brackets at the end as the filename | ||
1113 | if header_style == 2: container.append(RNA(header.replace('/', '_').split('[')[-1][:-41], header, seq, struct)) | 1115 | if header_style == 2: container.append(RNA(header.replace('/', '_').split('[')[-1][:-41], header, seq, struct)) |
1116 | + # keeps all the header as filename | ||
1114 | if header_style == 3: container.append(RNA(header[1:], header, seq, struct)) | 1117 | if header_style == 3: container.append(RNA(header[1:], header, seq, struct)) |
1115 | if '[' in struct: counter += 1 | 1118 | if '[' in struct: counter += 1 |
1116 | db.close() | 1119 | db.close() |
... | @@ -1475,8 +1478,8 @@ def print_StudyCase_results(): | ... | @@ -1475,8 +1478,8 @@ def print_StudyCase_results(): |
1475 | if __name__ == '__main__': | 1478 | if __name__ == '__main__': |
1476 | 1479 | ||
1477 | print("> Loading files...", flush=True) | 1480 | print("> Loading files...", flush=True) |
1478 | - bpRNAContainer, bpRNA_pk_counter = load_from_dbn(bpRNAFile) | 1481 | + bpRNAContainer, bpRNA_pk_counter = load_from_dbn(bpRNAFile, header_style=1) |
1479 | - PseudobaseContainer, Pseudobase_pk_counter = load_from_dbn(PseudobaseFile) | 1482 | + PseudobaseContainer, Pseudobase_pk_counter = load_from_dbn(PseudobaseFile, header_style=3) |
1480 | StudycaseContainer, StudyCase_pk_counter = load_from_dbn(StudyCaseFile, header_style=1) | 1483 | StudycaseContainer, StudyCase_pk_counter = load_from_dbn(StudyCaseFile, header_style=1) |
1481 | 1484 | ||
1482 | for nt, number in ignored_nt_dict.items(): | 1485 | for nt, number in ignored_nt_dict.items(): | ... | ... |
biorseo.py
deleted
100755 → 0
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
... | @@ -37,6 +37,7 @@ class MOIP | ... | @@ -37,6 +37,7 @@ class MOIP |
37 | void forbid_solutions_between(double min, double max); | 37 | void forbid_solutions_between(double min, double max); |
38 | IloEnv& get_env(void); | 38 | IloEnv& get_env(void); |
39 | static char obj_function_nbr_; // On what criteria do you want to insert motifs ? | 39 | static char obj_function_nbr_; // On what criteria do you want to insert motifs ? |
40 | + static char obj_function2_nbr_; // Do you want to use MEA or MFE to determine the best energy score ? | ||
40 | static uint obj_to_solve_; // What objective do you prefer to solve in mono-objective portions of the algorithm ? | 41 | static uint obj_to_solve_; // What objective do you prefer to solve in mono-objective portions of the algorithm ? |
41 | static double precision_; // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =( | 42 | static double precision_; // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =( |
42 | static bool allow_pk_; // Whether we forbid pseudoknots (false) or allow them (true) | 43 | static bool allow_pk_; // Whether we forbid pseudoknots (false) or allow them (true) |
... | @@ -47,8 +48,12 @@ class MOIP | ... | @@ -47,8 +48,12 @@ class MOIP |
47 | void define_problem_constraints(string& source); | 48 | void define_problem_constraints(string& source); |
48 | size_t get_yuv_index(size_t u, size_t v) const; | 49 | size_t get_yuv_index(size_t u, size_t v) const; |
49 | size_t get_Cpxi_index(size_t x_i, size_t i_on_j) const; | 50 | size_t get_Cpxi_index(size_t x_i, size_t i_on_j) const; |
51 | + size_t get_xij_index(size_t u, size_t v) const; | ||
52 | + | ||
50 | IloNumExprArg& y(size_t u, size_t v); // Direct reference to y^u_v in basepair_dv_ | 53 | IloNumExprArg& y(size_t u, size_t v); // Direct reference to y^u_v in basepair_dv_ |
51 | IloNumExprArg& C(size_t x, size_t i); // Direct reference to C_p^xi in insertion_dv_ | 54 | IloNumExprArg& C(size_t x, size_t i); // Direct reference to C_p^xi in insertion_dv_ |
55 | + IloNumExprArg& x(size_t u, size_t v); // Direct reference to x_i,j in stacks_dv_ | ||
56 | + | ||
52 | bool exists_vertical_outdated_labels(const SecondaryStructure& s) const; | 57 | bool exists_vertical_outdated_labels(const SecondaryStructure& s) const; |
53 | bool exists_horizontal_outdated_labels(const SecondaryStructure& s) const; | 58 | bool exists_horizontal_outdated_labels(const SecondaryStructure& s) const; |
54 | void allowed_motifs_from_desc(args_of_parallel_func arg_struct); | 59 | void allowed_motifs_from_desc(args_of_parallel_func arg_struct); |
... | @@ -66,12 +71,16 @@ class MOIP | ... | @@ -66,12 +71,16 @@ class MOIP |
66 | IloEnv env_; // environment CPLEX object | 71 | IloEnv env_; // environment CPLEX object |
67 | IloNumVarArray basepair_dv_; // Decision variables | 72 | IloNumVarArray basepair_dv_; // Decision variables |
68 | IloNumVarArray insertion_dv_; // Decision variables | 73 | IloNumVarArray insertion_dv_; // Decision variables |
74 | + IloNumVarArray stacks_dv_; // Decision variables | ||
75 | + | ||
69 | IloModel model_; // Solver for objective 1 | 76 | IloModel model_; // Solver for objective 1 |
70 | IloExpr obj1; // Objective function that counts inserted motifs | 77 | IloExpr obj1; // Objective function that counts inserted motifs |
71 | IloExpr obj2; // Objective function of expected accuracy | 78 | IloExpr obj2; // Objective function of expected accuracy |
72 | vector<vector<size_t>> index_of_Cxip_; // Stores the indexes of the Cxip in insertion_dv_ | 79 | vector<vector<size_t>> index_of_Cxip_; // Stores the indexes of the Cxip in insertion_dv_ |
73 | vector<size_t> index_of_first_components; // Stores the indexes of Cx1p in insertion_dv_ | 80 | vector<size_t> index_of_first_components; // Stores the indexes of Cx1p in insertion_dv_ |
74 | vector<vector<size_t>> index_of_yuv_; // Stores the indexes of the y^u_v in basepair_dv_ | 81 | vector<vector<size_t>> index_of_yuv_; // Stores the indexes of the y^u_v in basepair_dv_ |
82 | + | ||
83 | + vector<vector<size_t>> index_of_xij_; //Stores the indexes of the xij variables (BioKop) in stacks_dv_ | ||
75 | }; | 84 | }; |
76 | 85 | ||
77 | inline uint MOIP::get_n_solutions(void) const { return pareto_.size(); } | 86 | inline uint MOIP::get_n_solutions(void) const { return pareto_.size(); } |
... | @@ -79,6 +88,8 @@ inline uint MOIP::get_n_candidates(void) const { return ins | ... | @@ -79,6 +88,8 @@ inline uint MOIP::get_n_candidates(void) const { return ins |
79 | inline const SecondaryStructure& MOIP::solution(uint i) const { return pareto_[i]; } | 88 | inline const SecondaryStructure& MOIP::solution(uint i) const { return pareto_[i]; } |
80 | inline IloNumExprArg& MOIP::y(size_t u, size_t v) { return basepair_dv_[get_yuv_index(u, v)]; } | 89 | inline IloNumExprArg& MOIP::y(size_t u, size_t v) { return basepair_dv_[get_yuv_index(u, v)]; } |
81 | inline IloNumExprArg& MOIP::C(size_t x, size_t i) { return insertion_dv_[get_Cpxi_index(x, i)]; } | 90 | inline IloNumExprArg& MOIP::C(size_t x, size_t i) { return insertion_dv_[get_Cpxi_index(x, i)]; } |
91 | +inline IloNumExprArg& MOIP::x(size_t u, size_t v) { return stacks_dv_[get_xij_index(u, v)]; } | ||
92 | + | ||
82 | inline SecondaryStructure MOIP::solve_objective(int o) { return solve_objective(o, 0, rna_.get_RNA_length()); } | 93 | inline SecondaryStructure MOIP::solve_objective(int o) { return solve_objective(o, 0, rna_.get_RNA_length()); } |
83 | inline IloEnv& MOIP::get_env(void) { return env_; } | 94 | inline IloEnv& MOIP::get_env(void) { return env_; } |
84 | 95 | ... | ... |
This diff is collapsed. Click to expand it.
... | @@ -20,13 +20,7 @@ typedef struct Comp_ { | ... | @@ -20,13 +20,7 @@ typedef struct Comp_ { |
20 | pair<uint, uint> pos; | 20 | pair<uint, uint> pos; |
21 | size_t k; | 21 | size_t k; |
22 | string seq_; | 22 | string seq_; |
23 | - uint nb_pairing; | ||
24 | Comp_(pair<int, int> p) : pos(p) { k = 1 + pos.second - pos.first; } | 23 | Comp_(pair<int, int> p) : pos(p) { k = 1 + pos.second - pos.first; } |
25 | - Comp_(pair<int, int> p, uint nb_pair) : pos(p) | ||
26 | - { | ||
27 | - k = 1 + pos.second - pos.first; | ||
28 | - nb_pairing = nb_pair; | ||
29 | - } | ||
30 | Comp_(uint start, uint length) : k(length) | 24 | Comp_(uint start, uint length) : k(length) |
31 | { | 25 | { |
32 | pos.first = start; | 26 | pos.first = start; |
... | @@ -64,6 +58,7 @@ class Motif | ... | @@ -64,6 +58,7 @@ class Motif |
64 | string get_identifier(void) const; | 58 | string get_identifier(void) const; |
65 | vector<Component> comp; | 59 | vector<Component> comp; |
66 | vector<Link> links_; | 60 | vector<Link> links_; |
61 | + vector<uint> pos_contacts; | ||
67 | 62 | ||
68 | size_t contact_; | 63 | size_t contact_; |
69 | double tx_occurrences_; | 64 | double tx_occurrences_; |
... | @@ -89,7 +84,19 @@ vector<Motif> load_csv(const string& path); | ... | @@ -89,7 +84,19 @@ vector<Motif> load_csv(const string& path); |
89 | vector<Motif> load_json_folder(const string& path, const string& rna, bool verbose); | 84 | vector<Motif> load_json_folder(const string& path, const string& rna, bool verbose); |
90 | 85 | ||
91 | vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string>& vc); | 86 | vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string>& vc); |
92 | -vector<vector<Component>> json_find_next_ones_in(string rna, uint offset, vector<string>& vc, vector<string>& vs); | 87 | +vector<vector<Component>> json_find_next_ones_in(string rna, uint offset, vector<string>& vc); |
88 | + | ||
89 | +// utilities for Json motifs | ||
90 | +size_t count_nucleotide(string&); | ||
91 | +size_t count_delimiter(string&); | ||
92 | +size_t count_contacts(string&); | ||
93 | +string check_motif_sequence(string); | ||
94 | +bool checkSecondaryStructure(string); | ||
95 | +vector<Link> build_motif_pairs(string&, vector<Component>&); | ||
96 | +uint find_max_occurrences(string&); | ||
97 | +uint find_max_sequence(string&); | ||
98 | +vector<string> find_components(string&, string); | ||
99 | +vector<uint> find_contacts(vector<string>&, vector<Component>&); | ||
93 | 100 | ||
94 | // utilities to compare secondary structures: | 101 | // utilities to compare secondary structures: |
95 | bool operator==(const Motif& m1, const Motif& m2); | 102 | bool operator==(const Motif& m1, const Motif& m2); | ... | ... |
cppsrc/Scripts/count_pattern.cpp
deleted
100644 → 0
This diff is collapsed. Click to expand it.
cppsrc/Scripts/delete_same_pdb.cpp
deleted
100644 → 0
1 | -#include <iostream> | ||
2 | -#include <sstream> | ||
3 | -#include <fstream> | ||
4 | -#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp" | ||
5 | -#include <typeinfo> | ||
6 | -#include <set> | ||
7 | -#include <algorithm> | ||
8 | -#include <cstdio> | ||
9 | -#include <vector> | ||
10 | - | ||
11 | -using namespace std; | ||
12 | -using json = nlohmann::json; | ||
13 | - | ||
14 | -void delete_redundant_pdb(const string& jsonfile, const string& jsontest, const string& jsonoutfile) { | ||
15 | - std::ifstream lib(jsonfile); | ||
16 | - std::ifstream lib2(jsontest); | ||
17 | - | ||
18 | - std::ofstream outfile (jsonoutfile); | ||
19 | - json new_motif; | ||
20 | - json new_id; | ||
21 | - json js = json::parse(lib); | ||
22 | - json js2 = json::parse(lib2); | ||
23 | - | ||
24 | - //the list of pfam lists of the motif we want to count the inclusion in other motif | ||
25 | - for (auto it = js.begin(); it != js.end(); ++it) { | ||
26 | - string id = it.key(); | ||
27 | - vector<string> list_pdbs; | ||
28 | - vector<string> list_pdbs2; | ||
29 | - bool is_added = true; | ||
30 | - | ||
31 | - //cout << "id: " << id << endl; | ||
32 | - for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) { | ||
33 | - string test = it2.key(); | ||
34 | - | ||
35 | - if (!test.compare("pdb")) { | ||
36 | - vector<string> tab = it2.value(); | ||
37 | - list_pdbs = tab; | ||
38 | - /*set<set<string>>::iterator iit; | ||
39 | - set<string>::iterator iit2; | ||
40 | - for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) { | ||
41 | - for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) { | ||
42 | - cout << *iit2 << endl; | ||
43 | - } | ||
44 | - cout << endl << endl; | ||
45 | - }*/ | ||
46 | - } else { | ||
47 | - new_id[test] = it2.value(); | ||
48 | - } | ||
49 | - } | ||
50 | - //cout << "-------begin---------" << endl; | ||
51 | - | ||
52 | - for (auto it3 = js2.begin(); it3 != js2.end(); ++it3) { | ||
53 | - string id2 = it3.key(); | ||
54 | - | ||
55 | - //cout << "id: " << id << " / id2: " << id2 << endl; | ||
56 | - for (auto it4 = js[id2].begin(); it4 != js[id2].end(); ++it4) { | ||
57 | - string test = it4.key(); | ||
58 | - | ||
59 | - if (!test.compare("pdb")) { | ||
60 | - vector<string> tab = it4.value(); | ||
61 | - list_pdbs2 = tab; | ||
62 | - | ||
63 | - //cout << id << " / " << id2 << endl; | ||
64 | - for (uint k = 0; k < list_pdbs2.size(); k++) { | ||
65 | - if (count(list_pdbs.begin(), list_pdbs.end(), list_pdbs2[k])) { | ||
66 | - is_added = false; | ||
67 | - } | ||
68 | - //cout << list_pdbs2[k] << endl; | ||
69 | - } | ||
70 | - | ||
71 | - } | ||
72 | - | ||
73 | - } | ||
74 | - //cout << endl;*/ | ||
75 | - } | ||
76 | - | ||
77 | - | ||
78 | - /*for(uint ii = 0; ii < list_pfams.size(); ii++) { | ||
79 | - for (uint jj = 0; jj < list_pfams[ii].size(); jj++) { | ||
80 | - cout << "[" << ii << "][" << jj << "]: " << list_pfams[ii][jj] << endl; | ||
81 | - } | ||
82 | - }*/ | ||
83 | - if (is_added) { | ||
84 | - new_id["pdb"] = list_pdbs; | ||
85 | - new_motif[id] = new_id; | ||
86 | - } | ||
87 | - new_id.clear(); | ||
88 | - //cout << "valeur: " << ite << endl; | ||
89 | - /*for (uint i = 0; i < tab_struc.size() ; i++) { | ||
90 | - cout << "tab_struc[" << i << "]: " << tab_struc[i] << endl << endl; | ||
91 | - } */ | ||
92 | - } | ||
93 | - outfile << new_motif.dump(4) << endl; | ||
94 | - outfile.close(); | ||
95 | -} | ||
96 | - | ||
97 | -int main() | ||
98 | -{ | ||
99 | - string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/bibli_test2.json"; | ||
100 | - string jsontest = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark_test.json"; | ||
101 | - string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_derniere_version/motifs_final_test.json"; | ||
102 | - delete_redundant_pdb(jsonfile, jsontest, out); | ||
103 | - return 0; | ||
104 | -} | ||
105 | - |
... | @@ -3,11 +3,13 @@ | ... | @@ -3,11 +3,13 @@ |
3 | #include <algorithm> | 3 | #include <algorithm> |
4 | #include <boost/format.hpp> | 4 | #include <boost/format.hpp> |
5 | 5 | ||
6 | +#define RESET "\033[0m" | ||
7 | +#define RED "\033[31m" /* Red */ | ||
8 | + | ||
6 | using std::abs; | 9 | using std::abs; |
7 | using std::cout; | 10 | using std::cout; |
8 | using std::endl; | 11 | using std::endl; |
9 | 12 | ||
10 | - | ||
11 | SecondaryStructure::SecondaryStructure() {} | 13 | SecondaryStructure::SecondaryStructure() {} |
12 | 14 | ||
13 | 15 | ||
... | @@ -98,6 +100,26 @@ string SecondaryStructure::to_DBN(void) const | ... | @@ -98,6 +100,26 @@ string SecondaryStructure::to_DBN(void) const |
98 | return res; | 100 | return res; |
99 | } | 101 | } |
100 | 102 | ||
103 | +string structure_with_contacts(const SecondaryStructure& ss) { | ||
104 | + string sequence = ss.rna_.get_seq(); | ||
105 | + string construct = ""; | ||
106 | + bool flag; | ||
107 | + for (uint i = 0; i < sequence.size(); i++) { | ||
108 | + flag = false; | ||
109 | + for (const Motif& m : ss.motif_info_) { | ||
110 | + for (uint j = 0; j < m.pos_contacts.size(); j++) { | ||
111 | + if (m.pos_contacts[j] == i) flag = true; | ||
112 | + } | ||
113 | + } | ||
114 | + if (flag) { | ||
115 | + construct += "*"; | ||
116 | + } else { | ||
117 | + construct += "."; | ||
118 | + } | ||
119 | + } | ||
120 | + return construct; | ||
121 | +} | ||
122 | + | ||
101 | string SecondaryStructure::to_string(void) const | 123 | string SecondaryStructure::to_string(void) const |
102 | { | 124 | { |
103 | string s; | 125 | string s; |
... | @@ -119,13 +141,35 @@ void SecondaryStructure::set_basepair(uint i, uint j) | ... | @@ -119,13 +141,35 @@ void SecondaryStructure::set_basepair(uint i, uint j) |
119 | 141 | ||
120 | void SecondaryStructure::insert_motif(const Motif& m) { motif_info_.push_back(m); } | 142 | void SecondaryStructure::insert_motif(const Motif& m) { motif_info_.push_back(m); } |
121 | 143 | ||
122 | - | 144 | +void colored_contacts(string sequence, vector<Motif> motif_info_) { |
145 | + bool flag; | ||
146 | + for (uint i = 0; i < sequence.size(); i++) { | ||
147 | + flag = false; | ||
148 | + for (const Motif& m : motif_info_) { | ||
149 | + for (uint j = 0; j < m.pos_contacts.size(); j++) { | ||
150 | + if (m.pos_contacts[j] == i) flag = true; | ||
151 | + } | ||
152 | + } | ||
153 | + if (flag) { | ||
154 | + cout << RED << sequence[i] << RESET; | ||
155 | + } else { | ||
156 | + cout << sequence[i]; | ||
157 | + } | ||
158 | + } | ||
159 | +} | ||
123 | 160 | ||
124 | void SecondaryStructure::print(void) const | 161 | void SecondaryStructure::print(void) const |
125 | { | 162 | { |
126 | cout << endl; | 163 | cout << endl; |
127 | - cout << '\t' << rna_.get_seq() << endl; | 164 | + cout << '\t'; |
128 | - cout << '\t' << to_string() << endl; | 165 | + colored_contacts(rna_.get_seq(), motif_info_); |
166 | + //rna_.get_seq() | ||
167 | + cout << endl; | ||
168 | + string ss = to_string(); | ||
169 | + cout << '\t'; | ||
170 | + colored_contacts(ss, motif_info_); | ||
171 | + //cout << ss; | ||
172 | + cout << endl; | ||
129 | for (const Motif& m : motif_info_) { | 173 | for (const Motif& m : motif_info_) { |
130 | uint i = 0; | 174 | uint i = 0; |
131 | cout << '\t'; | 175 | cout << '\t'; | ... | ... |
... | @@ -30,7 +30,6 @@ class SecondaryStructure | ... | @@ -30,7 +30,6 @@ class SecondaryStructure |
30 | string to_DBN() const; | 30 | string to_DBN() const; |
31 | string to_string() const; | 31 | string to_string() const; |
32 | 32 | ||
33 | - | ||
34 | vector<double> objective_scores_; // values of the different objective functions for that SecondaryStructure | 33 | vector<double> objective_scores_; // values of the different objective functions for that SecondaryStructure |
35 | vector<pair<uint, uint>> basepairs_; // values of the decision variable of the integer program | 34 | vector<pair<uint, uint>> basepairs_; // values of the decision variable of the integer program |
36 | vector<Motif> motif_info_; // information about known motives in this secondary structure and their positions | 35 | vector<Motif> motif_info_; // information about known motives in this secondary structure and their positions |
... | @@ -58,5 +57,7 @@ inline void SecondaryStructure::set_objective_score(int i, double s) { objecti | ... | @@ -58,5 +57,7 @@ inline void SecondaryStructure::set_objective_score(int i, double s) { objecti |
58 | inline uint SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); } | 57 | inline uint SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); } |
59 | inline uint SecondaryStructure::get_n_bp(void) const { return nBP_; } | 58 | inline uint SecondaryStructure::get_n_bp(void) const { return nBP_; } |
60 | 59 | ||
60 | +string structure_with_contacts(const SecondaryStructure& ss); | ||
61 | + | ||
61 | 62 | ||
62 | #endif // SECONDARY_STRUCTURE_ | 63 | #endif // SECONDARY_STRUCTURE_ |
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
This diff is collapsed. Click to expand it.
cppsrc/program
0 → 100644
No preview for this file type
... | @@ -58,12 +58,49 @@ RNA::RNA(string name, string seq, bool verbose) | ... | @@ -58,12 +58,49 @@ RNA::RNA(string name, string seq, bool verbose) |
58 | pij_(results->i-1,results->j-1) = results->p; | 58 | pij_(results->i-1,results->j-1) = results->p; |
59 | results++; | 59 | results++; |
60 | } | 60 | } |
61 | + | ||
62 | + /*define type_*/ | ||
63 | + type_ = vector<vector<int>>(n_, vector<int>(n_)); | ||
64 | + for(uint i = 0; i < n_; i++){ | ||
65 | + for(uint j = 0; j < n_; j++){ | ||
66 | + if (i < j){ | ||
67 | + std::stringstream ss; | ||
68 | + ss << seq_[i] << seq_[j]; | ||
69 | + std::string str = ss.str(); | ||
70 | + if(str.compare("AU") == 0 ){ | ||
71 | + type_[i][j] = 1; | ||
72 | + } | ||
73 | + else if(str.compare("CG") == 0 ){ | ||
74 | + type_[i][j] = 2; | ||
75 | + | ||
76 | + } | ||
77 | + else if(str.compare("GC") == 0 ){ | ||
78 | + type_[i][j] = 3; | ||
79 | + } | ||
80 | + else if(str.compare("GU") == 0 ){ | ||
81 | + type_[i][j] = 4; | ||
82 | + } | ||
83 | + else if(str.compare("UG") == 0 ){ | ||
84 | + type_[i][j] = 5; | ||
85 | + } | ||
86 | + else if(str.compare("UA") == 0 ){ | ||
87 | + type_[i][j] = 6; | ||
88 | + } | ||
89 | + else{ | ||
90 | + type_[i][j] = 0; | ||
91 | + } | ||
92 | + } | ||
93 | + else{ | ||
94 | + type_[i][j] = 0; | ||
95 | + } | ||
96 | + } | ||
97 | + } | ||
98 | + | ||
61 | } | 99 | } |
62 | 100 | ||
63 | else cerr << "NULL result returned by vrna_pfl_fold" << endl; | 101 | else cerr << "NULL result returned by vrna_pfl_fold" << endl; |
64 | } | 102 | } |
65 | 103 | ||
66 | - | ||
67 | void RNA::print_basepair_p_matrix(float theta) const | 104 | void RNA::print_basepair_p_matrix(float theta) const |
68 | { | 105 | { |
69 | cout << endl; | 106 | cout << endl; | ... | ... |
... | @@ -32,6 +32,8 @@ class RNA | ... | @@ -32,6 +32,8 @@ class RNA |
32 | uint get_RNA_length(void) const; | 32 | uint get_RNA_length(void) const; |
33 | void print_basepair_p_matrix(float theta) const; | 33 | void print_basepair_p_matrix(float theta) const; |
34 | 34 | ||
35 | + vector<vector<int>> get_type(); | ||
36 | + | ||
35 | bool verbose_; // Should we print things ? | 37 | bool verbose_; // Should we print things ? |
36 | 38 | ||
37 | private: | 39 | private: |
... | @@ -41,10 +43,15 @@ class RNA | ... | @@ -41,10 +43,15 @@ class RNA |
41 | string seq_; // sequence of the rna with chars | 43 | string seq_; // sequence of the rna with chars |
42 | uint n_; // length of the rna | 44 | uint n_; // length of the rna |
43 | MatrixXf pij_; // matrix of basepair probabilities | 45 | MatrixXf pij_; // matrix of basepair probabilities |
46 | + | ||
47 | + vector<vector<int>> type_; //vector of base pair types | ||
44 | }; | 48 | }; |
45 | 49 | ||
46 | inline float RNA::get_pij(int i, int j) { return pij_(i, j); } | 50 | inline float RNA::get_pij(int i, int j) { return pij_(i, j); } |
47 | inline uint RNA::get_RNA_length() const { return n_; } | 51 | inline uint RNA::get_RNA_length() const { return n_; } |
48 | inline string RNA::get_seq(void) const { return seq_; } | 52 | inline string RNA::get_seq(void) const { return seq_; } |
49 | 53 | ||
54 | +inline vector<vector<int>> RNA::get_type() { return type_; } | ||
55 | + | ||
56 | + | ||
50 | #endif | 57 | #endif | ... | ... |
data/fasta/applications.fa
100644 → 100755
1 | ->__'CRYSTAL_STRUCTURE_OF_A_TIGHT-BINDING_GLUTAMINE_TRNA_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE_'_(PDB_00376) | 1 | +>test_CRYSTAL_STRUCTURE_OF_A_TIGHT-BINDING_GLUTAMINE_TRNA_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE__PDB_00376 |
2 | -GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA | 2 | +GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA |
3 | ->__'GUANINE_RIBOSWITCH_U22C,_A52G_MUTANT_BOUND_TO_HYPOXANTHINE_'_(PDB_01023) | 3 | +>test_GUANINE_RIBOSWITCH_U22C,_A52G_MUTANT_BOUND_TO_HYPOXANTHINE__PDB_01023 |
4 | -GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa | 4 | +GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa |
5 | ->__'SOLUTION_STRUCTURE_OF_THE_P2B-P3_PSEUDOKNOT_FROM_HUMAN_TELOMERASE_RNA_'_(PDB_00857) | 5 | +>test_SOLUTION_STRUCTURE_OF_THE_P2B-P3_PSEUDOKNOT_FROM_HUMAN_TELOMERASE_RNA__PDB_00857 |
6 | -GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA | 6 | +GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA |
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
data/fasta/example.fa
100644 → 100755
File mode changed
data/fasta/motif_test.fa
deleted
100644 → 0
data/modules/ISAURE/Readme.md
0 → 100644
1 | +The motif library used with --contacts is a special one. It was provided by Isaure Chauvot de Beauchêne from the LORIA | ||
2 | +laboratory. These motifs are made up of RNA fragments linked to proteins. | ||
3 | +================================================================================================================== | ||
4 | + | ||
5 | +Several versions of this motif library have been provided, but the most complete is the latest: 'motifs_06-06-2021.json'. | ||
6 | +The current scripts were written for this file and do not work with the older libraries. | ||
7 | + | ||
8 | +There are also two benchmark files in JSON format: 'benchmark_16-06-2021.json' and 'benchmark_16-07-2021.json'. | ||
9 | +They contain complete RNA sequences that bind to a protein; the first one contains only 33 RNAs, and the second one | ||
10 | +contains 130 RNAs. | ||
11 | + | ||
12 | +The benchmark.dbn and benchmark.txt were created based on the 'benchmark_16-07-2021.json'. | ||
13 | +They are mostly used for the Isaure_benchmark.py script and scripts from the 'scripts' directory. | ||
14 | + | ||
15 | +The motifs_final.json file is obtained by running the count_pattern.cpp script from the 'scripts' directory on | ||
16 | +the 'motifs_06-06-2021.json' motif file. | ||
17 | +This script counts the number of "occurrences" of each motif: we consider that if the sequence of motif A | ||
18 | +is included in motif B, then for each inclusion of B we also have an inclusion of A. And vice versa. | ||
19 | + | ||
20 | +The motif library used by BiORSEO is the one in the 'bibliotheque_a_lire' directory. It should contain only | ||
21 | +the JSON file that BiORSEO is meant to use for its prediction, so do not put any other type of file there! | ||
22 | + | ||
23 | + | ||
24 | + | ||
25 | + | ||
26 | + | ||
27 | + |
data/modules/ISAURE/benchmark.dbn
0 → 100644
This diff is collapsed. Click to expand it.
data/modules/ISAURE/benchmark.json
0 → 100644
This diff is collapsed. Click to expand it.
data/modules/ISAURE/benchmark.txt
0 → 100644
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
data/modules/ISAURE/motifs_01-06-2021.json
0 → 100644
This diff could not be displayed because it is too large.
data/modules/ISAURE/motifs_06-06-2021.json
0 → 100644
This diff could not be displayed because it is too large.
data/modules/ISAURE/motifs_28-05-2021.json
0 → 100644
This diff could not be displayed because it is too large.
data/modules/ISAURE/motifs_final.json
0 → 100644
This diff could not be displayed because it is too large.
data/sec_structs/RNAMoIP_dataset.dbn
100644 → 100755
File mode changed
data/sec_structs/applications.dbn
100644 → 100755
File mode changed
data/sec_structs/bpRNA-1m_90_short.dbn
100644 → 100755
File mode changed
data/sec_structs/example.dbn
100644 → 100755
File mode changed
data/sec_structs/nothing.dbn
100644 → 100755
File mode changed
data/sec_structs/pseudobase++_short.dbn
100644 → 100755
File mode changed
data/sec_structs/pseudoknots.dbn
100644 → 100755
File mode changed
data/sec_structs/secondary_structures_database.dbn
100644 → 100755
File mode changed
data/sec_structs/verified_secondary_structures_database.dbn
100644 → 100755
File mode changed
doc/Benchmark_unconstrained.jpg
deleted
100644 → 0
207 KB
doc/Nmotifs.jpg
deleted
100644 → 0
66.7 KB
doc/Nsol.jpg
deleted
100644 → 0
39.3 KB
doc/OUP_First_SBk_Bot_8401.eps
deleted
100644 → 0
No preview for this file type
doc/algorithm2e.sty
deleted
100644 → 0
This diff is collapsed. Click to expand it.
doc/bioinfo.cls
deleted
100644 → 0
This diff is collapsed. Click to expand it.
doc/fig/Benchmark_avg.jpg
deleted
100644 → 0
217 KB
doc/fig/MOIP_subopt.jpg
deleted
100644 → 0
94 KB
doc/fig/RNA_SSE.png
deleted
100644 → 0
32.4 KB
doc/fig/kernels_B.png
deleted
100644 → 0
56.6 KB
doc/fig/kernels_C.png
deleted
100644 → 0
43.3 KB
doc/fig/kernels_D.png
deleted
100644 → 0
42 KB
doc/fig/pseudoknots.png
deleted
100644 → 0
62.9 KB
doc/graph_abstract.jpg
deleted
100644 → 0
181 KB
doc/kernels_A.jpg
deleted
100644 → 0
123 KB
doc/main_bioinformatics.tex
deleted
100644 → 0
This diff is collapsed. Click to expand it.
doc/supplementary_materials.tex
deleted
100644 → 0
This diff is collapsed. Click to expand it.
scripts/Compare_energy_results.py
0 → 100644
1 | +from math import sqrt, ceil | ||
2 | +import numpy as np | ||
3 | +import matplotlib.pyplot as plt | ||
4 | +import re | ||
5 | +import seaborn as sns | ||
6 | +import pandas as pd | ||
7 | +import matplotlib.pylab as plt | ||
8 | + | ||
9 | +# Retrieve, for each RNA, the best MEA value and compare this energy value with the ones obtained with | ||
10 | +# RNAeval and RNAfold from the ViennaRNA Package 2.0 (Ronny Lorenz et al., 2011). | ||
11 | +# Once those values are collected, a figure is created. | ||
def get_result_MEA(filename, results_dir="/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/results/"):
    """Return the lowest negated score found in a BiORSEO result file.

    The file ``results_dir + filename + "json_pmE"`` is read as follows: the
    first two lines are skipped, then lines are consumed in pairs. The last
    whitespace-separated token of the first line of each pair is taken as a
    score and negated; the second line of the pair is ignored.

    Args:
        filename: Base name of the result file, without the "json_pmE" suffix.
        results_dir: Prefix prepended verbatim to ``filename``; it must end
            with a path separator. Defaults to the original hard-coded path.

    Returns:
        The minimum of the negated scores, as a float.

    Raises:
        ValueError: If the file contains no score line, or a score token
            cannot be converted to a float.
        OSError: If the file cannot be opened.
    """
    ext = "json_pmE"
    best = None
    # The context manager closes the file even if a score fails to parse.
    with open(results_dir + filename + ext, "r") as results:
        results.readline()  # first header line (not used)
        results.readline()  # second header line (not used)
        while True:
            twod = results.readline()
            if not twod:
                break  # readline() returns '' only at end of file
            pred = re.findall(r'\S+', twod)
            if len(pred) > 0:
                # The score is stored without its sign; prepend '-' to negate it.
                score = float('-' + pred[len(pred) - 1])
                if best is None or score < best:
                    best = score
            results.readline()  # second line of the pair (not used)
    if best is None:
        raise ValueError("no score found in " + results_dir + filename + ext)
    return best
34 | + | ||
def _seek_record(log, rna, line_rna, line_struct, log_name):
    """Advance a two-lines-per-record log until its first line equals `rna`.

    Returns the matching (first line, second line) pair. Raises ValueError if
    the end of the log is reached first; readline() keeps returning '' at EOF,
    so looping on inequality alone would never terminate.
    """
    while line_rna != rna:
        line_rna = log.readline()
        line_struct = log.readline()
        if line_rna == "" and rna != "":
            raise ValueError("sequence not found in " + log_name + ": " + rna.strip())
    return line_rna, line_struct


list_name = []
list_score = []
list_type = []

with open("/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/RNAfold_bm.log", "r") as fileMFE, \
     open("/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/RNAeval_bm.log", "r") as fileEval, \
     open("/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn", "r") as file:
    lineRna = fileMFE.readline()
    lineStruct = fileMFE.readline()
    lineRna2 = fileEval.readline()
    lineStruct2 = fileEval.readline()

    # The benchmark is read four lines at a time; only the first two are used here.
    name = file.readline().strip()
    rna = file.readline()
    file.readline()
    file.readline()

    while name:
        # Label used on the plot: the name without its first five characters and its last one.
        label = name[5:len(name) - 1]

        # Energy from the RNAfold log: characters [len-8:len-2] of the structure line.
        lineRna, lineStruct = _seek_record(fileMFE, rna, lineRna, lineStruct, "RNAfold_bm.log")
        MFE = float(lineStruct[len(lineStruct)-8:len(lineStruct)-2])
        list_name.append(label)
        list_score.append(MFE)
        list_type.append('MFE')
        lineRna = fileMFE.readline()
        lineStruct = fileMFE.readline()

        # Energy from the RNAeval log, read the same way.
        lineRna2, lineStruct2 = _seek_record(fileEval, rna, lineRna2, lineStruct2, "RNAeval_bm.log")
        eval_score = float(lineStruct2[len(lineStruct2)-8:len(lineStruct2)-2])
        list_name.append(label)
        list_score.append(eval_score)
        list_type.append('eval')
        lineRna2 = fileEval.readline()
        lineStruct2 = fileEval.readline()

        # Best value reported in the BiORSEO result file of this RNA.
        best_mea = get_result_MEA(name)
        list_name.append(label)
        list_score.append(best_mea)
        list_type.append('MEA')

        name = file.readline().strip()
        rna = file.readline()
        file.readline()
        file.readline()

# One row per (rna, type of score) pair, plotted as a strip plot coloured by score type.
d = {'rna':list_name,'score':list_score, 'type_score':list_type}
df = pd.DataFrame(d, columns=['rna','type_score','score'])

sns.stripplot(x="rna",y="score",data=df,jitter=True,hue='type_score',palette='Set1')
plt.xticks(rotation=90)
plt.savefig("compare_BiORSEOMEA_RNAeval_RNAfold.png")
103 | + | ||
104 | + |
scripts/add_delimiter.cpp
0 → 100644
1 | +#include <iostream> | ||
2 | +#include <sstream> | ||
3 | +#include <fstream> | ||
4 | +#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp" | ||
5 | +#include <typeinfo> | ||
6 | +#include <set> | ||
7 | +#include <algorithm> | ||
8 | +#include <cstdio> | ||
9 | +#include <vector> | ||
10 | + | ||
11 | +using namespace std; | ||
12 | +using json = nlohmann::json; | ||
13 | + | ||
/// Counts the '&' delimiters in a motif string.
///
/// @param seq  String to scan (a motif sequence or contacts string); it is only read.
/// @return     Number of '&' characters in seq; 0 for an empty string.
size_t count_delimiter(const string& seq) {
    return static_cast<size_t>(std::count(seq.begin(), seq.end(), '&'));
}
25 | + | ||
26 | +/* | ||
27 | +If there is a '&' in the motif sequence in the field 'sequence' but not in the field 'contacts', | ||
28 | +th script put a '&' in the same position in the field 'contacts' than in the field 'sequence'. | ||
29 | +*/ | ||
30 | +void add_delimiter(const string& jsonfile, const string& jsonoutfile) { | ||
31 | + std::ifstream lib(jsonfile); | ||
32 | + | ||
33 | + std::ofstream outfile (jsonoutfile); | ||
34 | + json new_motif; | ||
35 | + json new_id; | ||
36 | + | ||
37 | + json js = json::parse(lib); | ||
38 | + | ||
39 | + //the list of pfam lists of the motif we want to count the inclusion in other motif | ||
40 | + for (auto it = js.begin(); it != js.end(); ++it) { | ||
41 | + string id = it.key(); | ||
42 | + string test; | ||
43 | + string sequence; | ||
44 | + string contacts; | ||
45 | + bool is_change = false; | ||
46 | + | ||
47 | + //cout << "id: " << id << endl; | ||
48 | + for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) { | ||
49 | + test = it2.key(); | ||
50 | + | ||
51 | + if (!test.compare("sequence")) { | ||
52 | + //cout << "sequence: " << it2.value() << endl; | ||
53 | + sequence = it2.value(); | ||
54 | + new_id[test] = it2.value(); | ||
55 | + | ||
56 | + } else if (!test.compare("contacts") ) { | ||
57 | + contacts = it2.value(); | ||
58 | + } else { | ||
59 | + new_id[test] = it2.value(); | ||
60 | + } | ||
61 | + } | ||
62 | + string tmp = ""; | ||
63 | + if (count_delimiter(contacts) != count_delimiter(sequence) && contacts.size() == sequence.size()) { | ||
64 | + for (uint i = 0; i < sequence.size(); i++) { | ||
65 | + if (sequence.at(i) == '&') { | ||
66 | + tmp += "&"; | ||
67 | + } else { | ||
68 | + tmp += contacts.at(i); | ||
69 | + } | ||
70 | + } | ||
71 | + } else { | ||
72 | + tmp = contacts; | ||
73 | + } | ||
74 | + new_id["contacts"] = tmp; | ||
75 | + new_motif[id] = new_id; | ||
76 | + new_id.clear(); | ||
77 | + } | ||
78 | + outfile << new_motif.dump(4) << endl; | ||
79 | + outfile.close(); | ||
80 | + | ||
81 | +} | ||
82 | + | ||
83 | +int main() | ||
84 | +{ | ||
85 | + string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json"; | ||
86 | + string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_tmp.json"; | ||
87 | + add_delimiter(jsonfile, out); | ||
88 | + return 0; | ||
89 | +} | ||
90 | + |
... | @@ -29,7 +29,7 @@ import pickle | ... | @@ -29,7 +29,7 @@ import pickle |
29 | # ================== DEFINITION OF THE PATHS ============================== | 29 | # ================== DEFINITION OF THE PATHS ============================== |
30 | 30 | ||
31 | biorseoDir = path.realpath(".") | 31 | biorseoDir = path.realpath(".") |
32 | -jar3dexec = "/home/persalteas/Software/jar3dbin/jar3d_2014-12-11.jar" | 32 | +jar3dexec = "/local/local/localopt/jar3d_2014-12-11.jar" |
33 | bypdir = biorseoDir + "/BayesPairing/bayespairing/src" | 33 | bypdir = biorseoDir + "/BayesPairing/bayespairing/src" |
34 | byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src" | 34 | byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src" |
35 | moipdir = "/home/persalteas/Software/RNAMoIP/Src/RNAMoIP.py" | 35 | moipdir = "/home/persalteas/Software/RNAMoIP/Src/RNAMoIP.py" |
... | @@ -803,7 +803,7 @@ class Method: | ... | @@ -803,7 +803,7 @@ class Method: |
803 | else: | 803 | else: |
804 | results_file = outputDir+f"{'' if self.allow_pk else 'no'}PK/"+basename+f".biorseo_{self.data_source.lower()}_{self.placement_method.lower()}_{self.func}" | 804 | results_file = outputDir+f"{'' if self.allow_pk else 'no'}PK/"+basename+f".biorseo_{self.data_source.lower()}_{self.placement_method.lower()}_{self.func}" |
805 | c += ["--bayespaircsv", outputDir+basename+f".{self.data_source.lower()}_{self.placement_method.lower()}.csv"] | 805 | c += ["--bayespaircsv", outputDir+basename+f".{self.data_source.lower()}_{self.placement_method.lower()}.csv"] |
806 | - c += ["-o", results_file, "--func", self.func] | 806 | + c += ["-o", results_file, "--func", self.func, "--MFE"] |
807 | if not self.allow_pk: | 807 | if not self.allow_pk: |
808 | c += ["-n"] | 808 | c += ["-n"] |
809 | self.joblist.append(Job(command=c, priority=4, timeout=3600, | 809 | self.joblist.append(Job(command=c, priority=4, timeout=3600, | ... | ... |
... | @@ -11,6 +11,12 @@ | ... | @@ -11,6 +11,12 @@ |
11 | using namespace std; | 11 | using namespace std; |
12 | using json = nlohmann::json; | 12 | using json = nlohmann::json; |
13 | 13 | ||
14 | +/* | ||
15 | +This script counts the number of "occurrences" of each motif. | ||
16 | +We consider that if the sequence of pattern A is included in pattern B, | ||
17 | +then for each inclusion of B we also have an inclusion of A. And vice versa. | ||
18 | +*/ | ||
19 | + | ||
14 | //Return true if the first sequence seq1 is included in the second sequence seq2 | 20 | //Return true if the first sequence seq1 is included in the second sequence seq2 |
15 | //if not return false | 21 | //if not return false |
16 | int is_contains(string& seq1, string& seq2) { | 22 | int is_contains(string& seq1, string& seq2) { |
... | @@ -38,6 +44,8 @@ int is_contains(string& seq1, string& seq2) { | ... | @@ -38,6 +44,8 @@ int is_contains(string& seq1, string& seq2) { |
38 | 44 | ||
39 | //If we find the sequence and structure of pattern A in pattern B, we have to concatenate the pfam lists of A and B, | 45 | //If we find the sequence and structure of pattern A in pattern B, we have to concatenate the pfam lists of A and B, |
40 | //remove the duplicates, assign this new list of pfam lists to A, and assign as occurrence to A the size of this list. | 46 | //remove the duplicates, assign this new list of pfam lists to A, and assign as occurrence to A the size of this list. |
47 | +//The pattern A is counted only once in every other pattern, i.e. even if the sequence of A is found several times in B, | ||
48 | +// it will be added only once in the occurrences of A. | ||
41 | void counting_occurences(const string& jsonfile, const string& jsonoutfile) { | 49 | void counting_occurences(const string& jsonfile, const string& jsonoutfile) { |
42 | std::ifstream lib(jsonfile); | 50 | std::ifstream lib(jsonfile); |
43 | std::ifstream lib2(jsonfile); | 51 | std::ifstream lib2(jsonfile); |
... | @@ -73,14 +81,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { | ... | @@ -73,14 +81,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { |
73 | if (!test.compare("pfam")) { | 81 | if (!test.compare("pfam")) { |
74 | vector<vector<string>> tab = it2.value(); | 82 | vector<vector<string>> tab = it2.value(); |
75 | list_pfams = tab; | 83 | list_pfams = tab; |
76 | - /*set<set<string>>::iterator iit; | ||
77 | - set<string>::iterator iit2; | ||
78 | - for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) { | ||
79 | - for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) { | ||
80 | - cout << *iit2 << endl; | ||
81 | - } | ||
82 | - cout << endl << endl; | ||
83 | - }*/ | ||
84 | } else if (!test.compare("sequence")) { | 84 | } else if (!test.compare("sequence")) { |
85 | //cout << "sequence: " << it2.value() << endl; | 85 | //cout << "sequence: " << it2.value() << endl; |
86 | sequence = it2.value(); | 86 | sequence = it2.value(); |
... | @@ -124,7 +124,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { | ... | @@ -124,7 +124,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { |
124 | new_id[test] = it2.value(); | 124 | new_id[test] = it2.value(); |
125 | } | 125 | } |
126 | } | 126 | } |
127 | - //cout << "-------begin---------" << endl; | ||
128 | 127 | ||
129 | for (auto it3 = js2.begin(); it3 != js2.end(); ++it3) { | 128 | for (auto it3 = js2.begin(); it3 != js2.end(); ++it3) { |
130 | string id2 = it3.key(); | 129 | string id2 = it3.key(); |
... | @@ -142,22 +141,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { | ... | @@ -142,22 +141,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { |
142 | if (!test.compare("pfam")) { | 141 | if (!test.compare("pfam")) { |
143 | vector<vector<string>> tab = it4.value(); | 142 | vector<vector<string>> tab = it4.value(); |
144 | list_pfams2 = tab; | 143 | list_pfams2 = tab; |
145 | - /*for (uint k = 0; k < tab2.size(); k++) { | ||
146 | - for (uint l = 0; l < tab2[k].size(); l++) { | ||
147 | - pfams2.insert(tab2[k][l]); | ||
148 | - } | ||
149 | - list_pfams2.insert(pfams); | ||
150 | - pfams2.clear(); | ||
151 | - }*/ | ||
152 | - | ||
153 | - /*set<set<string>>::iterator iit; | ||
154 | - set<string>::iterator iit2; | ||
155 | - for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) { | ||
156 | - for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) { | ||
157 | - cout << *iit2 << endl; | ||
158 | - } | ||
159 | - cout << endl << endl; | ||
160 | - }*/ | ||
161 | } else if (!test.compare("occurences")) { | 144 | } else if (!test.compare("occurences")) { |
162 | occurences2 = it4.value(); | 145 | occurences2 = it4.value(); |
163 | //cout << "occurences2: "<< occurences2 << endl; | 146 | //cout << "occurences2: "<< occurences2 << endl; |
... | @@ -216,7 +199,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { | ... | @@ -216,7 +199,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { |
216 | 199 | ||
217 | } | 200 | } |
218 | } | 201 | } |
219 | - //cout << "----end----" << endl; | ||
220 | //} | 202 | //} |
221 | } | 203 | } |
222 | if(flag) { | 204 | if(flag) { |
... | @@ -242,23 +224,12 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { | ... | @@ -242,23 +224,12 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { |
242 | //cout << endl;*/ | 224 | //cout << endl;*/ |
243 | } | 225 | } |
244 | 226 | ||
245 | - | ||
246 | - /*for(uint ii = 0; ii < list_pfams.size(); ii++) { | ||
247 | - for (uint jj = 0; jj < list_pfams[ii].size(); jj++) { | ||
248 | - cout << "[" << ii << "][" << jj << "]: " << list_pfams[ii][jj] << endl; | ||
249 | - } | ||
250 | - }*/ | ||
251 | 227 | ||
252 | new_id["occurences"] = list_pfams.size(); | 228 | new_id["occurences"] = list_pfams.size(); |
253 | - new_id["pfam"] = list_pfams; | 229 | + new_id["pfam"] = list_pfams; |
254 | - | ||
255 | - //cout << "-------ending---------" << endl; | ||
256 | new_motif[id] = new_id; | 230 | new_motif[id] = new_id; |
257 | new_id.clear(); | 231 | new_id.clear(); |
258 | - //cout << "valeur: " << ite << endl; | 232 | + |
259 | - /*for (uint i = 0; i < tab_struc.size() ; i++) { | ||
260 | - cout << "tab_struc[" << i << "]: " << tab_struc[i] << endl << endl; | ||
261 | - } */ | ||
262 | } | 233 | } |
263 | outfile << new_motif.dump(4) << endl; | 234 | outfile << new_motif.dump(4) << endl; |
264 | outfile.close(); | 235 | outfile.close(); |
... | @@ -267,13 +238,11 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { | ... | @@ -267,13 +238,11 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) { |
267 | 238 | ||
268 | int main() | 239 | int main() |
269 | { | 240 | { |
270 | - //183 | 241 | + |
271 | - //cout << "------------------BEGIN-----------------" << endl; | 242 | + string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json"; |
272 | - string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/motifs_06-06-2021.json"; | 243 | + string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json"; |
273 | - string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_derniere_version/motifs_final.json"; | ||
274 | counting_occurences(jsonfile, out); | 244 | counting_occurences(jsonfile, out); |
275 | 245 | ||
276 | - //cout << "------------------END-----------------" << endl; | ||
277 | return 0; | 246 | return 0; |
278 | } | 247 | } |
279 | 248 | ... | ... |
scripts/create_files.cpp
0 → 100644
1 | +#include <iostream> | ||
2 | +#include <sstream> | ||
3 | +#include <fstream> | ||
4 | +#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp" | ||
5 | +#include <typeinfo> | ||
6 | +#include <set> | ||
7 | +#include <algorithm> | ||
8 | +#include <cstdio> | ||
9 | +#include <vector> | ||
10 | + | ||
11 | +using namespace std; | ||
12 | +using json = nlohmann::json; | ||
13 | + | ||
14 | +/* | ||
15 | +Create a FASTA (.fa) file for each of the sequences in the JSON-format benchmark. | ||
16 | +Also create a .dbn and a .txt file that list the name, sequence, 2D structure and contacts of all the sequences in the benchmark file. | ||
17 | +Those files are useful for the Isaure_benchmark.py script. | ||
18 | +*/ | ||
19 | +void create_files(const string& jsonmotifs) { | ||
20 | + std::ifstream lib(jsonmotifs); | ||
21 | + string fasta = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/fasta/"; | ||
22 | + string list = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.txt"; | ||
23 | + string dbn = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn"; | ||
24 | + std::ofstream outlist (list); | ||
25 | + std::ofstream outdbn (dbn); | ||
26 | + json js = json::parse(lib); | ||
27 | + uint count = 0; | ||
28 | + | ||
29 | + for (auto it = js.begin(); it != js.end(); ++it) { | ||
30 | + string id = it.key(); | ||
31 | + string name, seq, contacts, structure; | ||
32 | + for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) { | ||
33 | + string chain = it2.key(); | ||
34 | + if (chain.compare("pfams") != 0) { | ||
35 | + string name = id + "_" + chain; | ||
36 | + string filename = fasta + name + ".fa"; | ||
37 | + std::ofstream outfasta (filename); | ||
38 | + outfasta << ">test_" << name << endl; | ||
39 | + for (auto it3 = js[id][chain].begin(); it3 != js[id][chain].end(); ++it3) { | ||
40 | + string field = it3.key(); | ||
41 | + if (!field.compare("sequence")) { | ||
42 | + seq = it3.value(); | ||
43 | + outfasta << seq.substr(0,seq.size()) << endl; | ||
44 | + outfasta.close(); | ||
45 | + | ||
46 | + } else if (!field.compare("contacts")) { | ||
47 | + contacts = it3.value(); | ||
48 | + | ||
49 | + } else if (!field.compare("struct2d")) { | ||
50 | + structure = it3.value(); | ||
51 | + } | ||
52 | + } | ||
53 | + if(seq.find('&') == string::npos) { | ||
54 | + outlist << ">test_" << name << endl; | ||
55 | + outdbn << "test_" << name << "." << endl; | ||
56 | + outlist << contacts << endl; | ||
57 | + outdbn << seq << endl; | ||
58 | + outdbn << structure << endl; | ||
59 | + outdbn << contacts << endl; | ||
60 | + outlist << seq << endl; | ||
61 | + outlist << structure << endl; | ||
62 | + count++; | ||
63 | + } | ||
64 | + } | ||
65 | + } | ||
66 | + } | ||
67 | + cout << count << " sequences en tout" << endl; | ||
68 | + lib.close(); | ||
69 | + outlist.close(); | ||
70 | + outdbn.close(); | ||
71 | +} | ||
72 | + | ||
73 | +int main() | ||
74 | +{ | ||
75 | + string path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/"; | ||
76 | + string jsonbm = path + "modules/ISAURE/benchmark_16-07-2021.json"; | ||
77 | + create_files(jsonbm); | ||
78 | + | ||
79 | + return 0; | ||
80 | +} | ||
81 | + |
scripts/delete_same_pdb.cpp
0 → 100644
1 | +#include <iostream> | ||
2 | +#include <sstream> | ||
3 | +#include <fstream> | ||
4 | +#include "/local/local/BiorseoNath/cppsrc/json.hpp" | ||
5 | +#include <typeinfo> | ||
6 | +#include <set> | ||
7 | +#include <algorithm> | ||
8 | +#include <cstdio> | ||
9 | +#include <vector> | ||
10 | +#include <string> | ||
11 | + | ||
12 | +using namespace std; | ||
13 | +using json = nlohmann::json; | ||
14 | + | ||
15 | +/* | ||
16 | +This script is use to create a new motif library without a motif that contains the same pdb as the sequence used in input for prediction | ||
17 | +with BiORSEO. | ||
18 | +*/ | ||
19 | +void delete_redundant_pdb(const string& jsonlibrary, const string& name, const string& jsonoutfile) { | ||
20 | + std::ifstream lib(jsonlibrary); | ||
21 | + | ||
22 | + std::ofstream outfile (jsonoutfile); | ||
23 | + json new_motif; | ||
24 | + json new_id; | ||
25 | + json js = json::parse(lib); | ||
26 | + | ||
27 | + for (auto it = js.begin(); it != js.end(); ++it) { | ||
28 | + string id = it.key(); | ||
29 | + vector<string> list_pdbs; | ||
30 | + bool is_added = true; | ||
31 | + | ||
32 | + for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) { | ||
33 | + string field = it2.key(); | ||
34 | + | ||
35 | + if (!field.compare("pdb")) { | ||
36 | + vector<string> tab = it2.value(); | ||
37 | + list_pdbs = tab; | ||
38 | + } else { | ||
39 | + new_id[field] = it2.value(); | ||
40 | + } | ||
41 | + } | ||
42 | + | ||
43 | + if (count(list_pdbs.begin(), list_pdbs.end(), name.substr(0, name.size()-2))) { | ||
44 | + is_added = false; | ||
45 | + } | ||
46 | + if (is_added) { | ||
47 | + new_id["pdb"] = list_pdbs; | ||
48 | + new_motif[id] = new_id; | ||
49 | + } | ||
50 | + new_id.clear(); | ||
51 | + } | ||
52 | + outfile << new_motif.dump(4) << endl; | ||
53 | + outfile.close(); | ||
54 | +} | ||
55 | + | ||
56 | +int main(int argc, char** argv) | ||
57 | +{ | ||
58 | + string jsonlibrary = "/local/local/BiorseoNath/data/modules/ISAURE/motifs_final.json"; | ||
59 | + string out = "/local/local/BiorseoNath/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json"; | ||
60 | + string name = argv[1]; | ||
61 | + delete_redundant_pdb(jsonlibrary, name, out); | ||
62 | + return 0; | ||
63 | +} | ||
64 | + |
... | @@ -28,17 +28,18 @@ | ... | @@ -28,17 +28,18 @@ |
28 | from math import sqrt | 28 | from math import sqrt |
29 | import numpy as np | 29 | import numpy as np |
30 | import matplotlib.pyplot as plt | 30 | import matplotlib.pyplot as plt |
31 | -from matplotlib import cm | 31 | +from matplotlib import cm |
32 | import scipy.stats as st | 32 | import scipy.stats as st |
33 | import sys | 33 | import sys |
34 | import os | 34 | import os |
35 | import subprocess | 35 | import subprocess |
36 | import getopt | 36 | import getopt |
37 | 37 | ||
38 | + | ||
38 | class SecStruct: | 39 | class SecStruct: |
39 | def __init__(self, dot_bracket, obj1_value, obj2_value): | 40 | def __init__(self, dot_bracket, obj1_value, obj2_value): |
40 | self.dbn = dot_bracket | 41 | self.dbn = dot_bracket |
41 | - self.objectives = [ obj1_value, obj2_value ] | 42 | + self.objectives = [obj1_value, obj2_value] |
42 | self.basepair_list = self.get_basepairs() | 43 | self.basepair_list = self.get_basepairs() |
43 | self.length = len(dot_bracket) | 44 | self.length = len(dot_bracket) |
44 | 45 | ||
... | @@ -96,9 +97,9 @@ class SecStruct: | ... | @@ -96,9 +97,9 @@ class SecStruct: |
96 | tn = reference_structure.length * (reference_structure.length - 1) * 0.5 - fp - fn - tp | 97 | tn = reference_structure.length * (reference_structure.length - 1) * 0.5 - fp - fn - tp |
97 | 98 | ||
98 | # Compute MCC | 99 | # Compute MCC |
99 | - if (tp+fp == 0): | 100 | + if (tp + fp == 0): |
100 | print("We have an issue : no positives detected ! (linear structure)") | 101 | print("We have an issue : no positives detected ! (linear structure)") |
101 | - return (tp*tn-fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)) | 102 | + return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) |
102 | 103 | ||
103 | 104 | ||
104 | class Pareto: | 105 | class Pareto: |
... | @@ -106,16 +107,16 @@ class Pareto: | ... | @@ -106,16 +107,16 @@ class Pareto: |
106 | self.predictions = list_of_structs | 107 | self.predictions = list_of_structs |
107 | self.true_structure = reference | 108 | self.true_structure = reference |
108 | self.n_pred = len(list_of_structs) | 109 | self.n_pred = len(list_of_structs) |
109 | - self.max_obj1 = max([ s.objectives[0] for s in self.predictions ]) | 110 | + self.max_obj1 = max([s.objectives[0] for s in self.predictions]) |
110 | - self.max_obj2 = max([ s.objectives[1] for s in self.predictions ]) | 111 | + self.max_obj2 = max([s.objectives[1] for s in self.predictions]) |
111 | self.index_of_best = self.find_best_solution() | 112 | self.index_of_best = self.find_best_solution() |
112 | - | 113 | + |
113 | def find_best_solution(self): | 114 | def find_best_solution(self): |
114 | # returns the index of the solution of the Pareto set which is the closest | 115 | # returns the index of the solution of the Pareto set which is the closest |
115 | # to the real 2D structure (the one with the max MCC) | 116 | # to the real 2D structure (the one with the max MCC) |
116 | max_i = -1 | 117 | max_i = -1 |
117 | max_mcc = -1 | 118 | max_mcc = -1 |
118 | - for i,s in enumerate(self.predictions): | 119 | + for i, s in enumerate(self.predictions): |
119 | mcc = s.get_MCC_with(self.true_structure) | 120 | mcc = s.get_MCC_with(self.true_structure) |
120 | if mcc > max_mcc: | 121 | if mcc > max_mcc: |
121 | max_mcc = mcc | 122 | max_mcc = mcc |
... | @@ -125,15 +126,15 @@ class Pareto: | ... | @@ -125,15 +126,15 @@ class Pareto: |
125 | def get_normalized_coords(self): | 126 | def get_normalized_coords(self): |
126 | # retrieves the objective values of the best solution and normlizes them | 127 | # retrieves the objective values of the best solution and normlizes them |
127 | coords = self.predictions[self.index_of_best].objectives | 128 | coords = self.predictions[self.index_of_best].objectives |
128 | - if self.max_obj1: # avoid divide by zero if all solutions are 0 | 129 | + if self.max_obj1: # avoid divide by zero if all solutions are 0 |
129 | - x = coords[0]/self.max_obj1 | 130 | + x = coords[0] / self.max_obj1 |
130 | else: | 131 | else: |
131 | x = 0.5 | 132 | x = 0.5 |
132 | - if self.max_obj2: # avoid divide by zero if all solutions are 0 | 133 | + if self.max_obj2: # avoid divide by zero if all solutions are 0 |
133 | - y = coords[1]/self.max_obj2 | 134 | + y = coords[1] / self.max_obj2 |
134 | else: | 135 | else: |
135 | y = 0.5 | 136 | y = 0.5 |
136 | - return ( x, y ) | 137 | + return (x, y) |
137 | 138 | ||
138 | 139 | ||
139 | class RNA: | 140 | class RNA: |
... | @@ -145,6 +146,8 @@ class RNA: | ... | @@ -145,6 +146,8 @@ class RNA: |
145 | 146 | ||
146 | 147 | ||
147 | ignored_nt_dict = {} | 148 | ignored_nt_dict = {} |
149 | + | ||
150 | + | ||
148 | def is_canonical_nts(seq): | 151 | def is_canonical_nts(seq): |
149 | for c in seq[:-1]: | 152 | for c in seq[:-1]: |
150 | if c not in "ACGU": | 153 | if c not in "ACGU": |
... | @@ -155,6 +158,7 @@ def is_canonical_nts(seq): | ... | @@ -155,6 +158,7 @@ def is_canonical_nts(seq): |
155 | return False | 158 | return False |
156 | return True | 159 | return True |
157 | 160 | ||
161 | + | ||
158 | def is_canonical_bps(struct): | 162 | def is_canonical_bps(struct): |
159 | if "()" in struct: | 163 | if "()" in struct: |
160 | return False | 164 | return False |
... | @@ -203,6 +207,7 @@ def load_from_dbn(file, header_style=3): | ... | @@ -203,6 +207,7 @@ def load_from_dbn(file, header_style=3): |
203 | db.close() | 207 | db.close() |
204 | return container, pkcounter | 208 | return container, pkcounter |
205 | 209 | ||
210 | + | ||
206 | def parse_biokop(folder, basename, ext=".biok"): | 211 | def parse_biokop(folder, basename, ext=".biok"): |
207 | solutions = [] | 212 | solutions = [] |
208 | err = 0 | 213 | err = 0 |
... | @@ -243,6 +248,7 @@ def parse_biokop(folder, basename, ext=".biok"): | ... | @@ -243,6 +248,7 @@ def parse_biokop(folder, basename, ext=".biok"): |
243 | err = 1 | 248 | err = 1 |
244 | return None, err | 249 | return None, err |
245 | 250 | ||
251 | + | ||
246 | def parse_biorseo(folder, basename, ext): | 252 | def parse_biorseo(folder, basename, ext): |
247 | solutions = [] | 253 | solutions = [] |
248 | err = 0 | 254 | err = 0 |
... | @@ -266,6 +272,7 @@ def parse_biorseo(folder, basename, ext): | ... | @@ -266,6 +272,7 @@ def parse_biorseo(folder, basename, ext): |
266 | err = 1 | 272 | err = 1 |
267 | return None, err | 273 | return None, err |
268 | 274 | ||
275 | + | ||
269 | def prettify_biorseo(code): | 276 | def prettify_biorseo(code): |
270 | name = "" | 277 | name = "" |
271 | if "bgsu" in code: | 278 | if "bgsu" in code: |
... | @@ -301,8 +308,8 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf | ... | @@ -301,8 +308,8 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf |
301 | print("[%s] Loaded %d solutions in a Pareto set, max(obj1)=%f, max(obj2)=%f" % (rna.basename_, pset.n_pred, pset.max_obj1, pset.max_obj2)) | 308 | print("[%s] Loaded %d solutions in a Pareto set, max(obj1)=%f, max(obj2)=%f" % (rna.basename_, pset.n_pred, pset.max_obj1, pset.max_obj2)) |
302 | print("Loaded %d points on %d." % (len(points), len(RNAcontainer)-skipped)) | 309 | print("Loaded %d points on %d." % (len(points), len(RNAcontainer)-skipped)) |
303 | 310 | ||
304 | - x = np.array([ p[0] for p in points ]) | 311 | + x = np.array([p[0] for p in points]) |
305 | - y = np.array([ p[1] for p in points ]) | 312 | + y = np.array([p[1] for p in points]) |
306 | xmin, xmax = 0, 1 | 313 | xmin, xmax = 0, 1 |
307 | ymin, ymax = 0, 1 | 314 | ymin, ymax = 0, 1 |
308 | xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] | 315 | xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] |
... | @@ -316,19 +323,21 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf | ... | @@ -316,19 +323,21 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf |
316 | ax[pos].axvline(x=1, alpha=0.2, color='black') | 323 | ax[pos].axvline(x=1, alpha=0.2, color='black') |
317 | ax[pos].contourf(xx, yy, f, cmap=cm.Blues, alpha=0.5) | 324 | ax[pos].contourf(xx, yy, f, cmap=cm.Blues, alpha=0.5) |
318 | ax[pos].scatter(x, y, s=25, alpha=0.1) | 325 | ax[pos].scatter(x, y, s=25, alpha=0.1) |
319 | - ax[pos].set_xlim((-0.1,1.1)) | 326 | + ax[pos].set_xlim((-0.1, 1.1)) |
320 | - ax[pos].set_ylim((-0.1,1.1)) | 327 | + ax[pos].set_ylim((-0.1, 1.1)) |
321 | - ax[pos].annotate("("+str(len(points))+'/'+str(len(RNAcontainer)-skipped)+" RNAs)", (0.08,0.15)) | 328 | + ax[pos].set_title(prettify_biorseo(ext[1:]), fontsize=10) |
329 | + ax[pos].annotate("(" + str(len(points)) + '/' + str(len(RNAcontainer)-skipped) + " RNAs)", (0.08, 0.15)) | ||
322 | ax[pos].set_xlabel(xlabel) | 330 | ax[pos].set_xlabel(xlabel) |
323 | ax[pos].set_ylabel(ylabel) | 331 | ax[pos].set_ylabel(ylabel) |
324 | 332 | ||
325 | if nsolutions: | 333 | if nsolutions: |
326 | - ax[pos+1].hist(sizes, bins=range(0, max(sizes)+1, 2), histtype='bar') | 334 | + ax[pos + 1].hist(sizes, bins=range(0, max(sizes) + 1, 2), histtype='bar') |
327 | - ax[pos+1].set_xlim((0,max(sizes)+2)) | 335 | + ax[pos + 1].set_xlim((0, max(sizes) + 2)) |
328 | - ax[pos+1].set_xticks(range(0, max(sizes), 10)) | 336 | + ax[pos + 1].set_xticks(range(0, max(sizes), 10)) |
329 | - ax[pos+1].set_xticklabels(range(0, max(sizes), 10), rotation=90) | 337 | + ax[pos + 1].set_xticklabels(range(0, max(sizes), 10), rotation=90) |
330 | - ax[pos+1].set_xlabel("# solutions") | 338 | + ax[pos + 1].set_xlabel("# solutions") |
331 | - ax[pos+1].set_ylabel("# RNAs") | 339 | + ax[pos + 1].set_ylabel("# RNAs") |
340 | + | ||
332 | 341 | ||
333 | if __name__ == "__main__": | 342 | if __name__ == "__main__": |
334 | try: | 343 | try: | ... | ... |
scripts/pareto_visualizer_json.png
0 → 100644
40.4 KB
scripts/pareto_visualizer_json.py
0 → 100644
This diff is collapsed. Click to expand it.
scripts/pareto_visualizer_json_1.png
0 → 100644
34.9 KB
56.4 KB
70.5 KB
52.5 KB
scripts/selecting_id.cpp
0 → 100644
1 | +#include <iostream> | ||
2 | +#include <sstream> | ||
3 | +#include <fstream> | ||
4 | +#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp" | ||
5 | +#include <typeinfo> | ||
6 | +#include <set> | ||
7 | +#include <algorithm> | ||
8 | +#include <cstdio> | ||
9 | +#include <vector> | ||
10 | + | ||
11 | +using namespace std; | ||
12 | +using json = nlohmann::json; | ||
13 | + | ||
14 | +/* | ||
15 | +That script will remove from the library all the pattern that match ONLY with the sequence from which it comes from (with the same pdb). | ||
16 | +*/ | ||
17 | + | ||
18 | +//To store the pdb and the sequence in the benchmark file. Also stor the corresponding motif id and components based on this sequence. | ||
19 | +struct data { | ||
20 | + //the pdb code (in the name of the sequence) | ||
21 | + string pdb; | ||
22 | + //the complete sequence with this pdb code | ||
23 | + string seq_pdb; | ||
24 | + //the id of the motif corresponding to this pdb in the library | ||
25 | + string id; | ||
26 | + //the module sequence with the components of this motif with the above id | ||
27 | + string cmp; | ||
28 | +}; | ||
29 | +typedef struct data data; | ||
30 | + | ||
31 | +//returns the list of pdb codes and the corresponding information from the benchmark file. | ||
32 | +vector<data> get_list_pdb_benchmark(const string& benchmark) { | ||
33 | + | ||
34 | + fstream bm(benchmark); | ||
35 | + vector<data> list_pdb_seq; | ||
36 | + if (bm.is_open()) { | ||
37 | + string name; | ||
38 | + string sequence; | ||
39 | + string structure; | ||
40 | + string contacts; | ||
41 | + | ||
42 | + while (getline(bm, name)) { | ||
43 | + data d; | ||
44 | + int size = name.size(); | ||
45 | + name = name.substr(5,size-6); | ||
46 | + getline(bm, sequence); | ||
47 | + d.pdb = name; | ||
48 | + d.seq_pdb = sequence; | ||
49 | + list_pdb_seq.push_back(d); | ||
50 | + | ||
51 | + getline(bm, structure); | ||
52 | + getline(bm, contacts); | ||
53 | + } | ||
54 | + bm.close(); | ||
55 | + } | ||
56 | + return list_pdb_seq; | ||
57 | +} | ||
58 | + | ||
59 | +string trim(string str) { | ||
60 | + int size = str.size(); | ||
61 | + str = str.substr(1, size-2); | ||
62 | + return str; | ||
63 | +} | ||
64 | + | ||
65 | +//store the corresponding id and motif to the sequence from the benchmark file | ||
66 | +data find_id_pattern(string& pdb_pattern, const string& benchmark) { | ||
67 | + vector<data> l = get_list_pdb_benchmark(benchmark); | ||
68 | + int size = l.size(); | ||
69 | + | ||
70 | + for (data d : l) { | ||
71 | + string cmp = d.pdb; | ||
72 | + cmp = cmp.substr(0, d.pdb.size()-2); | ||
73 | + if (!cmp.compare(pdb_pattern)) { | ||
74 | + return d; | ||
75 | + } | ||
76 | + } | ||
77 | + return data(); | ||
78 | +} | ||
79 | + | ||
80 | +//Create an array of data ('association'), which consists of each pdb of the benchmark file | ||
81 | +// with the associated pattern from this sequence. | ||
82 | +vector<data> find_id(const string& bibli, const string& benchmark) { | ||
83 | + ifstream lib(bibli); | ||
84 | + json js = json::parse(lib); | ||
85 | + | ||
86 | + //nam seq_bm et id seq_id | ||
87 | + vector<data> association; | ||
88 | + | ||
89 | + for (auto it = js.begin(); it != js.end(); ++it) { | ||
90 | + string id = it.key(); | ||
91 | + data d; | ||
92 | + | ||
93 | + for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) { | ||
94 | + string field = it2.key(); | ||
95 | + string seq; | ||
96 | + if (!field.compare("pdb")) { | ||
97 | + int n = js[id][field].size(); | ||
98 | + for (int i = 0; i < n ; i++) { | ||
99 | + ostringstream stream; | ||
100 | + stream << js[id][field][i]; | ||
101 | + string pdb = trim(stream.str()); | ||
102 | + | ||
103 | + d = find_id_pattern(pdb, benchmark); | ||
104 | + } | ||
105 | + } | ||
106 | + | ||
107 | + if (!field.compare("sequence")) { | ||
108 | + seq = it2.value(); | ||
109 | + | ||
110 | + if (!(d.pdb.empty())) { | ||
111 | + d.id = id; | ||
112 | + d.cmp = seq; | ||
113 | + association.push_back(d); | ||
114 | + } | ||
115 | + } | ||
116 | + } | ||
117 | + } | ||
118 | + lib.close(); | ||
119 | + cout << association.size() << endl; | ||
120 | + return association; | ||
121 | +} | ||
122 | + | ||
123 | +//check if the motif is found matching with a complete sequence from a benchmark file. | ||
124 | +bool does_it_match(const string& seq, const string& seq_motif) { | ||
125 | + size_t found = seq_motif.find("&"); | ||
126 | + size_t size = seq_motif.size(); | ||
127 | + vector<string> list_cmp; | ||
128 | + if (found != std::string::npos) { | ||
129 | + int count = 1; | ||
130 | + | ||
131 | + string cmp = seq_motif.substr(0, found); | ||
132 | + list_cmp.push_back(cmp); | ||
133 | + while(found != std::string::npos) { | ||
134 | + size_t begin = found; | ||
135 | + found = seq_motif.find("&", found + 1); | ||
136 | + cmp = seq_motif.substr(begin+1, found-begin-1); | ||
137 | + list_cmp.push_back(cmp); | ||
138 | + count++; | ||
139 | + } | ||
140 | + | ||
141 | + found = seq.find(list_cmp[0]); | ||
142 | + int count2 = 1; | ||
143 | + while((found != std::string::npos) && (count2 < count)) { | ||
144 | + size_t begin = found; | ||
145 | + found = seq.find(list_cmp[count2], found + 1); | ||
146 | + count2++; | ||
147 | + } | ||
148 | + | ||
149 | + if(count == count2) { | ||
150 | + return true; | ||
151 | + } | ||
152 | + | ||
153 | + } else { | ||
154 | + found = seq.find(seq_motif); | ||
155 | + if (found != std::string::npos) { | ||
156 | + return true; | ||
157 | + } | ||
158 | + } | ||
159 | + return false; | ||
160 | +} | ||
161 | + | ||
162 | +//return the list of motif id that didn't match with any other complete sequence than the one which it came from. | ||
163 | +vector<string> select_not_motif(const string& bibli, const string& benchmark) { | ||
164 | + vector<string> selection; | ||
165 | + vector<data> association = find_id(bibli, benchmark); | ||
166 | + | ||
167 | + for (data d : association) { | ||
168 | + selection.push_back(d.id); | ||
169 | + } | ||
170 | + | ||
171 | + for (data d : association) { | ||
172 | + for (data d2 : association) { | ||
173 | + string seq = d.seq_pdb; | ||
174 | + string seq2 = d2.cmp; | ||
175 | + bool test = false; | ||
176 | + | ||
177 | + if(d.pdb.substr(0, d.pdb.size()-2) != d2.pdb.substr(0, d2.pdb.size()-2)) { | ||
178 | + test = does_it_match(seq, seq2); | ||
179 | + if (test) { | ||
180 | + cout << "pdb: " << d.pdb << " vs " << d2.pdb << " " << d2.cmp << " " << d2.id << endl; | ||
181 | + auto position = find(selection.begin(), selection.end(), d.id); | ||
182 | + if (position != selection.end()) { | ||
183 | + int index = position - selection.begin(); | ||
184 | + selection.erase(selection.begin() + index); | ||
185 | + } | ||
186 | + } | ||
187 | + } | ||
188 | + } | ||
189 | + } | ||
190 | + sort(selection.begin(), selection.end() ); | ||
191 | + selection.erase(unique(selection.begin(), selection.end() ), selection.end() ); | ||
192 | + | ||
193 | + cout << "size: " << selection.size() << endl; | ||
194 | + | ||
195 | + return selection; | ||
196 | +} | ||
197 | + | ||
198 | +int main() | ||
199 | +{ | ||
200 | + string bibli = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json"; | ||
201 | + string benchmark = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/benchmark.dbn"; | ||
202 | + | ||
203 | + /*vector<data> v = get_list_pdb_benchmark(benchmark); | ||
204 | + for (data d : v) { | ||
205 | + cout << d.pdb << ", " << d.seq_pdb << endl; | ||
206 | + }*/ | ||
207 | + | ||
208 | + /*string name = "1U6P_B"; | ||
209 | + data d = find_id_pattern(name, benchmark); | ||
210 | + cout << "name: " << d.pdb << ", seq: " << d.seq_pdb << endl;*/ | ||
211 | + | ||
212 | + /*vector<data> association = find_id(bibli, benchmark); | ||
213 | + for (data d : association) { | ||
214 | + cout << "<" << d.pdb << ", " << d.seq_pdb << ">, " << "<" << d.id << ", " << d.cmp << ">" << endl; | ||
215 | + }*/ | ||
216 | + | ||
217 | + /*string seq = "UGCGCUUGGCGUUUUAGAGCUAGAAAUAGCAAGUUAAAAUAAGGCUAGUCCGUUAUCAACUUGAAAAAGUGGCACCGAGUCGGUGCUU"; | ||
218 | + string seq_motif = "UGCGCUUGGCGUUUUAGAGC&GCAAGUUAAAAUAAGGCUAGUCCGUUAUCAA&UGGCACCGAGUCG&U"; | ||
219 | + bool test = does_it_match(seq, seq_motif); | ||
220 | + cout << test << endl;*/ | ||
221 | + | ||
222 | + vector<string> selection = select_not_motif(bibli, benchmark); | ||
223 | + for (string str : selection) { | ||
224 | + cout << str << ", "; | ||
225 | + } | ||
226 | + cout << endl; | ||
227 | + | ||
228 | + return 0; | ||
229 | +} | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
scripts/stats.py
0 → 100644
This diff is collapsed. Click to expand it.
scripts/temp/test.fa
0 → 100644
-
Please register or login to post a comment