Louis BECQUEY

Repository cleaning

1 results_* 1 results_*
2 +results/
2 build_BiORSEO_docker_image_ubuntu18.sh 3 build_BiORSEO_docker_image_ubuntu18.sh
3 deploy_BiORSEO_docker_image_linux.sh 4 deploy_BiORSEO_docker_image_linux.sh
4 INSTALL.md 5 INSTALL.md
5 Readme.md 6 Readme.md
6 benchmark_results/ 7 benchmark_results/
7 -doc/ 8 +*.gz
9 +*.pickle
10 +log_of_the_run.sh
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -3,9 +3,6 @@ ...@@ -3,9 +3,6 @@
3 # Docker installation temporary files 3 # Docker installation temporary files
4 eigen-eigen-323c052e1731 4 eigen-eigen-323c052e1731
5 cplex_installer_12.8_Student.bin 5 cplex_installer_12.8_Student.bin
6 -BayesPairing/
7 -BayesPairing2/
8 -ViennaRNA-2.4.13
9 6
10 # Compiled Object files 7 # Compiled Object files
11 obj/* 8 obj/*
...@@ -22,8 +19,9 @@ log_of_the_run.sh ...@@ -22,8 +19,9 @@ log_of_the_run.sh
22 logBadDesc.txt 19 logBadDesc.txt
23 gurobi.log 20 gurobi.log
24 temp/* 21 temp/*
25 -biorseo_results/*
26 nohup.out 22 nohup.out
23 +*.gz
24 +*.pickle
27 25
28 # data 26 # data
29 data/modules/BGSU 27 data/modules/BGSU
...@@ -32,4 +30,4 @@ data/modules/RIN ...@@ -32,4 +30,4 @@ data/modules/RIN
32 data/modules/ISAURE 30 data/modules/ISAURE
33 data/sec_structs/bpRNA-1m_90.dbn 31 data/sec_structs/bpRNA-1m_90.dbn
34 data/sec_structs/pseudobase++.dbn 32 data/sec_structs/pseudobase++.dbn
35 -data/fasta/contacts 33 +data/fasta/contacts/
...\ No newline at end of file ...\ No newline at end of file
......
This diff is collapsed. Click to expand it.
1 -The motif library used with --contacts is a special case. It was provided by Isaure Chauvot de Beauchêne from the LORIA
2 -laboratory. Its motifs are made up of RNA fragments linked to proteins.
3 -==================================================================================================================
4 -
5 -Several versions of these motifs have been provided, but the most complete is the latest: 'motifs_06-06-2021.json'.
6 -The current scripts were written for this file and do not work with the older libraries.
7 -
8 -There are also two benchmark files, also in JSON format: 'benchmark_16-06-2021.json' and 'benchmark_16-07-2021.json'.
9 -They contain complete RNA sequences that bind to a protein; the first one contains only 33 RNAs, and the second one
10 -contains 130 RNAs.
11 -
12 -The benchmark.dbn and benchmark.txt files were created from 'benchmark_16-07-2021.json'.
13 -They are mostly used by the Isaure_benchmark.py script and the scripts in the 'scripts' directory.
14 -
15 -The motifs_final.json file is obtained by running the count_pattern.cpp script from the 'scripts' directory on
16 -the 'motifs_06-06-2021.json' motifs file.
17 -This script counts the number of "occurrences" of each motif. We consider that if the sequence of motif A
18 -is included in motif B, then each inclusion of B also counts as an inclusion of A, and vice versa.
19 -
20 -The motif library used by BiORSEO is the one in the 'bibliotheque_a_lire' directory. That directory should contain only
21 -the JSON file that BiORSEO is meant to use for its predictions, so do not put any other type of file in it!
22 -
23 -
24 -
25 -
26 -
27 -
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff is collapsed. Click to expand it.
1 -#include <iostream>
2 -#include <sstream>
3 -#include <fstream>
4 -#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
5 -#include <typeinfo>
6 -#include <set>
7 -#include <algorithm>
8 -#include <cstdio>
9 -#include <vector>
10 -
11 -using namespace std;
12 -using json = nlohmann::json;
13 -
14 -//Count the number of '&' in the motif sequence
15 -size_t count_delimiter(string& seq) {
16 - size_t count = 0;
17 - for(uint i = 0; i < seq.size(); i++) {
18 - char c = seq.at(i);
19 - if (c == '&') {
20 - count++;
21 - }
22 - }
23 - return count;
24 -}
25 -
26 -/*
27 -If there is a '&' in the motif sequence in the field 'sequence' but not in the field 'contacts',
28 -the script puts a '&' at the same position in the field 'contacts' as in the field 'sequence'.
29 -*/
30 -void add_delimiter(const string& jsonfile, const string& jsonoutfile) {
31 - std::ifstream lib(jsonfile);
32 -
33 - std::ofstream outfile (jsonoutfile);
34 - json new_motif;
35 - json new_id;
36 -
37 - json js = json::parse(lib);
38 -
39 - //the list of pfam lists of the motif we want to count the inclusion in other motif
40 - for (auto it = js.begin(); it != js.end(); ++it) {
41 - string id = it.key();
42 - string test;
43 - string sequence;
44 - string contacts;
45 - bool is_change = false;
46 -
47 - //cout << "id: " << id << endl;
48 - for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
49 - test = it2.key();
50 -
51 - if (!test.compare("sequence")) {
52 - //cout << "sequence: " << it2.value() << endl;
53 - sequence = it2.value();
54 - new_id[test] = it2.value();
55 -
56 - } else if (!test.compare("contacts") ) {
57 - contacts = it2.value();
58 - } else {
59 - new_id[test] = it2.value();
60 - }
61 - }
62 - string tmp = "";
63 - if (count_delimiter(contacts) != count_delimiter(sequence) && contacts.size() == sequence.size()) {
64 - for (uint i = 0; i < sequence.size(); i++) {
65 - if (sequence.at(i) == '&') {
66 - tmp += "&";
67 - } else {
68 - tmp += contacts.at(i);
69 - }
70 - }
71 - } else {
72 - tmp = contacts;
73 - }
74 - new_id["contacts"] = tmp;
75 - new_motif[id] = new_id;
76 - new_id.clear();
77 - }
78 - outfile << new_motif.dump(4) << endl;
79 - outfile.close();
80 -
81 -}
82 -
83 -int main()
84 -{
85 - string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json";
86 - string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_tmp.json";
87 - add_delimiter(jsonfile, out);
88 - return 0;
89 -}
90 -
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
1 -#include <iostream>
2 -#include <sstream>
3 -#include <fstream>
4 -#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
5 -#include <typeinfo>
6 -#include <set>
7 -#include <algorithm>
8 -#include <cstdio>
9 -#include <vector>
10 -
11 -using namespace std;
12 -using json = nlohmann::json;
13 -
14 -/*
15 -Creates a .fasta file for each of the sequences inside the benchmark in JSON format.
16 -Also creates a .dbn and a .txt file that list the name, sequence, 2D structure and contacts for all sequences in the benchmark file.
17 -Those files are useful for the Isaure_benchmark.py script.
18 -*/
19 -void create_files(const string& jsonmotifs) {
20 - std::ifstream lib(jsonmotifs);
21 - string fasta = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/fasta/";
22 - string list = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.txt";
23 - string dbn = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn";
24 - std::ofstream outlist (list);
25 - std::ofstream outdbn (dbn);
26 - json js = json::parse(lib);
27 - uint count = 0;
28 -
29 - for (auto it = js.begin(); it != js.end(); ++it) {
30 - string id = it.key();
31 - string name, seq, contacts, structure;
32 - for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
33 - string chain = it2.key();
34 - if (chain.compare("pfams") != 0) {
35 - string name = id + "_" + chain;
36 - string filename = fasta + name + ".fa";
37 - std::ofstream outfasta (filename);
38 - outfasta << ">test_" << name << endl;
39 - for (auto it3 = js[id][chain].begin(); it3 != js[id][chain].end(); ++it3) {
40 - string field = it3.key();
41 - if (!field.compare("sequence")) {
42 - seq = it3.value();
43 - outfasta << seq.substr(0,seq.size()) << endl;
44 - outfasta.close();
45 -
46 - } else if (!field.compare("contacts")) {
47 - contacts = it3.value();
48 -
49 - } else if (!field.compare("struct2d")) {
50 - structure = it3.value();
51 - }
52 - }
53 - if(seq.find('&') == string::npos) {
54 - outlist << ">test_" << name << endl;
55 - outdbn << "test_" << name << "." << endl;
56 - outlist << contacts << endl;
57 - outdbn << seq << endl;
58 - outdbn << structure << endl;
59 - outdbn << contacts << endl;
60 - outlist << seq << endl;
61 - outlist << structure << endl;
62 - count++;
63 - }
64 - }
65 - }
66 - }
67 - cout << count << " sequences en tout" << endl;
68 - lib.close();
69 - outlist.close();
70 - outdbn.close();
71 -}
72 -
73 -int main()
74 -{
75 - string path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/";
76 - string jsonbm = path + "modules/ISAURE/benchmark_16-07-2021.json";
77 - create_files(jsonbm);
78 -
79 - return 0;
80 -}
81 -
1 -#include <iostream>
2 -#include <sstream>
3 -#include <fstream>
4 -#include "/local/local/BiorseoNath/cppsrc/json.hpp"
5 -#include <typeinfo>
6 -#include <set>
7 -#include <algorithm>
8 -#include <cstdio>
9 -#include <vector>
10 -#include <string>
11 -
12 -using namespace std;
13 -using json = nlohmann::json;
14 -
15 -/*
16 -This script is used to create a new motif library without the motifs that contain the same pdb as the sequence used as input for prediction
17 -with BiORSEO.
18 -*/
19 -void delete_redundant_pdb(const string& jsonlibrary, const string& name, const string& jsonoutfile) {
20 - std::ifstream lib(jsonlibrary);
21 -
22 - std::ofstream outfile (jsonoutfile);
23 - json new_motif;
24 - json new_id;
25 - json js = json::parse(lib);
26 -
27 - for (auto it = js.begin(); it != js.end(); ++it) {
28 - string id = it.key();
29 - vector<string> list_pdbs;
30 - bool is_added = true;
31 -
32 - for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
33 - string field = it2.key();
34 -
35 - if (!field.compare("pdb")) {
36 - vector<string> tab = it2.value();
37 - list_pdbs = tab;
38 - } else {
39 - new_id[field] = it2.value();
40 - }
41 - }
42 -
43 - if (count(list_pdbs.begin(), list_pdbs.end(), name.substr(0, name.size()-2))) {
44 - is_added = false;
45 - }
46 - if (is_added) {
47 - new_id["pdb"] = list_pdbs;
48 - new_motif[id] = new_id;
49 - }
50 - new_id.clear();
51 - }
52 - outfile << new_motif.dump(4) << endl;
53 - outfile.close();
54 -}
55 -
56 -int main(int argc, char** argv)
57 -{
58 - string jsonlibrary = "/local/local/BiorseoNath/data/modules/ISAURE/motifs_final.json";
59 - string out = "/local/local/BiorseoNath/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json";
60 - string name = argv[1];
61 - delete_redundant_pdb(jsonlibrary, name, out);
62 - return 0;
63 -}
64 -
1 -#include <iostream>
2 -#include <sstream>
3 -#include <fstream>
4 -#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
5 -#include <typeinfo>
6 -#include <set>
7 -#include <algorithm>
8 -#include <cstdio>
9 -#include <vector>
10 -
11 -using namespace std;
12 -using json = nlohmann::json;
13 -
14 -/*
15 -This script will remove from the library all the patterns that match ONLY the sequence they come from (with the same pdb).
16 -*/
17 -
18 -//Stores the pdb and the sequence from the benchmark file. Also stores the corresponding motif id and components based on this sequence.
19 -struct data {
20 - //the pdb code (in the name of the sequence)
21 - string pdb;
22 - //the complete sequence with this pdb code
23 - string seq_pdb;
24 - //the id of the motif corresponding to this pdb in the library
25 - string id;
26 - //the module sequence with the components of this motif with the above id
27 - string cmp;
28 -};
29 -typedef struct data data;
30 -
31 -//returns the list of pdb codes and the corresponding information from the benchmark file.
32 -vector<data> get_list_pdb_benchmark(const string& benchmark) {
33 -
34 - fstream bm(benchmark);
35 - vector<data> list_pdb_seq;
36 - if (bm.is_open()) {
37 - string name;
38 - string sequence;
39 - string structure;
40 - string contacts;
41 -
42 - while (getline(bm, name)) {
43 - data d;
44 - int size = name.size();
45 - name = name.substr(5,size-6);
46 - getline(bm, sequence);
47 - d.pdb = name;
48 - d.seq_pdb = sequence;
49 - list_pdb_seq.push_back(d);
50 -
51 - getline(bm, structure);
52 - getline(bm, contacts);
53 - }
54 - bm.close();
55 - }
56 - return list_pdb_seq;
57 -}
58 -
59 -string trim(string str) {
60 - int size = str.size();
61 - str = str.substr(1, size-2);
62 - return str;
63 -}
64 -
65 -//store the corresponding id and motif to the sequence from the benchmark file
66 -data find_id_pattern(string& pdb_pattern, const string& benchmark) {
67 - vector<data> l = get_list_pdb_benchmark(benchmark);
68 - int size = l.size();
69 -
70 - for (data d : l) {
71 - string cmp = d.pdb;
72 - cmp = cmp.substr(0, d.pdb.size()-2);
73 - if (!cmp.compare(pdb_pattern)) {
74 - return d;
75 - }
76 - }
77 - return data();
78 -}
79 -
80 -//Create an array of data ('association'), which consists of each pdb of the benchmark file
81 -// with the associated pattern from this sequence.
82 -vector<data> find_id(const string& bibli, const string& benchmark) {
83 - ifstream lib(bibli);
84 - json js = json::parse(lib);
85 -
86 - //nam seq_bm et id seq_id
87 - vector<data> association;
88 -
89 - for (auto it = js.begin(); it != js.end(); ++it) {
90 - string id = it.key();
91 - data d;
92 -
93 - for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
94 - string field = it2.key();
95 - string seq;
96 - if (!field.compare("pdb")) {
97 - int n = js[id][field].size();
98 - for (int i = 0; i < n ; i++) {
99 - ostringstream stream;
100 - stream << js[id][field][i];
101 - string pdb = trim(stream.str());
102 -
103 - d = find_id_pattern(pdb, benchmark);
104 - }
105 - }
106 -
107 - if (!field.compare("sequence")) {
108 - seq = it2.value();
109 -
110 - if (!(d.pdb.empty())) {
111 - d.id = id;
112 - d.cmp = seq;
113 - association.push_back(d);
114 - }
115 - }
116 - }
117 - }
118 - lib.close();
119 - cout << association.size() << endl;
120 - return association;
121 -}
122 -
123 -//check if the motif is found matching with a complete sequence from a benchmark file.
124 -bool does_it_match(const string& seq, const string& seq_motif) {
125 - size_t found = seq_motif.find("&");
126 - size_t size = seq_motif.size();
127 - vector<string> list_cmp;
128 - if (found != std::string::npos) {
129 - int count = 1;
130 -
131 - string cmp = seq_motif.substr(0, found);
132 - list_cmp.push_back(cmp);
133 - while(found != std::string::npos) {
134 - size_t begin = found;
135 - found = seq_motif.find("&", found + 1);
136 - cmp = seq_motif.substr(begin+1, found-begin-1);
137 - list_cmp.push_back(cmp);
138 - count++;
139 - }
140 -
141 - found = seq.find(list_cmp[0]);
142 - int count2 = 1;
143 - while((found != std::string::npos) && (count2 < count)) {
144 - size_t begin = found;
145 - found = seq.find(list_cmp[count2], found + 1);
146 - count2++;
147 - }
148 -
149 - if(count == count2) {
150 - return true;
151 - }
152 -
153 - } else {
154 - found = seq.find(seq_motif);
155 - if (found != std::string::npos) {
156 - return true;
157 - }
158 - }
159 - return false;
160 -}
161 -
162 -//return the list of motif id that didn't match with any other complete sequence than the one which it came from.
163 -vector<string> select_not_motif(const string& bibli, const string& benchmark) {
164 - vector<string> selection;
165 - vector<data> association = find_id(bibli, benchmark);
166 -
167 - for (data d : association) {
168 - selection.push_back(d.id);
169 - }
170 -
171 - for (data d : association) {
172 - for (data d2 : association) {
173 - string seq = d.seq_pdb;
174 - string seq2 = d2.cmp;
175 - bool test = false;
176 -
177 - if(d.pdb.substr(0, d.pdb.size()-2) != d2.pdb.substr(0, d2.pdb.size()-2)) {
178 - test = does_it_match(seq, seq2);
179 - if (test) {
180 - cout << "pdb: " << d.pdb << " vs " << d2.pdb << " " << d2.cmp << " " << d2.id << endl;
181 - auto position = find(selection.begin(), selection.end(), d.id);
182 - if (position != selection.end()) {
183 - int index = position - selection.begin();
184 - selection.erase(selection.begin() + index);
185 - }
186 - }
187 - }
188 - }
189 - }
190 - sort(selection.begin(), selection.end() );
191 - selection.erase(unique(selection.begin(), selection.end() ), selection.end() );
192 -
193 - cout << "size: " << selection.size() << endl;
194 -
195 - return selection;
196 -}
197 -
198 -int main()
199 -{
200 - string bibli = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json";
201 - string benchmark = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/benchmark.dbn";
202 -
203 - /*vector<data> v = get_list_pdb_benchmark(benchmark);
204 - for (data d : v) {
205 - cout << d.pdb << ", " << d.seq_pdb << endl;
206 - }*/
207 -
208 - /*string name = "1U6P_B";
209 - data d = find_id_pattern(name, benchmark);
210 - cout << "name: " << d.pdb << ", seq: " << d.seq_pdb << endl;*/
211 -
212 - /*vector<data> association = find_id(bibli, benchmark);
213 - for (data d : association) {
214 - cout << "<" << d.pdb << ", " << d.seq_pdb << ">, " << "<" << d.id << ", " << d.cmp << ">" << endl;
215 - }*/
216 -
217 - /*string seq = "UGCGCUUGGCGUUUUAGAGCUAGAAAUAGCAAGUUAAAAUAAGGCUAGUCCGUUAUCAACUUGAAAAAGUGGCACCGAGUCGGUGCUU";
218 - string seq_motif = "UGCGCUUGGCGUUUUAGAGC&GCAAGUUAAAAUAAGGCUAGUCCGUUAUCAA&UGGCACCGAGUCG&U";
219 - bool test = does_it_match(seq, seq_motif);
220 - cout << test << endl;*/
221 -
222 - vector<string> selection = select_not_motif(bibli, benchmark);
223 - for (string str : selection) {
224 - cout << str << ", ";
225 - }
226 - cout << endl;
227 -
228 - return 0;
229 -}
...\ No newline at end of file ...\ No newline at end of file
This diff is collapsed. Click to expand it.
1 ->test
2 -CCGGGACCUCUAACCGGGUUCCCGGGCAGUCACUG