Repository cleaning

Louis BECQUEY
Commit d73e4a4c81b1bdd987727835d3a62c3318136565 d73e4a4c 1 parent 62b200be
Showing 27 changed files with 7 additions and 499 deletions
.dockerignore
.gitignore
benchmark.py
data/modules/ISAURE/Readme.md
data/modules/ISAURE/benchmark.dbn
data/modules/ISAURE/benchmark.json
data/modules/ISAURE/benchmark.txt
data/modules/ISAURE/benchmark_16-07-2021.json
data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json
data/modules/ISAURE/motifs_01-06-2021.json
data/modules/ISAURE/motifs_06-06-2021.json
data/modules/ISAURE/motifs_28-05-2021.json
data/modules/ISAURE/motifs_final.json
rna1999.dG
scripts/add_delimiter.cpp
scripts/benchmark.py
scripts/count_pattern.cpp
scripts/create_files.cpp
scripts/delete_same_pdb.cpp
scripts/pareto_visualizer_json.png
--- a/.dockerignore
View file @d73e4a4
+++ b/.dockerignore
View file @d73e4a4
 results_*
+ results/
 build_BiORSEO_docker_image_ubuntu18.sh
 deploy_BiORSEO_docker_image_linux.sh
 INSTALL.md
 Readme.md
 benchmark_results/
- doc/
+ *.gz
+ *.pickle
+ log_of_the_run.sh
\ No newline at end of file
--- a/.gitignore
View file @d73e4a4
+++ b/.gitignore
View file @d73e4a4
@@ -3,9 +3,6 @@
 # Docker installation temporary files
 eigen-eigen-323c052e1731
 cplex_installer_12.8_Student.bin
- BayesPairing/
- BayesPairing2/
- ViennaRNA-2.4.13
 
 # Compiled Object files
 obj/*
@@ -22,8 +19,9 @@ log_of_the_run.sh
 logBadDesc.txt
 gurobi.log
 temp/*
- biorseo_results/*
 nohup.out
+ *.gz
+ *.pickle
 
 # data 
 data/modules/BGSU
@@ -32,4 +30,4 @@ data/modules/RIN
 data/modules/ISAURE
 data/sec_structs/bpRNA-1m_90.dbn
 data/sec_structs/pseudobase++.dbn
- data/fasta/contacts
+ data/fasta/contacts/
\ No newline at end of file
--- a/benchmark.py
View file @d73e4a4
+++ b/benchmark.py
View file @d73e4a4
--- a/data/modules/ISAURE/Readme.md deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/Readme.md deleted 100644 → 0
View file @62b200b
- The motif library used with --contacts is particular. It was provided by Isaure Chauvot de Beauchêne from the LORIA 
- laboratory. These motifs are made up of RNA fragments linked to proteins.
- ==================================================================================================================
- 
- Several versions of these motifs have been provided, but the most complete is the latest: 'motifs_06-06-2021.json'.
- The current scripts were written for this file, and do not work with the older libraries.
- 
- There are also 2 benchmark files, also in json format: 'benchmark_16-06-2021.json' and 'benchmark_16-07-2021.json'.
- They contain complete RNA sequences that bind to a protein; the first one contains only 33 RNAs, and the second one
- contains 130 RNAs.
- 
- The benchmark.dbn and benchmark.txt were created based on the 'benchmark_16-07-2021.json'. 
- They are mostly used for the Isaure_benchmark.py script and scripts from the 'scripts' directory.
- 
- The motifs_final.json file is obtained by running the count_pattern.cpp script from the 'scripts' directory on
- the 'motifs_06-06-2021.json' motifs file.
- This script counts the number of "occurrences" of each motif. We consider that if the sequence of motif A
- is included in motif B, then for each inclusion of B we also have an inclusion of A. And vice versa.
- 
- The motif library used by BiORSEO is the one in the 'bibliotheque_a_lire' directory. It should contain only
- the json file we want BiORSEO to use for its prediction. That's why you shouldn't put any other type of file there!
- 
- 
- 
- 
- 
- 
--- a/data/modules/ISAURE/benchmark.dbn deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/benchmark.dbn deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/benchmark.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/benchmark.json deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/benchmark.txt deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/benchmark.txt deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/benchmark_16-07-2021.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/benchmark_16-07-2021.json deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/motifs_01-06-2021.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/motifs_01-06-2021.json deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/motifs_06-06-2021.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/motifs_06-06-2021.json deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/motifs_28-05-2021.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/motifs_28-05-2021.json deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/motifs_final.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/motifs_final.json deleted 100644 → 0
View file @62b200b
--- a/rna1999.dG deleted 100644 → 0
View file @62b200b
+++ b/rna1999.dG deleted 100644 → 0
View file @62b200b
--- a/scripts/add_delimiter.cpp deleted 100644 → 0
View file @62b200b
+++ b/scripts/add_delimiter.cpp deleted 100644 → 0
View file @62b200b
- #include <iostream>
- #include <sstream>
- #include <fstream>
- #include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
- #include <typeinfo>
- #include <set>
- #include <algorithm>
- #include <cstdio>
- #include <vector>
- 
- using namespace std;
- using json = nlohmann::json;
- 
/// Counts the '&' characters in a string.
///
/// The parameter is taken by const reference so that const strings and
/// temporaries can be passed as well as mutable lvalues.
///
/// @param seq  String to scan (a motif 'sequence' or 'contacts' field).
/// @return     Number of '&' characters found in `seq`.
std::size_t count_delimiter(const std::string& seq) {
    return static_cast<std::size_t>(std::count(seq.begin(), seq.end(), '&'));
}
- 
- /*
- If there is a '&' in the motif sequence in the field 'sequence' but not in the field 'contacts', 
- th script put a '&' in the same position in the field 'contacts' than in the field 'sequence'.
- */
- void add_delimiter(const string& jsonfile, const string& jsonoutfile) {
-     std::ifstream lib(jsonfile);
-     
-     std::ofstream outfile (jsonoutfile);
-     json new_motif;
-     json new_id;
- 
-     json js = json::parse(lib);
-     
-     //the list of pfam lists of the motif we want to count the inclusion in other motif
-     for (auto it = js.begin(); it != js.end(); ++it) {
-         string id = it.key();
-         string test;
-         string sequence;
-         string contacts;
-         bool is_change = false;
- 
-         //cout << "id: " << id << endl;
-         for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {      
-             test = it2.key();
- 
-             if (!test.compare("sequence")) {
-                 //cout << "sequence: " << it2.value() << endl;
-                 sequence = it2.value();
-                 new_id[test] = it2.value();
-   
-             } else if (!test.compare("contacts") ) {
-                 contacts = it2.value();
-             } else {
-                 new_id[test] = it2.value();
-             }  
-         }
-         string tmp = "";
-         if (count_delimiter(contacts) != count_delimiter(sequence) && contacts.size() == sequence.size()) {
-             for (uint i = 0; i < sequence.size(); i++) {
-                 if (sequence.at(i) == '&') {
-                     tmp += "&";
-                 } else {
-                     tmp += contacts.at(i);
-                 }
-             }
-         } else {
-             tmp = contacts;
-         }
-         new_id["contacts"] = tmp;
-         new_motif[id] = new_id;
-         new_id.clear();
-     }
-     outfile << new_motif.dump(4) << endl;
-     outfile.close();
-     
- }
- 
- int main()
- {
-     string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json";
-     string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_tmp.json";
-     add_delimiter(jsonfile, out);
-     return 0;
- }
-     
--- a/scripts/benchmark.py deleted 100755 → 0
View file @62b200b
+++ b/scripts/benchmark.py deleted 100755 → 0
View file @62b200b
--- a/scripts/count_pattern.cpp deleted 100644 → 0
View file @62b200b
+++ b/scripts/count_pattern.cpp deleted 100644 → 0
View file @62b200b
--- a/scripts/create_files.cpp deleted 100644 → 0
View file @62b200b
+++ b/scripts/create_files.cpp deleted 100644 → 0
View file @62b200b
- #include <iostream>
- #include <sstream>
- #include <fstream>
- #include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
- #include <typeinfo>
- #include <set>
- #include <algorithm>
- #include <cstdio>
- #include <vector>
- 
- using namespace std;
- using json = nlohmann::json;
- 
- /*
- Create a .fasta file for each of the sequence inside the benchmark in json format.
- Also create a .dbn and .txt file that list the name, sequence, 2d structure and contacts for all sequence in the benchmark file.
- Those files are useful for the Isaure_benchmark.py script.
- */
- void create_files(const string& jsonmotifs) {
-     std::ifstream lib(jsonmotifs);
-     string fasta = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/fasta/";
-     string list = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.txt";
-     string dbn = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn";
-     std::ofstream outlist (list);
-     std::ofstream outdbn (dbn);
-     json js = json::parse(lib);
-     uint count = 0;
- 
-     for (auto it = js.begin(); it != js.end(); ++it) {    
-         string id = it.key();
-         string name, seq, contacts, structure;
-         for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {      
-             string chain = it2.key();
-             if (chain.compare("pfams") != 0) {
-                 string name = id + "_" + chain;
-                 string filename = fasta + name + ".fa";
-                 std::ofstream outfasta (filename);
-                 outfasta << ">test_" << name << endl;
-                 for (auto it3 = js[id][chain].begin(); it3 != js[id][chain].end(); ++it3) {     
-                     string field = it3.key();
-                     if (!field.compare("sequence")) {
-                         seq = it3.value();
-                         outfasta << seq.substr(0,seq.size()) << endl;
-                         outfasta.close();
- 
-                     } else if (!field.compare("contacts")) {
-                         contacts = it3.value();
- 
-                     } else if (!field.compare("struct2d")) {
-                         structure = it3.value();
-                     }
-                 }
-                 if(seq.find('&') == string::npos) {
-                     outlist << ">test_" << name << endl;
-                     outdbn << "test_" << name << "." << endl;
-                     outlist << contacts << endl;
-                     outdbn << seq << endl;
-                     outdbn << structure << endl;
-                     outdbn << contacts << endl;
-                     outlist << seq << endl;
-                     outlist << structure << endl;      
-                     count++;       
-                 }
-             }
-         }
-     }
-     cout << count << " sequences en tout" << endl;
-     lib.close();
-     outlist.close();
-     outdbn.close();
- }
- 
- int main()
- {
-     string path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/";
-     string jsonbm = path + "modules/ISAURE/benchmark_16-07-2021.json"; 
-     create_files(jsonbm);
- 
-     return 0;
- }
-     
--- a/scripts/delete_same_pdb.cpp deleted 100644 → 0
View file @62b200b
+++ b/scripts/delete_same_pdb.cpp deleted 100644 → 0
View file @62b200b
- #include <iostream>
- #include <sstream>
- #include <fstream>
- #include "/local/local/BiorseoNath/cppsrc/json.hpp"
- #include <typeinfo>
- #include <set>
- #include <algorithm>
- #include <cstdio>
- #include <vector>
- #include <string>
- 
- using namespace std;
- using json = nlohmann::json;
- 
- /*
- This script is use to create a new motif library without a motif that contains the same pdb as the sequence used in input for prediction
- with BiORSEO.
- */
- void delete_redundant_pdb(const string& jsonlibrary, const string& name, const string& jsonoutfile) {
-     std::ifstream lib(jsonlibrary);
-     
-     std::ofstream outfile (jsonoutfile);
-     json new_motif;
-     json new_id;
-     json js = json::parse(lib);
-     
-     for (auto it = js.begin(); it != js.end(); ++it) {
-         string id = it.key();
-         vector<string> list_pdbs;
-         bool is_added = true;
- 
-         for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {      
-             string field = it2.key();   
- 
-             if (!field.compare("pdb")) {
-                 vector<string> tab = it2.value();
-                 list_pdbs = tab;
-             } else {
-                 new_id[field] = it2.value();
-             }  
-         }
- 
-         if (count(list_pdbs.begin(), list_pdbs.end(), name.substr(0, name.size()-2))) {
-             is_added = false;
-         }
-         if (is_added) {      
-             new_id["pdb"] = list_pdbs;     
-             new_motif[id] = new_id;
-         }
-         new_id.clear();
-     }
-     outfile << new_motif.dump(4) << endl;
-     outfile.close(); 
- }
- 
- int main(int argc, char** argv)
- {
-     string jsonlibrary = "/local/local/BiorseoNath/data/modules/ISAURE/motifs_final.json";
-     string out = "/local/local/BiorseoNath/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json";
-     string name = argv[1];
-     delete_redundant_pdb(jsonlibrary, name, out);
-     return 0;
- }
-     
--- a/scripts/pareto_visualizer_json.png deleted 100644 → 0
View file @62b200b
+++ b/scripts/pareto_visualizer_json.png deleted 100644 → 0
View file @62b200b
--- a/scripts/pareto_visualizer_json_1.png deleted 100644 → 0
View file @62b200b
+++ b/scripts/pareto_visualizer_json_1.png deleted 100644 → 0
View file @62b200b
--- a/scripts/pareto_visualizer_json_MEA_functionE.png deleted 100644 → 0
View file @62b200b
+++ b/scripts/pareto_visualizer_json_MEA_functionE.png deleted 100644 → 0
View file @62b200b
--- a/scripts/pareto_visualizer_json_MFE_MEA_functionE.png deleted 100644 → 0
View file @62b200b
+++ b/scripts/pareto_visualizer_json_MFE_MEA_functionE.png deleted 100644 → 0
View file @62b200b
--- a/scripts/pareto_visualizer_json_MFE_functionE.png deleted 100644 → 0
View file @62b200b
+++ b/scripts/pareto_visualizer_json_MFE_functionE.png deleted 100644 → 0
View file @62b200b
--- a/scripts/selecting_id.cpp deleted 100644 → 0
View file @62b200b
+++ b/scripts/selecting_id.cpp deleted 100644 → 0
View file @62b200b
- #include <iostream>
- #include <sstream>
- #include <fstream>
- #include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
- #include <typeinfo>
- #include <set>
- #include <algorithm>
- #include <cstdio>
- #include <vector>
- 
- using namespace std;
- using json = nlohmann::json;
- 
- /*
- This script removes from the library all the patterns that match ONLY the sequence they come from (with the same pdb).
- */
- 
// One benchmark sequence together with the library motif associated with it.
// No `typedef struct data data;` is needed: in C++ the struct name is already
// a type name.
// NOTE(review): this file has `using namespace std;`; when built as C++17 or
// later the unqualified name `data` may clash with std::data -- confirm the
// language standard used by the build.
struct data {
    // the pdb code (taken from the name of the sequence)
    std::string pdb;
    // the complete sequence with this pdb code
    std::string seq_pdb;
    // the id of the motif corresponding to this pdb in the library
    std::string id;
    // the sequence of the motif with the above id, components included
    std::string cmp;
};
- 
- //returns the list of pdb codes and the corresponding information from the benchmark file.
- vector<data> get_list_pdb_benchmark(const string& benchmark) {
- 
-     fstream bm(benchmark);
-     vector<data> list_pdb_seq;
-     if (bm.is_open()) {
-         string name;
-         string sequence;
-         string structure;
-         string contacts;
- 
-         while (getline(bm, name)) {
-             data d;
-             int size = name.size();
-             name = name.substr(5,size-6); 
-             getline(bm, sequence);
-             d.pdb = name;
-             d.seq_pdb = sequence;
-             list_pdb_seq.push_back(d);
- 
-             getline(bm, structure);
-             getline(bm, contacts);
-         }
-         bm.close();
-     }
-     return list_pdb_seq;
- }
- 
// Returns `str` without its first and last characters (used to strip the
// surrounding quotes of a dumped json string).
//
// Strings shorter than two characters yield an empty string; in particular an
// empty input no longer makes substr throw std::out_of_range.
std::string trim(std::string str) {
    if (str.size() < 2) {
        return std::string();
    }
    return str.substr(1, str.size() - 2);
}
- 
- //store the corresponding id and motif to the sequence from the benchmark file
- data find_id_pattern(string& pdb_pattern, const string& benchmark) {
-     vector<data> l = get_list_pdb_benchmark(benchmark);
-     int size = l.size();
- 
-     for (data d : l) {
-         string cmp = d.pdb;
-         cmp = cmp.substr(0, d.pdb.size()-2);
-         if (!cmp.compare(pdb_pattern)) {
-             return d;
-         }
-     }
-     return data();
- }
- 
- //Create an array of data ('association'), which consists of each pdb of the benchmark file
- // with the associated pattern from this sequence.
- vector<data> find_id(const string& bibli, const string& benchmark) {
-     ifstream lib(bibli);
-     json js = json::parse(lib);
- 
-     //nam seq_bm et id seq_id
-     vector<data> association;
-     
-     for (auto it = js.begin(); it != js.end(); ++it) {  
-         string id = it.key();
-         data d;
- 
-         for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) { 
-             string field = it2.key();
-             string seq;
-             if (!field.compare("pdb")) {
-                 int n = js[id][field].size();
-                 for (int i = 0; i < n ; i++) {
-                     ostringstream stream;
-                     stream << js[id][field][i];
-                     string pdb = trim(stream.str());
-                     
-                     d = find_id_pattern(pdb, benchmark);
-                 }
-             }
- 
-             if (!field.compare("sequence")) {
-                 seq = it2.value();
- 
-                 if (!(d.pdb.empty())) {                    
-                     d.id = id;
-                     d.cmp = seq;
-                     association.push_back(d);
-                 }
-             }
-         }
-     }
-     lib.close();
-     cout << association.size() << endl;
-     return association;
- }
- 
// Checks whether a motif is found in a complete sequence.
//
// The motif is split on '&' into components. The motif matches when every
// component is found in `seq`, in order: the first one anywhere, each
// following one starting at least one character after the start of the
// previous hit. A motif without '&' is a plain substring search.
//
// Unlike the previous version, the LAST component must be found too: the old
// loop advanced its counter even when the final search failed, so a motif was
// reported as matching as soon as all components but the last were found.
//
// NOTE(review): since the search resumes one character after the START of the
// previous hit, consecutive components may overlap in `seq`; this is kept as
// it was -- confirm whether components should be disjoint.
//
// seq:       complete sequence to search in.
// seq_motif: motif sequence, with components separated by '&'.
// Returns true when the motif matches, false otherwise.
bool does_it_match(const std::string& seq, const std::string& seq_motif) {
    // Split the motif on '&' (a motif without '&' gives a single component).
    std::vector<std::string> components;
    std::size_t start = 0;
    for (;;) {
        const std::size_t delimiter = seq_motif.find('&', start);
        if (delimiter == std::string::npos) {
            components.push_back(seq_motif.substr(start));
            break;
        }
        components.push_back(seq_motif.substr(start, delimiter - start));
        start = delimiter + 1;
    }

    // Look for the components one after the other, left to right.
    std::size_t found = seq.find(components[0]);
    for (std::size_t i = 1; i < components.size() && found != std::string::npos; ++i) {
        found = seq.find(components[i], found + 1);
    }
    return found != std::string::npos;
}
- 
- //return the list of motif id that didn't match with any other complete sequence than the one which it came from.
- vector<string> select_not_motif(const string& bibli, const string& benchmark) {
-     vector<string> selection;
-     vector<data> association = find_id(bibli, benchmark);
- 
-     for (data d : association) {
-         selection.push_back(d.id);
-     }
- 
-     for (data d : association) {
-         for (data d2 : association) {
-             string seq = d.seq_pdb;
-             string seq2 = d2.cmp;
-             bool test = false;
- 
-             if(d.pdb.substr(0, d.pdb.size()-2) != d2.pdb.substr(0, d2.pdb.size()-2)) {
-                 test = does_it_match(seq, seq2);
-                 if (test) {
-                     cout << "pdb: " << d.pdb << " vs " << d2.pdb << " " << d2.cmp << " " << d2.id << endl;
-                     auto position = find(selection.begin(), selection.end(), d.id);
-                     if (position != selection.end()) {
-                         int index = position - selection.begin();
-                         selection.erase(selection.begin() + index);
-                     }
-                 }
-             }
-         }
-     }
-     sort(selection.begin(), selection.end() );
-     selection.erase(unique(selection.begin(), selection.end() ), selection.end() );
- 
-     cout << "size: " << selection.size() << endl;
- 
-     return selection;
- }
- 
- int main()
- {
-     string bibli = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json";
-     string benchmark = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/benchmark.dbn";
- 
-     /*vector<data> v = get_list_pdb_benchmark(benchmark);
-     for (data d : v) {
-         cout << d.pdb << ", " << d.seq_pdb << endl;
-     }*/
- 
-     /*string name = "1U6P_B";
-     data d = find_id_pattern(name, benchmark);
-     cout << "name: " << d.pdb << ", seq: " << d.seq_pdb << endl;*/
- 
-     /*vector<data> association = find_id(bibli, benchmark);
-     for (data d : association) {
-         cout << "<" << d.pdb << ", " << d.seq_pdb << ">, " << "<" << d.id << ", " << d.cmp << ">" << endl;
-     }*/
- 
-     /*string seq = "UGCGCUUGGCGUUUUAGAGCUAGAAAUAGCAAGUUAAAAUAAGGCUAGUCCGUUAUCAACUUGAAAAAGUGGCACCGAGUCGGUGCUU";
-     string seq_motif = "UGCGCUUGGCGUUUUAGAGC&GCAAGUUAAAAUAAGGCUAGUCCGUUAUCAA&UGGCACCGAGUCG&U";
-     bool test = does_it_match(seq, seq_motif);
-     cout << test << endl;*/
- 
-     vector<string> selection = select_not_motif(bibli, benchmark);
-     for (string str : selection) {
-         cout << str << ", ";
-     }
-     cout << endl;
- 
-     return 0;
- }
\ No newline at end of file
--- a/scripts/stats.py deleted 100644 → 0
View file @62b200b
+++ b/scripts/stats.py deleted 100644 → 0
View file @62b200b
--- a/scripts/temp/test.fa deleted 100644 → 0
View file @62b200b
+++ b/scripts/temp/test.fa deleted 100644 → 0
View file @62b200b
- >test
- CCGGGACCUCUAACCGGGUUCCCGGGCAGUCACUG