Repository cleaning

Louis BECQUEY
Commit d73e4a4c81b1bdd987727835d3a62c3318136565 d73e4a4c 1 parent 62b200be
Showing 27 changed files with 7 additions and 499 deletions
.dockerignore
.gitignore
benchmark.py
data/modules/ISAURE/Readme.md
data/modules/ISAURE/benchmark.dbn
data/modules/ISAURE/benchmark.json
data/modules/ISAURE/benchmark.txt
data/modules/ISAURE/benchmark_16-07-2021.json
data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json
data/modules/ISAURE/motifs_01-06-2021.json
data/modules/ISAURE/motifs_06-06-2021.json
data/modules/ISAURE/motifs_28-05-2021.json
data/modules/ISAURE/motifs_final.json
rna1999.dG
scripts/add_delimiter.cpp
scripts/benchmark.py
scripts/count_pattern.cpp
scripts/create_files.cpp
scripts/delete_same_pdb.cpp
scripts/pareto_visualizer_json.png
--- a/.dockerignore
View file @d73e4a4
+++ b/.dockerignore
View file @d73e4a4
 results_*
+results/
 build_BiORSEO_docker_image_ubuntu18.sh
 deploy_BiORSEO_docker_image_linux.sh
 INSTALL.md
 Readme.md
 benchmark_results/
-doc/
+*.gz
+*.pickle
+log_of_the_run.sh
\ No newline at end of file
--- a/.gitignore
View file @d73e4a4
+++ b/.gitignore
View file @d73e4a4
@@ -3,9 +3,6 @@
 # Docker installation temporary files
 eigen-eigen-323c052e1731
 cplex_installer_12.8_Student.bin
-BayesPairing/
-BayesPairing2/
-ViennaRNA-2.4.13
 # Compiled Object files
 obj/*
@@ -22,8 +19,9 @@ log_of_the_run.sh
 logBadDesc.txt
 gurobi.log
 temp/*
-biorseo_results/*
 nohup.out
+*.gz
+*.pickle
 # data 
 data/modules/BGSU
@@ -32,4 +30,4 @@ data/modules/RIN
 data/modules/ISAURE
 data/sec_structs/bpRNA-1m_90.dbn
 data/sec_structs/pseudobase++.dbn
-data/fasta/contacts
+data/fasta/contacts/
\ No newline at end of file
--- a/benchmark.py
View file @d73e4a4
+++ b/benchmark.py
View file @d73e4a4
--- a/data/modules/ISAURE/Readme.md deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/Readme.md deleted 100644 → 0
View file @62b200b
-The motif library used with --contacts is particular. It was provided by Isaure Chauvot de Beauchêne from the LORIA 
-laboratory. These motifs are made up of RNA fragments linked to proteins.
-==================================================================================================================
-
-Several versions of these motifs have been provided, but the most complete is the latest one: 'motifs_06-06-2021.json'.
-The current scripts were written for this file and do not work with the older libraries.
-
-There are also 2 benchmark files in JSON format: 'benchmark_16-06-2021.json' and 'benchmark_16-07-2021.json'.
-They contain complete RNA sequences that bind to a protein; the first one contains only 33 RNAs, and the second one
-contains 130 RNAs.
-
-The benchmark.dbn and benchmark.txt were created based on the 'benchmark_16-07-2021.json'. 
-They are mostly used for the Isaure_benchmark.py script and scripts from the 'scripts' directory.
-
-The 'motifs_final.json' file is obtained by running the count_pattern.cpp script from the 'scripts' directory on
-the 'motifs_06-06-2021.json' motifs file.
-This script counts the number of "occurrences" of each motif: we consider that if the sequence of motif A
-is included in motif B, then each inclusion of B also counts as an inclusion of A, and vice versa.
-
-The motif library used by BiORSEO is the one in the 'bibliotheque_a_lire' directory. It should contain only
-the JSON file that BiORSEO is meant to use for its predictions, so do not put any other type of file there!
-
-
-
-
-
-
--- a/data/modules/ISAURE/benchmark.dbn deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/benchmark.dbn deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/benchmark.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/benchmark.json deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/benchmark.txt deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/benchmark.txt deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/benchmark_16-07-2021.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/benchmark_16-07-2021.json deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/motifs_01-06-2021.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/motifs_01-06-2021.json deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/motifs_06-06-2021.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/motifs_06-06-2021.json deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/motifs_28-05-2021.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/motifs_28-05-2021.json deleted 100644 → 0
View file @62b200b
--- a/data/modules/ISAURE/motifs_final.json deleted 100644 → 0
View file @62b200b
+++ b/data/modules/ISAURE/motifs_final.json deleted 100644 → 0
View file @62b200b
--- a/rna1999.dG deleted 100644 → 0
View file @62b200b
+++ b/rna1999.dG deleted 100644 → 0
View file @62b200b
--- a/scripts/add_delimiter.cpp deleted 100644 → 0
View file @62b200b
+++ b/scripts/add_delimiter.cpp deleted 100644 → 0
View file @62b200b
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
-#include <typeinfo>
-#include <set>
-#include <algorithm>
-#include <cstdio>
-#include <vector>
-
-using namespace std;
-using json = nlohmann::json;
-
-// Counts how many '&' characters occur in the motif sequence `seq` (the string is only read here).
-size_t count_delimiter(string& seq) {
-    size_t count = 0;
-    for(uint i = 0; i < seq.size(); i++) { // NOTE(review): `uint` is not a standard C++ type; presumably a platform typedef
-        char c = seq.at(i);
-        if (c == '&') {
-            count++;
-        }
-    }
-    return count;
-}
-
-/*
-Copies the JSON motif library `jsonfile` into `jsonoutfile`, repairing the 'contacts' field of each motif.
-
-For every top-level entry, all fields are copied unchanged except 'contacts'. When 'contacts' and
-'sequence' have the same length but a different number of '&', 'contacts' is rebuilt so that it has a '&'
-at every position where 'sequence' has one (the character previously at that position is replaced).
-Otherwise 'contacts' is copied as is. The result is written with a 4-space indentation.
-*/
-void add_delimiter(const string& jsonfile, const string& jsonoutfile) {
-    std::ifstream lib(jsonfile);
-    
-    std::ofstream outfile (jsonoutfile);
-    json new_motif; // the whole output library
-    json new_id;    // the entry currently being rebuilt; cleared after each motif
-
-    json js = json::parse(lib);
-    
-    //visit every top-level entry (motif id) of the input library
-    for (auto it = js.begin(); it != js.end(); ++it) {
-        string id = it.key();
-        string test; // name of the field currently visited
-        string sequence;
-        string contacts;
-        bool is_change = false; // never read in this function
-
-        //cout << "id: " << id << endl;
-        for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {      
-            test = it2.key();
-
-            if (!test.compare("sequence")) {
-                //cout << "sequence: " << it2.value() << endl;
-                sequence = it2.value();
-                new_id[test] = it2.value();
-  
-            } else if (!test.compare("contacts") ) {
-                contacts = it2.value(); // kept aside: written after the loop, possibly repaired
-            } else {
-                new_id[test] = it2.value();
-            }  
-        }
-        string tmp = ""; // the 'contacts' value that will be written out
-        if (count_delimiter(contacts) != count_delimiter(sequence) && contacts.size() == sequence.size()) {
-            for (uint i = 0; i < sequence.size(); i++) {
-                if (sequence.at(i) == '&') {
-                    tmp += "&"; // mirror the delimiter of 'sequence' at the same index
-                } else {
-                    tmp += contacts.at(i);
-                }
-            }
-        } else {
-            tmp = contacts; // same delimiter count, or lengths differ: keep 'contacts' untouched
-        }
-        new_id["contacts"] = tmp;
-        new_motif[id] = new_id;
-        new_id.clear();
-    }
-    outfile << new_motif.dump(4) << endl;
-    outfile.close();
-    
-}
-
-int main() // Runs add_delimiter() on hard-coded, machine-specific input and output paths.
-{
-    string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json";
-    string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_tmp.json";
-    add_delimiter(jsonfile, out);
-    return 0;
-}
-    
--- a/scripts/benchmark.py deleted 100755 → 0
View file @62b200b
+++ b/scripts/benchmark.py deleted 100755 → 0
View file @62b200b
--- a/scripts/count_pattern.cpp deleted 100644 → 0
View file @62b200b
+++ b/scripts/count_pattern.cpp deleted 100644 → 0
View file @62b200b
--- a/scripts/create_files.cpp deleted 100644 → 0
View file @62b200b
+++ b/scripts/create_files.cpp deleted 100644 → 0
View file @62b200b
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
-#include <typeinfo>
-#include <set>
-#include <algorithm>
-#include <cstdio>
-#include <vector>
-
-using namespace std;
-using json = nlohmann::json;
-
-/*
-Creates one FASTA (.fa) file for each sequence found in the JSON benchmark file `jsonmotifs`.
-Also creates a .dbn and a .txt file that list the name, sequence, 2D structure and contacts of every
-benchmark sequence that contains no '&'. The number of sequences listed that way is printed at the end.
-Those files are useful for the Isaure_benchmark.py script.
-All output locations are hard-coded, machine-specific paths defined at the top of the function.
-*/
-void create_files(const string& jsonmotifs) {
-    std::ifstream lib(jsonmotifs);
-    string fasta = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/fasta/";
-    string list = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.txt";
-    string dbn = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn";
-    std::ofstream outlist (list);
-    std::ofstream outdbn (dbn);
-    json js = json::parse(lib);
-    uint count = 0; // number of records written to the .txt/.dbn files
-
-    for (auto it = js.begin(); it != js.end(); ++it) {    
-        string id = it.key();
-        string name, seq, contacts, structure; // declared per id: seq/contacts/structure keep their previous value if a later chain lacks the field
-        for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {      
-            string chain = it2.key();
-            if (chain.compare("pfams") != 0) { // every key other than "pfams" is handled as a chain
-                string name = id + "_" + chain; // shadows the outer `name`, which is never used
-                string filename = fasta + name + ".fa";
-                std::ofstream outfasta (filename);
-                outfasta << ">test_" << name << endl;
-                for (auto it3 = js[id][chain].begin(); it3 != js[id][chain].end(); ++it3) {     
-                    string field = it3.key();
-                    if (!field.compare("sequence")) {
-                        seq = it3.value();
-                        outfasta << seq.substr(0,seq.size()) << endl; // substr(0, size()) is the whole sequence
-                        outfasta.close();
-
-                    } else if (!field.compare("contacts")) {
-                        contacts = it3.value();
-
-                    } else if (!field.compare("struct2d")) {
-                        structure = it3.value();
-                    }
-                }
-                if(seq.find('&') == string::npos) { // only sequences without '&' are listed
-                    outlist << ">test_" << name << endl;
-                    outdbn << "test_" << name << "." << endl;
-                    outlist << contacts << endl; // .txt record order: header, contacts, sequence, structure
-                    outdbn << seq << endl;       // .dbn record order: header, sequence, structure, contacts
-                    outdbn << structure << endl;
-                    outdbn << contacts << endl;
-                    outlist << seq << endl;
-                    outlist << structure << endl;      
-                    count++;       
-                }
-            }
-        }
-    }
-    cout << count << " sequences en tout" << endl; // "en tout" is French for "in total"
-    lib.close();
-    outlist.close();
-    outdbn.close();
-}
-
-int main() // Runs create_files() on a hard-coded, machine-specific benchmark path.
-{
-    string path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/";
-    string jsonbm = path + "modules/ISAURE/benchmark_16-07-2021.json"; 
-    create_files(jsonbm);
-
-    return 0;
-}
-    
--- a/scripts/delete_same_pdb.cpp deleted 100644 → 0
View file @62b200b
+++ b/scripts/delete_same_pdb.cpp deleted 100644 → 0
View file @62b200b
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include "/local/local/BiorseoNath/cppsrc/json.hpp"
-#include <typeinfo>
-#include <set>
-#include <algorithm>
-#include <cstdio>
-#include <vector>
-#include <string>
-
-using namespace std;
-using json = nlohmann::json;
-
-/*
-Writes to `jsonoutfile` a copy of the motif library `jsonlibrary` without the motifs whose 'pdb' list
-contains the pdb of the sequence given as input to BiORSEO for prediction.
-
-The pdb that is searched for is `name` without its last two characters. Motifs that are kept are copied
-field by field; the output is written with a 4-space indentation.
-*/
-void delete_redundant_pdb(const string& jsonlibrary, const string& name, const string& jsonoutfile) {
-    std::ifstream lib(jsonlibrary);
-    
-    std::ofstream outfile (jsonoutfile);
-    json new_motif; // the filtered output library
-    json new_id;    // the entry currently being copied; cleared after each motif
-    json js = json::parse(lib);
-    
-    for (auto it = js.begin(); it != js.end(); ++it) {
-        string id = it.key();
-        vector<string> list_pdbs;
-        bool is_added = true;
-
-        for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {      
-            string field = it2.key();   
-
-            if (!field.compare("pdb")) {
-                vector<string> tab = it2.value();
-                list_pdbs = tab;
-            } else {
-                new_id[field] = it2.value();
-            }  
-        }
-
-        // name.size()-2 is unsigned: for a name shorter than 2 characters it wraps around and substr returns the whole name
-        if (count(list_pdbs.begin(), list_pdbs.end(), name.substr(0, name.size()-2))) {
-            is_added = false;
-        }
-        if (is_added) {      
-            new_id["pdb"] = list_pdbs;     
-            new_motif[id] = new_id;
-        }
-        new_id.clear();
-    }
-    outfile << new_motif.dump(4) << endl;
-    outfile.close(); 
-}
-
-int main(int argc, char** argv) // Filters the hard-coded library, using argv[1] as the name of the input sequence.
-{
-    string jsonlibrary = "/local/local/BiorseoNath/data/modules/ISAURE/motifs_final.json";
-    string out = "/local/local/BiorseoNath/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json";
-    string name = argv[1]; // NOTE(review): argc is never checked, so argv[1] is null when no argument is given
-    delete_redundant_pdb(jsonlibrary, name, out);
-    return 0;
-}
-    
--- a/scripts/pareto_visualizer_json.png deleted 100644 → 0
View file @62b200b
+++ b/scripts/pareto_visualizer_json.png deleted 100644 → 0
View file @62b200b
--- a/scripts/pareto_visualizer_json_1.png deleted 100644 → 0
View file @62b200b
+++ b/scripts/pareto_visualizer_json_1.png deleted 100644 → 0
View file @62b200b
--- a/scripts/pareto_visualizer_json_MEA_functionE.png deleted 100644 → 0
View file @62b200b
+++ b/scripts/pareto_visualizer_json_MEA_functionE.png deleted 100644 → 0
View file @62b200b
--- a/scripts/pareto_visualizer_json_MFE_MEA_functionE.png deleted 100644 → 0
View file @62b200b
+++ b/scripts/pareto_visualizer_json_MFE_MEA_functionE.png deleted 100644 → 0
View file @62b200b
--- a/scripts/pareto_visualizer_json_MFE_functionE.png deleted 100644 → 0
View file @62b200b
+++ b/scripts/pareto_visualizer_json_MFE_functionE.png deleted 100644 → 0
View file @62b200b
--- a/scripts/selecting_id.cpp deleted 100644 → 0
View file @62b200b
+++ b/scripts/selecting_id.cpp deleted 100644 → 0
View file @62b200b
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
-#include <typeinfo>
-#include <set>
-#include <algorithm>
-#include <cstdio>
-#include <vector>
-
-using namespace std;
-using json = nlohmann::json;
-
-/*
-That script will remove from the library all the pattern that match ONLY with the sequence from which it comes from (with the same pdb).
-*/
-
-//Stores the pdb code and the sequence of one benchmark entry, plus the id and the sequence (components) of the library motif associated with it.
-struct data { // NOTE(review): with `using namespace std;` this name may clash with std::data (C++17) — confirm
-    //the pdb code (taken from the name of the sequence)
-    string pdb;
-    //the complete sequence that has this pdb code
-    string seq_pdb;
-    //the id of the library motif corresponding to this pdb
-    string id;
-    //the sequence of the motif with the above id, with its components
-    string cmp;
-};
-typedef struct data data; // C-style alias; in C++ `data` already names the struct
-
-//Returns one entry (pdb name + sequence) per 4-line record (name, sequence, structure, contacts) of the benchmark file; returns an empty list if the file cannot be opened.
-vector<data> get_list_pdb_benchmark(const string& benchmark) {
-
-    fstream bm(benchmark);
-    vector<data> list_pdb_seq;
-    if (bm.is_open()) {
-        string name;
-        string sequence;
-        string structure;
-        string contacts;
-
-        while (getline(bm, name)) {
-            data d;
-            int size = name.size();
-            name = name.substr(5,size-6); // drops the first 5 characters and the last one; throws std::out_of_range if the line has fewer than 5 characters
-            getline(bm, sequence);
-            d.pdb = name;
-            d.seq_pdb = sequence;
-            list_pdb_seq.push_back(d);
-
-            getline(bm, structure); // read only to skip the line
-            getline(bm, contacts);  // read only to skip the line
-        }
-        bm.close();
-    }
-    return list_pdb_seq;
-}
-
-string trim(string str) { // Returns `str` without its first and last characters; substr(1, ...) throws std::out_of_range on an empty string.
-    int size = str.size();
-    str = str.substr(1, size-2);
-    return str;
-}
-
-//Returns the first benchmark entry whose pdb name, without its last two characters, equals `pdb_pattern`; returns a default (empty) data if none matches. The benchmark file is re-read on every call.
-data find_id_pattern(string& pdb_pattern, const string& benchmark) {
-    vector<data> l = get_list_pdb_benchmark(benchmark);
-    int size = l.size(); // never read in this function
-
-    for (data d : l) {
-        string cmp = d.pdb;
-        cmp = cmp.substr(0, d.pdb.size()-2);
-        if (!cmp.compare(pdb_pattern)) {
-            return d;
-        }
-    }
-    return data();
-}
-
-//Creates an array of data ('association'): for each motif of the library `bibli` whose pdb is found in the
-// benchmark file, the benchmark entry completed with the id and the sequence of that motif. Its size is printed.
-vector<data> find_id(const string& bibli, const string& benchmark) {
-    ifstream lib(bibli);
-    json js = json::parse(lib);
-
-    //pairs (name, sequence) from the benchmark with (id, sequence) from the library
-    vector<data> association;
-    
-    for (auto it = js.begin(); it != js.end(); ++it) {  
-        string id = it.key();
-        data d;
-
-        for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) { 
-            string field = it2.key();
-            string seq;
-            if (!field.compare("pdb")) {
-                int n = js[id][field].size();
-                for (int i = 0; i < n ; i++) {
-                    ostringstream stream;
-                    stream << js[id][field][i];
-                    string pdb = trim(stream.str()); // presumably strips the quotes around the serialized JSON string — TODO confirm
-                    
-                    d = find_id_pattern(pdb, benchmark); // overwritten at each iteration: only the result for the last pdb is kept
-                }
-            }
-
-            if (!field.compare("sequence")) { // NOTE(review): `d` is only set if the "pdb" field was visited before "sequence" — confirm iteration order
-                seq = it2.value();
-
-                if (!(d.pdb.empty())) {                    
-                    d.id = id;
-                    d.cmp = seq;
-                    association.push_back(d);
-                }
-            }
-        }
-    }
-    lib.close();
-    cout << association.size() << endl;
-    return association;
-}
-
-//Checks whether the motif `seq_motif` is found in the complete sequence `seq`. A motif without '&' must be a substring of `seq`; otherwise it is split on '&' and its components are searched one after the other in `seq`.
-bool does_it_match(const string& seq, const string& seq_motif) {
-    size_t found = seq_motif.find("&");
-    size_t size = seq_motif.size(); // never read in this function
-    vector<string> list_cmp; // the components of the motif, in order
-    if (found != std::string::npos) {
-        int count = 1; // number of components
-        
-        string cmp = seq_motif.substr(0, found);
-        list_cmp.push_back(cmp);
-        while(found != std::string::npos) {
-            size_t begin = found;
-            found = seq_motif.find("&", found + 1);
-            cmp = seq_motif.substr(begin+1, found-begin-1); // for the last component `found` is npos and substr runs to the end of the motif
-            list_cmp.push_back(cmp);
-            count++;
-        }
-
-        found = seq.find(list_cmp[0]);
-        int count2 = 1; // number of components searched so far
-        while((found != std::string::npos) && (count2 < count)) {
-            size_t begin = found; // never read
-            found = seq.find(list_cmp[count2], found + 1); // searched from one past the START of the previous match, so matches may overlap
-            count2++;
-        }
-
-        if(count == count2) { // NOTE(review): `found` is not checked here, so this is also true when only the last component was not found
-            return true;
-        }
-
-    } else {
-        found = seq.find(seq_motif);
-        if (found != std::string::npos) {
-            return true;
-        }
-    }
-    return false;
-}
-
-//Returns the sorted, duplicate-free list of motif ids left after the cross-matching below; intended to be the motifs that did not match any complete sequence other than the one they came from.
-vector<string> select_not_motif(const string& bibli, const string& benchmark) {
-    vector<string> selection;
-    vector<data> association = find_id(bibli, benchmark);
-
-    for (data d : association) { // start with every associated motif id selected
-        selection.push_back(d.id);
-    }
-
-    for (data d : association) {
-        for (data d2 : association) {
-            string seq = d.seq_pdb;
-            string seq2 = d2.cmp;
-            bool test = false;
-
-            if(d.pdb.substr(0, d.pdb.size()-2) != d2.pdb.substr(0, d2.pdb.size()-2)) { // only compare entries with different pdb names (last two characters ignored)
-                test = does_it_match(seq, seq2);
-                if (test) {
-                    cout << "pdb: " << d.pdb << " vs " << d2.pdb << " " << d2.cmp << " " << d2.id << endl;
-                    auto position = find(selection.begin(), selection.end(), d.id); // NOTE(review): erases d.id although the motif that matched is d2's — confirm this is intended
-                    if (position != selection.end()) {
-                        int index = position - selection.begin();
-                        selection.erase(selection.begin() + index);
-                    }
-                }
-            }
-        }
-    }
-    sort(selection.begin(), selection.end() );
-    selection.erase(unique(selection.begin(), selection.end() ), selection.end() );
-
-    cout << "size: " << selection.size() << endl;
-
-    return selection;
-}
-
-int main() // Prints the ids returned by select_not_motif() for hard-coded, machine-specific library and benchmark paths; the commented-out blocks are manual checks of the helper functions.
-{
-    string bibli = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json";
-    string benchmark = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/benchmark.dbn";
-
-    /*vector<data> v = get_list_pdb_benchmark(benchmark);
-    for (data d : v) {
-        cout << d.pdb << ", " << d.seq_pdb << endl;
-    }*/
-
-    /*string name = "1U6P_B";
-    data d = find_id_pattern(name, benchmark);
-    cout << "name: " << d.pdb << ", seq: " << d.seq_pdb << endl;*/
-
-    /*vector<data> association = find_id(bibli, benchmark);
-    for (data d : association) {
-        cout << "<" << d.pdb << ", " << d.seq_pdb << ">, " << "<" << d.id << ", " << d.cmp << ">" << endl;
-    }*/
-
-    /*string seq = "UGCGCUUGGCGUUUUAGAGCUAGAAAUAGCAAGUUAAAAUAAGGCUAGUCCGUUAUCAACUUGAAAAAGUGGCACCGAGUCGGUGCUU";
-    string seq_motif = "UGCGCUUGGCGUUUUAGAGC&GCAAGUUAAAAUAAGGCUAGUCCGUUAUCAA&UGGCACCGAGUCG&U";
-    bool test = does_it_match(seq, seq_motif);
-    cout << test << endl;*/
-
-    vector<string> selection = select_not_motif(bibli, benchmark);
-    for (string str : selection) {
-        cout << str << ", ";
-    }
-    cout << endl;
-
-    return 0;
-}
\ No newline at end of file
--- a/scripts/stats.py deleted 100644 → 0
View file @62b200b
+++ b/scripts/stats.py deleted 100644 → 0
View file @62b200b
--- a/scripts/temp/test.fa deleted 100644 → 0
View file @62b200b
+++ b/scripts/temp/test.fa deleted 100644 → 0
View file @62b200b
->test
-CCGGGACCUCUAACCGGGUUCCCGGGCAGUCACUG