Louis BECQUEY

Repository cleaning

results_*
results/
build_BiORSEO_docker_image_ubuntu18.sh
deploy_BiORSEO_docker_image_linux.sh
INSTALL.md
Readme.md
benchmark_results/
doc/
*.gz
*.pickle
log_of_the_run.sh
\ No newline at end of file
......
......@@ -3,9 +3,6 @@
# Docker installation temporary files
eigen-eigen-323c052e1731
cplex_installer_12.8_Student.bin
BayesPairing/
BayesPairing2/
ViennaRNA-2.4.13
# Compiled Object files
obj/*
......@@ -22,8 +19,9 @@ log_of_the_run.sh
logBadDesc.txt
gurobi.log
temp/*
biorseo_results/*
nohup.out
*.gz
*.pickle
# data
data/modules/BGSU
......@@ -32,4 +30,4 @@ data/modules/RIN
data/modules/ISAURE
data/sec_structs/bpRNA-1m_90.dbn
data/sec_structs/pseudobase++.dbn
data/fasta/contacts
data/fasta/contacts/
\ No newline at end of file
......
This diff is collapsed. Click to expand it.
The motif library used with --contacts is particular. It was provided by Isaure Chauvot de Beauchêne from the LORIA
laboratory. These motifs are made up of RNA fragments linked to proteins.
==================================================================================================================
Several versions of this motif library have been provided, but the most complete is the latest: 'motifs_06-06-2021.json'.
The current scripts were written for this file and do not work with the older libraries.
There are also 2 benchmark files, also in JSON format: 'benchmark_16-06-2021.json' and 'benchmark_16-07-2021.json'.
They contain complete RNA sequences that bind to a protein: the first one contains only 33 RNAs, and the second one
contains 130 RNAs.
The benchmark.dbn and benchmark.txt were created based on the 'benchmark_16-07-2021.json'.
They are mostly used for the Isaure_benchmark.py script and scripts from the 'scripts' directory.
The motifs_final.json file is obtained by running the count_pattern.cpp script (in the 'scripts' directory) on
the 'motifs_06-06-2021.json' motifs file.
This script counts the number of "occurrences" of each motif. We consider that if the sequence of motif A
is included in motif B, then each inclusion of B also counts as an inclusion of A. And vice versa.
The motif library used by BiORSEO is the one in the 'bibliotheque_a_lire' directory. It should contain only
the json file we want BiORSEO to use for its prediction. Do not put any other kind of file there!
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff is collapsed. Click to expand it.
#include <iostream>
#include <sstream>
#include <fstream>
#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
#include <typeinfo>
#include <set>
#include <algorithm>
#include <cstdio>
#include <vector>
using namespace std;
using json = nlohmann::json;
/// Counts the '&' delimiters in a motif sequence.
///
/// @param seq  Motif string in which '&' separates the components.
/// @return     Number of '&' characters in `seq` (0 for an empty string).
///
/// The parameter is a const reference so that const strings and temporaries
/// are accepted; existing callers passing a non-const lvalue are unaffected.
std::size_t count_delimiter(const std::string& seq) {
    // std::count replaces the hand-written index loop (which relied on the
    // non-standard `uint` typedef).
    return static_cast<std::size_t>(std::count(seq.begin(), seq.end(), '&'));
}
/*
If there is a '&' in the motif sequence in the field 'sequence' but not in the field 'contacts',
th script put a '&' in the same position in the field 'contacts' than in the field 'sequence'.
*/
void add_delimiter(const string& jsonfile, const string& jsonoutfile) {
std::ifstream lib(jsonfile);
std::ofstream outfile (jsonoutfile);
json new_motif;
json new_id;
json js = json::parse(lib);
//the list of pfam lists of the motif we want to count the inclusion in other motif
for (auto it = js.begin(); it != js.end(); ++it) {
string id = it.key();
string test;
string sequence;
string contacts;
bool is_change = false;
//cout << "id: " << id << endl;
for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
test = it2.key();
if (!test.compare("sequence")) {
//cout << "sequence: " << it2.value() << endl;
sequence = it2.value();
new_id[test] = it2.value();
} else if (!test.compare("contacts") ) {
contacts = it2.value();
} else {
new_id[test] = it2.value();
}
}
string tmp = "";
if (count_delimiter(contacts) != count_delimiter(sequence) && contacts.size() == sequence.size()) {
for (uint i = 0; i < sequence.size(); i++) {
if (sequence.at(i) == '&') {
tmp += "&";
} else {
tmp += contacts.at(i);
}
}
} else {
tmp = contacts;
}
new_id["contacts"] = tmp;
new_motif[id] = new_id;
new_id.clear();
}
outfile << new_motif.dump(4) << endl;
outfile.close();
}
/// Entry point: rewrites the motif library with aligned 'contacts' delimiters.
///
/// Usage: <program> [input.json [output.json]]
/// Without arguments the historical hard-coded paths are used, so existing
/// invocations keep working.
int main(int argc, char** argv)
{
    // Defaults kept from the original script (paths of the author's checkout).
    std::string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json";
    std::string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_tmp.json";
    if (argc > 1) {
        jsonfile = argv[1];
    }
    if (argc > 2) {
        out = argv[2];
    }
    add_delimiter(jsonfile, out);
    return 0;
}
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
#include <iostream>
#include <sstream>
#include <fstream>
#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
#include <typeinfo>
#include <set>
#include <algorithm>
#include <cstdio>
#include <vector>
using namespace std;
using json = nlohmann::json;
/*
Create a .fasta file for each of the sequence inside the benchmark in json format.
Also create a .dbn and .txt file that list the name, sequence, 2d structure and contacts for all sequence in the benchmark file.
Those files are useful for the Isaure_benchmark.py script.
*/
void create_files(const string& jsonmotifs) {
std::ifstream lib(jsonmotifs);
string fasta = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/fasta/";
string list = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.txt";
string dbn = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn";
std::ofstream outlist (list);
std::ofstream outdbn (dbn);
json js = json::parse(lib);
uint count = 0;
for (auto it = js.begin(); it != js.end(); ++it) {
string id = it.key();
string name, seq, contacts, structure;
for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
string chain = it2.key();
if (chain.compare("pfams") != 0) {
string name = id + "_" + chain;
string filename = fasta + name + ".fa";
std::ofstream outfasta (filename);
outfasta << ">test_" << name << endl;
for (auto it3 = js[id][chain].begin(); it3 != js[id][chain].end(); ++it3) {
string field = it3.key();
if (!field.compare("sequence")) {
seq = it3.value();
outfasta << seq.substr(0,seq.size()) << endl;
outfasta.close();
} else if (!field.compare("contacts")) {
contacts = it3.value();
} else if (!field.compare("struct2d")) {
structure = it3.value();
}
}
if(seq.find('&') == string::npos) {
outlist << ">test_" << name << endl;
outdbn << "test_" << name << "." << endl;
outlist << contacts << endl;
outdbn << seq << endl;
outdbn << structure << endl;
outdbn << contacts << endl;
outlist << seq << endl;
outlist << structure << endl;
count++;
}
}
}
}
cout << count << " sequences en tout" << endl;
lib.close();
outlist.close();
outdbn.close();
}
/// Entry point: generates the fasta/.txt/.dbn files from a benchmark json file.
///
/// Usage: <program> [benchmark.json]
/// Without argument the historical hard-coded benchmark path is used, so
/// existing invocations keep working.
int main(int argc, char** argv)
{
    // Default kept from the original script (path of the author's checkout).
    const std::string path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/";
    std::string jsonbm = path + "modules/ISAURE/benchmark_16-07-2021.json";
    if (argc > 1) {
        jsonbm = argv[1];
    }
    create_files(jsonbm);
    return 0;
}
#include <iostream>
#include <sstream>
#include <fstream>
#include "/local/local/BiorseoNath/cppsrc/json.hpp"
#include <typeinfo>
#include <set>
#include <algorithm>
#include <cstdio>
#include <vector>
#include <string>
using namespace std;
using json = nlohmann::json;
/*
This script is use to create a new motif library without a motif that contains the same pdb as the sequence used in input for prediction
with BiORSEO.
*/
void delete_redundant_pdb(const string& jsonlibrary, const string& name, const string& jsonoutfile) {
std::ifstream lib(jsonlibrary);
std::ofstream outfile (jsonoutfile);
json new_motif;
json new_id;
json js = json::parse(lib);
for (auto it = js.begin(); it != js.end(); ++it) {
string id = it.key();
vector<string> list_pdbs;
bool is_added = true;
for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
string field = it2.key();
if (!field.compare("pdb")) {
vector<string> tab = it2.value();
list_pdbs = tab;
} else {
new_id[field] = it2.value();
}
}
if (count(list_pdbs.begin(), list_pdbs.end(), name.substr(0, name.size()-2))) {
is_added = false;
}
if (is_added) {
new_id["pdb"] = list_pdbs;
new_motif[id] = new_id;
}
new_id.clear();
}
outfile << new_motif.dump(4) << endl;
outfile.close();
}
/// Entry point: filters the motif library for one input sequence.
///
/// Usage: <program> <sequence_name>
/// `sequence_name` is forwarded to delete_redundant_pdb(), which strips its
/// last two characters to obtain the pdb code to exclude.
/// Returns 1 (after printing the usage on stderr) when the name is missing.
int main(int argc, char** argv)
{
    // argv[1] used to be read unconditionally, which is undefined behaviour
    // when the program is started without argument.
    if (argc < 2) {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "program") << " <sequence_name>" << std::endl;
        return 1;
    }
    const std::string jsonlibrary = "/local/local/BiorseoNath/data/modules/ISAURE/motifs_final.json";
    const std::string out = "/local/local/BiorseoNath/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json";
    const std::string name = argv[1];
    delete_redundant_pdb(jsonlibrary, name, out);
    return 0;
}
#include <iostream>
#include <sstream>
#include <fstream>
#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
#include <typeinfo>
#include <set>
#include <algorithm>
#include <cstdio>
#include <vector>
using namespace std;
using json = nlohmann::json;
/*
This script lists the patterns of the library that match ONLY the sequence they come from (the one with the same pdb),
i.e. the candidates for removal from the library.
*/
// Associates a sequence of the benchmark file with the motif of the library
// that was extracted from it.
//
// NOTE(review): this file has `using namespace std;` in effect, and C++17
// declares std::data. The unqualified name `data` can therefore become
// ambiguous when compiling with -std=c++17 or later; spelling the type
// `::data` is always unambiguous. Renaming the struct would be cleaner but
// changes every user of the type.
struct data {
    // pdb code, taken from the name of the sequence in the benchmark file
    std::string pdb;
    // complete sequence carrying this pdb code
    std::string seq_pdb;
    // id, in the library, of the motif corresponding to this pdb
    std::string id;
    // sequence of that motif, components included ('&'-separated)
    std::string cmp;
};
// The former `typedef struct data data;` was removed: in C++ the struct name
// is already a type name, so the typedef was redundant.
//returns the list of pdb codes and the corresponding information from the benchmark file.
vector<data> get_list_pdb_benchmark(const string& benchmark) {
fstream bm(benchmark);
vector<data> list_pdb_seq;
if (bm.is_open()) {
string name;
string sequence;
string structure;
string contacts;
while (getline(bm, name)) {
data d;
int size = name.size();
name = name.substr(5,size-6);
getline(bm, sequence);
d.pdb = name;
d.seq_pdb = sequence;
list_pdb_seq.push_back(d);
getline(bm, structure);
getline(bm, contacts);
}
bm.close();
}
return list_pdb_seq;
}
// Removes the first and the last character of `str` (used to strip the quotes
// surrounding a json string once it has been streamed).
// Returns an empty string when `str` has fewer than 2 characters; the previous
// version threw std::out_of_range on an empty string.
std::string trim(std::string str) {
    if (str.size() < 2) {
        return std::string();
    }
    return str.substr(1, str.size() - 2);
}
//store the corresponding id and motif to the sequence from the benchmark file
data find_id_pattern(string& pdb_pattern, const string& benchmark) {
vector<data> l = get_list_pdb_benchmark(benchmark);
int size = l.size();
for (data d : l) {
string cmp = d.pdb;
cmp = cmp.substr(0, d.pdb.size()-2);
if (!cmp.compare(pdb_pattern)) {
return d;
}
}
return data();
}
//Create an array of data ('association'), which consists of each pdb of the benchmark file
// with the associated pattern from this sequence.
vector<data> find_id(const string& bibli, const string& benchmark) {
ifstream lib(bibli);
json js = json::parse(lib);
//nam seq_bm et id seq_id
vector<data> association;
for (auto it = js.begin(); it != js.end(); ++it) {
string id = it.key();
data d;
for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
string field = it2.key();
string seq;
if (!field.compare("pdb")) {
int n = js[id][field].size();
for (int i = 0; i < n ; i++) {
ostringstream stream;
stream << js[id][field][i];
string pdb = trim(stream.str());
d = find_id_pattern(pdb, benchmark);
}
}
if (!field.compare("sequence")) {
seq = it2.value();
if (!(d.pdb.empty())) {
d.id = id;
d.cmp = seq;
association.push_back(d);
}
}
}
}
lib.close();
cout << association.size() << endl;
return association;
}
// Checks whether the motif `seq_motif` is found in the complete sequence `seq`.
//
// `seq_motif` is split on '&' into components. The first component is searched
// from the beginning of `seq`; every following component is searched from the
// position right after the START of the previous match. The motif matches when
// every component is found. A motif without '&' is a plain substring search.
//
// Fix: the previous version never looked at the result of the search for the
// LAST component, so a motif whose final component was absent still matched.
//
// NOTE(review): since the search resumes at `previous start + 1`, a component
// may overlap the previous one (e.g. "AB&BC" matches "ABC"). Kept as is -
// confirm whether components should be required not to overlap.
bool does_it_match(const std::string& seq, const std::string& seq_motif) {
    std::size_t component_begin = 0;  // start of the current component in seq_motif
    std::size_t search_from = 0;      // where the current component may start in seq

    while (true) {
        const std::size_t delimiter = seq_motif.find('&', component_begin);
        const bool is_last = (delimiter == std::string::npos);
        const std::string component = is_last
            ? seq_motif.substr(component_begin)
            : seq_motif.substr(component_begin, delimiter - component_begin);

        const std::size_t hit = seq.find(component, search_from);
        if (hit == std::string::npos) {
            return false;  // applies to every component, the last one included
        }
        if (is_last) {
            return true;
        }

        search_from = hit + 1;
        component_begin = delimiter + 1;
    }
}
//return the list of motif id that didn't match with any other complete sequence than the one which it came from.
vector<string> select_not_motif(const string& bibli, const string& benchmark) {
vector<string> selection;
vector<data> association = find_id(bibli, benchmark);
for (data d : association) {
selection.push_back(d.id);
}
for (data d : association) {
for (data d2 : association) {
string seq = d.seq_pdb;
string seq2 = d2.cmp;
bool test = false;
if(d.pdb.substr(0, d.pdb.size()-2) != d2.pdb.substr(0, d2.pdb.size()-2)) {
test = does_it_match(seq, seq2);
if (test) {
cout << "pdb: " << d.pdb << " vs " << d2.pdb << " " << d2.cmp << " " << d2.id << endl;
auto position = find(selection.begin(), selection.end(), d.id);
if (position != selection.end()) {
int index = position - selection.begin();
selection.erase(selection.begin() + index);
}
}
}
}
}
sort(selection.begin(), selection.end() );
selection.erase(unique(selection.begin(), selection.end() ), selection.end() );
cout << "size: " << selection.size() << endl;
return selection;
}
/// Entry point: prints the ids of the motifs that match only the sequence
/// they come from, separated by ", ".
///
/// Usage: <program> [library.json [benchmark.dbn]]
/// Without arguments the historical hard-coded paths are used, so existing
/// invocations keep working.
///
/// NOTE(review): the default benchmark path points to data/modules/, whereas
/// the benchmark generation script writes benchmark.dbn under
/// data/modules/ISAURE/Motifs_version_initiale/ - confirm which one is meant.
int main(int argc, char** argv)
{
    // Defaults kept from the original script (paths of the author's checkout).
    std::string bibli = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json";
    std::string benchmark = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/benchmark.dbn";
    if (argc > 1) {
        bibli = argv[1];
    }
    if (argc > 2) {
        benchmark = argv[2];
    }

    const std::vector<std::string> selection = select_not_motif(bibli, benchmark);
    for (const std::string& str : selection) {
        std::cout << str << ", ";
    }
    std::cout << std::endl;
    return 0;
}
\ No newline at end of file
This diff is collapsed. Click to expand it.
>test
CCGGGACCUCUAACCGGGUUCCCGGGCAGUCACUG