Louis BECQUEY

Merge branch 'stage_NBernard' into 'master'

Stage n bernard results



See merge request !1
Showing 75 changed files with 805 additions and 241 deletions
.vscode/*
.vscode
# LaTeX temporary files
doc/*.toc
doc/*.bbl
doc/*.gz
doc/*.log
doc/*.aux
doc/*.blg
doc/*.fls
doc/*.fdb_latexmk
# Docker installation temporary files
eigen-eigen-323c052e1731
cplex_installer_12.8_Student.bin
......@@ -20,7 +9,6 @@ ViennaRNA-2.4.13
# Compiled Object files
obj/*
doc/*.pdf
data/modules/RIN/__pycache__
# Executables
......@@ -44,4 +32,4 @@ data/modules/RIN
data/modules/ISAURE
data/sec_structs/bpRNA-1m_90.dbn
data/sec_structs/pseudobase++.dbn
data/fasta/contacts
......
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
......@@ -9,7 +9,7 @@ CC = g++
CFLAGS = -Icppsrc/ -I/usr/local/include -I$(CPLEX)/concert/include -I$(CPLEX)/cplex/include -g -O3
CXXFLAGS = --std=c++17 -Wall -Wpedantic -Wextra -Wno-deprecated-copy -Wno-ignored-attributes
LINKER = g++
LDFLAGS = -L$(CPLEX)/concert/lib/x86-64_linux/static_pic/ -L$(CPLEX)/cplex/lib/x86-64_linux/static_pic/ -lboost_system -lboost_filesystem -lboost_program_options -lgomp -lconcert -lilocplex -lcplex -lpthread -ldl -lRNA -lm
LDFLAGS = -Wno-free-nonheap-object -L$(CPLEX)/concert/lib/x86-64_linux/static_pic/ -L$(CPLEX)/cplex/lib/x86-64_linux/static_pic/ -lboost_system -lboost_filesystem -lboost_program_options -lgomp -lconcert -lilocplex -lcplex -lpthread -ldl -lRNA -lm
# change these to proper directories where each file should be
SRCDIR = cppsrc
......@@ -31,20 +31,8 @@ $(OBJECTS): $(OBJDIR)/%.o : $(SRCDIR)/%.cpp $(INCLUDES)
$(CC) -c $(CFLAGS) $(CXXFLAGS) $< -o $@
@echo -e "\033[00;32mCompiled "$<".\033[00m"
doc: mainpdf supppdf
@echo -e "\033[00;32mLaTeX documentation rendered.\033[00m"
mainpdf: doc/main_bioinformatics.tex doc/references.bib doc/bioinfo.cls doc/natbib.bst
cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics
cd doc; bibtex main_bioinformatics
cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics
cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error main_bioinformatics
supppdf: doc/supplementary_material.tex
cd doc; pdflatex -synctex=1 -interaction=nonstopmode -file-line-error supplementary_material
.PHONY: all
all: $(BINDIR)/$(TARGET) doc
all: $(BINDIR)/$(TARGET)
.PHONY: re
re: remove clean all
......
......@@ -19,6 +19,7 @@ THEN
OUTPUT:
- A set of secondary structures from the Pareto front,
- The list of known modules inserted inplace in the corresponding structures
- A set of positions of the nucleotides in contact with the protein represented by asterisks (only if the motifs_28-05-2021.json library is used!)
2/ The different models
==================================
......@@ -28,7 +29,8 @@ Biorseo can be used with two modules datasets (yet):
* Rna3Dmotifs (from the work of *Djelloul & Denise, 2008*)
* The RNA 3D Motif Atlas of BGSU's RNA lab (*Petrov et al, 2013*, see http://rna.bgsu.edu/rna3dhub/motifs/)
* CaRNAval 1.0 (*Reinhartz et al, 2018*)
* RNA-Bricks 2, RNAMC, CaRNAval 2.0, and others could theoretically be used, but are not supported (yet). You might write your own API.
* /data/modules/ISAURE/motifs_28-05-2021.json a library of motifs from RNA linked to a protein from Isaure Chauvot de Beauchêne of LORIA laboratory
(contact:isaure.chauvot-de-beauchene@loria.fr)
PATTERN MATCHING STEP
- Use **simple pattern matching**. Rna3Dmotifs modules are available with sequence information. We use regular expressions to find those known loops in your query. This is the approach of RNA-MoIP (*Reinharz et al, 2012*), we deal the same way with short components and wildcards.
......@@ -43,6 +45,8 @@ OBJECTIVE FUNCTIONS FOR THE MODULE INSERTION CRITERIA
* **Function B** : weights a module by its number of components (strands) and penalizes it by the log^(_2) of its nucleotide size.
* **Function C** : weights a module by its insertion site score (JAR3D or BayesPairing score).
* **Function D** : weights a module by its number of components (strands) and insertion site score (JAR3D or BayesPairing score), and penalizes it by the log^(_2) of its nucleotide size.
* **Function E** : weights a module by its nucleotides in contact with a protein, number of occurences and number of nucleotides in the module.
* **Function F** : weights a module by its nucleotides in contact with a protein, number of occurences and number of nucleotides along the entire length of the RNA.
3/ Installation
==================================
......@@ -55,22 +59,22 @@ Check the file [INSTALL.md](INSTALL.md) for installation instructions.
- If you **might expect a pseudoknot, or don't know**:
* The most promising method is the use of direct pattern matching with Rna3Dmotifs and function A. But this method is sometimes subject to combinatorial explosion issues. If you have a long RNA or a large number of loops, don't use it. Example:
`./biorseo.py -i PDB_00304.fa -O resultsFolder/ --rna3dmotifs --patternmatch --func A`
`./biorseo.py -i PDB_00304.fa -O resultsFolder/ --rna3dmotifs --patternmatch --func A --MEA`
* The use of the RNA 3D Motif Atlas placed by JAR3D and scored with function A is not subject to combinatorial issues, but performs a bit worse. It also returns less solutions. Example:
`./biorseo.py -i PDB_00304.fa -O resultsFolder/ --3dmotifatlas --jar3d --func A
`./biorseo.py -i PDB_00304.fa -O resultsFolder/ --3dmotifatlas --jar3d --func A --MEA
5/ List of Options
==================================
```
Usage: You must provide:
1) a FASTA input file with -i,
2) a module type with --rna3dmotifs, --carnaval or --3dmotifatlas
2) a module type with --rna3dmotifs, --carnaval, --3dmotifatlas or --contacts
3) one module placement method in { --patternmatch, --jar3d, --bayespairing }
4) one scoring function with --func A, B, C or D
4) one scoring function with --func A, B, C, D, E ou F
5) one estimator betwenn --MEA or --MFE
If you are not using the Docker image:
5) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir)
6) --modules-path, --biorseo-dir and (--jar3d-exec or --bypdir)
Options:
-h [ --help ] Print this help message
......@@ -79,16 +83,21 @@ Options:
--rna3dmotifs Use DESC modules from Djelloul & Denise, 2008
--carnaval Use RIN modules from Reinharz & al, 2018
--3dmotifatlas Use the HL and IL loops from BGSU's 3D Motif Atlas (updated)
--contacts Use the library of motifs, created from RNA sequences linked to proteins provided by I. Chauvot de Beauchene of LORIA laboratory
-p [ --patternmatch ] Use regular expressions to place modules in the sequence (requires --rna3dmotifs or --carnaval)
-j [ --jar3d ] Use JAR3D to place modules in the sequence (requires --3dmotifatlas)
-b [ --bayespairing ] Use BayesPairing2 to place modules in the sequence (requires --rna3dmotifs or --3dmotifatlas)
-o [ --output=… ] File to summarize the results
-O [ --outputf=… ] Folder where to output result and temp files
-f [ --func=… ] (A, B, C or D, default is B) Objective function to score module insertions:
-f [ --func=… ] (A, B, C, D, E or F default is B) Objective function to score module insertions:
(A) insert big modules (B) insert light, high-order modules
(c) insert modules which score well with the sequence
(C) insert modules which score well with the sequence
(D) insert light, high-order modules which score well with the sequence.
C and D require cannot be used with --patternmatch.
C and D cannot be used with --patternmatch.
(E) and (F) insert modules with a lot of nucleotides and a lot of nucleotides in contact with a proteine, and a huge number of occurences.
(E) maximize the number of contact nucleotide inside the module, while (F) maximize the number of contact nucleotide along the entire length of the RNA.
--MEA Use Maximum Expected Accuracy for the second objective
--MFE Use Minimum Free Energy based on the formula of (*Legendre et al., 2018*) for the second objective
-c [ --first-objective=… ] (default 1) Objective to solve in the mono-objective portions of the algorithm.
(1) is the module objective given by --func, (2) is the expected accuracy of the structure.
-l [ --limit=… ] (default 500) Number of solutions in the Pareto set from which
......@@ -113,9 +122,9 @@ Options:
BiORSEO from outside the docker image. Use the FULL path.
Examples:
biorseo.py -i myRNA.fa -O myResultsFolder/ --rna3dmotifs --patternmatch --func B
biorseo.py -i myRNA.fa -O myResultsFolder/ --3dmotifatlas --jar3d --func B -l 800
biorseo.py -i myRNA.fa -v --3dmotifatlas --bayespairing --func D
biorseo.py -i myRNA.fa -O myResultsFolder/ --rna3dmotifs --patternmatch --func B --MEA
biorseo.py -i myRNA.fa -O myResultsFolder/ --3dmotifatlas --jar3d --func B -l 800 --MEA
biorseo.py -i myRNA.fa -v --3dmotifatlas --bayespairing --func D --MEA
The allowed module/placement-method/function combinations are:
......@@ -123,5 +132,6 @@ The allowed module/placement-method/function combinations are:
--rna3dmotifs A. B. A. B. C. D.
--3dmotifatlas A. B. C. D. A. B. C. D.
--carnaval A. B.
--contacts E. F.
```
......
......@@ -29,11 +29,11 @@ import pickle
# ================== DEFINITION OF THE PATHS ==============================
biorseoDir = path.realpath(".")
jar3dexec = "/home/persalteas/Software/jar3dbin/jar3d_2014-12-11.jar"
jar3dexec = "/local/local/localopt/jar3d_2014-12-11.jar"
bypdir = biorseoDir + "/BayesPairing/bayespairing/src"
byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src"
moipdir = "/home/persalteas/Software/RNAMoIP/Src/RNAMoIP.py"
biokopdir = "/home/persalteas/Software/biokop/biokop"
moipdir = "/local/local/localopt/RNAMoIP/Src/RNAMoIP.py"
biokopdir = "/local/local/localopt/biokop/biokop"
runDir = path.dirname(path.realpath(__file__))
bpRNAFile = argv[1]
PseudobaseFile = argv[2]
......@@ -1109,8 +1109,11 @@ def load_from_dbn(file, header_style=3):
if not '(' in struct:
continue # ignore linear structures
if is_canonical_nts(seq) and is_canonical_bps(struct):
# keeps what's inside brackets at the end as the filename
if header_style == 1: container.append(RNA(header.replace('/', '_').split('(')[-1][:-1], header, seq, struct))
# keeps what's inside square brackets at the end as the filename
if header_style == 2: container.append(RNA(header.replace('/', '_').split('[')[-1][:-41], header, seq, struct))
# keeps all the header as filename
if header_style == 3: container.append(RNA(header[1:], header, seq, struct))
if '[' in struct: counter += 1
db.close()
......@@ -1475,8 +1478,8 @@ def print_StudyCase_results():
if __name__ == '__main__':
print("> Loading files...", flush=True)
bpRNAContainer, bpRNA_pk_counter = load_from_dbn(bpRNAFile)
PseudobaseContainer, Pseudobase_pk_counter = load_from_dbn(PseudobaseFile)
bpRNAContainer, bpRNA_pk_counter = load_from_dbn(bpRNAFile, header_style=1)
PseudobaseContainer, Pseudobase_pk_counter = load_from_dbn(PseudobaseFile, header_style=3)
StudycaseContainer, StudyCase_pk_counter = load_from_dbn(StudyCaseFile, header_style=1)
for nt, number in ignored_nt_dict.items():
......
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
......@@ -37,6 +37,7 @@ class MOIP
void forbid_solutions_between(double min, double max);
IloEnv& get_env(void);
static char obj_function_nbr_; // On what criteria do you want to insert motifs ?
static char obj_function2_nbr_; // Do you want to use MEA or MFE to determine the best energy score ?
static uint obj_to_solve_; // What objective do you prefer to solve in mono-objective portions of the algorithm ?
static double precision_; // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =(
static bool allow_pk_; // Wether we forbid pseudoknots (false) or allow them (true)
......@@ -47,8 +48,12 @@ class MOIP
void define_problem_constraints(string& source);
size_t get_yuv_index(size_t u, size_t v) const;
size_t get_Cpxi_index(size_t x_i, size_t i_on_j) const;
size_t get_xij_index(size_t u, size_t v) const;
IloNumExprArg& y(size_t u, size_t v); // Direct reference to y^u_v in basepair_dv_
IloNumExprArg& C(size_t x, size_t i); // Direct reference to C_p^xi in insertion_dv_
IloNumExprArg& x(size_t u, size_t v); // Direct reference to x_i,j in stacks_dv_
bool exists_vertical_outdated_labels(const SecondaryStructure& s) const;
bool exists_horizontal_outdated_labels(const SecondaryStructure& s) const;
void allowed_motifs_from_desc(args_of_parallel_func arg_struct);
......@@ -66,12 +71,16 @@ class MOIP
IloEnv env_; // environment CPLEX object
IloNumVarArray basepair_dv_; // Decision variables
IloNumVarArray insertion_dv_; // Decision variables
IloNumVarArray stacks_dv_; // Decision variables
IloModel model_; // Solver for objective 1
IloExpr obj1; // Objective function that counts inserted motifs
IloExpr obj2; // Objective function of expected accuracy
vector<vector<size_t>> index_of_Cxip_; // Stores the indexes of the Cxip in insertion_dv_
vector<size_t> index_of_first_components; // Stores the indexes of Cx1p in insertion_dv_
vector<vector<size_t>> index_of_yuv_; // Stores the indexes of the y^u_v in basepair_dv_
vector<vector<size_t>> index_of_xij_; //Stores the indexes of the xij variables (BioKop) in stacks_dv_
};
inline uint MOIP::get_n_solutions(void) const { return pareto_.size(); }
......@@ -79,6 +88,8 @@ inline uint MOIP::get_n_candidates(void) const { return ins
inline const SecondaryStructure& MOIP::solution(uint i) const { return pareto_[i]; }
inline IloNumExprArg& MOIP::y(size_t u, size_t v) { return basepair_dv_[get_yuv_index(u, v)]; }
inline IloNumExprArg& MOIP::C(size_t x, size_t i) { return insertion_dv_[get_Cpxi_index(x, i)]; }
inline IloNumExprArg& MOIP::x(size_t u, size_t v) { return stacks_dv_[get_xij_index(u, v)]; }
inline SecondaryStructure MOIP::solve_objective(int o) { return solve_objective(o, 0, rna_.get_RNA_length()); }
inline IloEnv& MOIP::get_env(void) { return env_; }
......
This diff is collapsed. Click to expand it.
......@@ -20,13 +20,7 @@ typedef struct Comp_ {
pair<uint, uint> pos;
size_t k;
string seq_;
uint nb_pairing;
Comp_(pair<int, int> p) : pos(p) { k = 1 + pos.second - pos.first; }
Comp_(pair<int, int> p, uint nb_pair) : pos(p)
{
k = 1 + pos.second - pos.first;
nb_pairing = nb_pair;
}
Comp_(uint start, uint length) : k(length)
{
pos.first = start;
......@@ -64,6 +58,7 @@ class Motif
string get_identifier(void) const;
vector<Component> comp;
vector<Link> links_;
vector<uint> pos_contacts;
size_t contact_;
double tx_occurrences_;
......@@ -89,7 +84,19 @@ vector<Motif> load_csv(const string& path);
vector<Motif> load_json_folder(const string& path, const string& rna, bool verbose);
vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string>& vc);
vector<vector<Component>> json_find_next_ones_in(string rna, uint offset, vector<string>& vc, vector<string>& vs);
vector<vector<Component>> json_find_next_ones_in(string rna, uint offset, vector<string>& vc);
// utilities for Json motifs
size_t count_nucleotide(string&);
size_t count_delimiter(string&);
size_t count_contacts(string&);
string check_motif_sequence(string);
bool checkSecondaryStructure(string);
vector<Link> build_motif_pairs(string&, vector<Component>&);
uint find_max_occurrences(string&);
uint find_max_sequence(string&);
vector<string> find_components(string&, string);
vector<uint> find_contacts(vector<string>&, vector<Component>&);
// utilities to compare secondary structures:
bool operator==(const Motif& m1, const Motif& m2);
......
This diff is collapsed. Click to expand it.
#include <iostream>
#include <sstream>
#include <fstream>
#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
#include <typeinfo>
#include <set>
#include <algorithm>
#include <cstdio>
#include <vector>
using namespace std;
using json = nlohmann::json;
void delete_redundant_pdb(const string& jsonfile, const string& jsontest, const string& jsonoutfile) {
std::ifstream lib(jsonfile);
std::ifstream lib2(jsontest);
std::ofstream outfile (jsonoutfile);
json new_motif;
json new_id;
json js = json::parse(lib);
json js2 = json::parse(lib2);
//the list of pfam lists of the motif we want to count the inclusion in other motif
for (auto it = js.begin(); it != js.end(); ++it) {
string id = it.key();
vector<string> list_pdbs;
vector<string> list_pdbs2;
bool is_added = true;
//cout << "id: " << id << endl;
for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
string test = it2.key();
if (!test.compare("pdb")) {
vector<string> tab = it2.value();
list_pdbs = tab;
/*set<set<string>>::iterator iit;
set<string>::iterator iit2;
for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) {
for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) {
cout << *iit2 << endl;
}
cout << endl << endl;
}*/
} else {
new_id[test] = it2.value();
}
}
//cout << "-------begin---------" << endl;
for (auto it3 = js2.begin(); it3 != js2.end(); ++it3) {
string id2 = it3.key();
//cout << "id: " << id << " / id2: " << id2 << endl;
for (auto it4 = js[id2].begin(); it4 != js[id2].end(); ++it4) {
string test = it4.key();
if (!test.compare("pdb")) {
vector<string> tab = it4.value();
list_pdbs2 = tab;
//cout << id << " / " << id2 << endl;
for (uint k = 0; k < list_pdbs2.size(); k++) {
if (count(list_pdbs.begin(), list_pdbs.end(), list_pdbs2[k])) {
is_added = false;
}
//cout << list_pdbs2[k] << endl;
}
}
}
//cout << endl;*/
}
/*for(uint ii = 0; ii < list_pfams.size(); ii++) {
for (uint jj = 0; jj < list_pfams[ii].size(); jj++) {
cout << "[" << ii << "][" << jj << "]: " << list_pfams[ii][jj] << endl;
}
}*/
if (is_added) {
new_id["pdb"] = list_pdbs;
new_motif[id] = new_id;
}
new_id.clear();
//cout << "valeur: " << ite << endl;
/*for (uint i = 0; i < tab_struc.size() ; i++) {
cout << "tab_struc[" << i << "]: " << tab_struc[i] << endl << endl;
} */
}
outfile << new_motif.dump(4) << endl;
outfile.close();
}
int main()
{
string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/bibli_test2.json";
string jsontest = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark_test.json";
string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_derniere_version/motifs_final_test.json";
delete_redundant_pdb(jsonfile, jsontest, out);
return 0;
}
......@@ -3,11 +3,13 @@
#include <algorithm>
#include <boost/format.hpp>
#define RESET "\033[0m"
#define RED "\033[31m" /* Red */
using std::abs;
using std::cout;
using std::endl;
SecondaryStructure::SecondaryStructure() {}
......@@ -98,6 +100,26 @@ string SecondaryStructure::to_DBN(void) const
return res;
}
string structure_with_contacts(const SecondaryStructure& ss) {
string sequence = ss.rna_.get_seq();
string construct = "";
bool flag;
for (uint i = 0; i < sequence.size(); i++) {
flag = false;
for (const Motif& m : ss.motif_info_) {
for (uint j = 0; j < m.pos_contacts.size(); j++) {
if (m.pos_contacts[j] == i) flag = true;
}
}
if (flag) {
construct += "*";
} else {
construct += ".";
}
}
return construct;
}
string SecondaryStructure::to_string(void) const
{
string s;
......@@ -119,13 +141,35 @@ void SecondaryStructure::set_basepair(uint i, uint j)
void SecondaryStructure::insert_motif(const Motif& m) { motif_info_.push_back(m); }
void colored_contacts(string sequence, vector<Motif> motif_info_) {
bool flag;
for (uint i = 0; i < sequence.size(); i++) {
flag = false;
for (const Motif& m : motif_info_) {
for (uint j = 0; j < m.pos_contacts.size(); j++) {
if (m.pos_contacts[j] == i) flag = true;
}
}
if (flag) {
cout << RED << sequence[i] << RESET;
} else {
cout << sequence[i];
}
}
}
void SecondaryStructure::print(void) const
{
cout << endl;
cout << '\t' << rna_.get_seq() << endl;
cout << '\t' << to_string() << endl;
cout << '\t';
colored_contacts(rna_.get_seq(), motif_info_);
//rna_.get_seq()
cout << endl;
string ss = to_string();
cout << '\t';
colored_contacts(ss, motif_info_);
//cout << ss;
cout << endl;
for (const Motif& m : motif_info_) {
uint i = 0;
cout << '\t';
......
......@@ -30,7 +30,6 @@ class SecondaryStructure
string to_DBN() const;
string to_string() const;
vector<double> objective_scores_; // values of the different objective functions for that SecondaryStructure
vector<pair<uint, uint>> basepairs_; // values of the decision variable of the integer program
vector<Motif> motif_info_; // information about known motives in this secondary structure and their positions
......@@ -58,5 +57,7 @@ inline void SecondaryStructure::set_objective_score(int i, double s) { objecti
inline uint SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); }
inline uint SecondaryStructure::get_n_bp(void) const { return nBP_; }
string structure_with_contacts(const SecondaryStructure& ss);
#endif // SECONDARY_STRUCTURE_
\ No newline at end of file
......
This diff is collapsed. Click to expand it.
No preview for this file type
......@@ -58,12 +58,49 @@ RNA::RNA(string name, string seq, bool verbose)
pij_(results->i-1,results->j-1) = results->p;
results++;
}
/*define type_*/
type_ = vector<vector<int>>(n_, vector<int>(n_));
for(uint i = 0; i < n_; i++){
for(uint j = 0; j < n_; j++){
if (i < j){
std::stringstream ss;
ss << seq_[i] << seq_[j];
std::string str = ss.str();
if(str.compare("AU") == 0 ){
type_[i][j] = 1;
}
else if(str.compare("CG") == 0 ){
type_[i][j] = 2;
}
else if(str.compare("GC") == 0 ){
type_[i][j] = 3;
}
else if(str.compare("GU") == 0 ){
type_[i][j] = 4;
}
else if(str.compare("UG") == 0 ){
type_[i][j] = 5;
}
else if(str.compare("UA") == 0 ){
type_[i][j] = 6;
}
else{
type_[i][j] = 0;
}
}
else{
type_[i][j] = 0;
}
}
}
}
else cerr << "NULL result returned by vrna_pfl_fold" << endl;
}
void RNA::print_basepair_p_matrix(float theta) const
{
cout << endl;
......
......@@ -32,6 +32,8 @@ class RNA
uint get_RNA_length(void) const;
void print_basepair_p_matrix(float theta) const;
vector<vector<int>> get_type();
bool verbose_; // Should we print things ?
private:
......@@ -41,10 +43,15 @@ class RNA
string seq_; // sequence of the rna with chars
uint n_; // length of the rna
MatrixXf pij_; // matrix of basepair probabilities
vector<vector<int>> type_; //vector of base pair types
};
inline float RNA::get_pij(int i, int j) { return pij_(i, j); }
inline uint RNA::get_RNA_length() const { return n_; }
inline string RNA::get_seq(void) const { return seq_; }
inline vector<vector<int>> RNA::get_type() { return type_; }
#endif
......
>__'CRYSTAL_STRUCTURE_OF_A_TIGHT-BINDING_GLUTAMINE_TRNA_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE_'_(PDB_00376)
GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA
>__'GUANINE_RIBOSWITCH_U22C,_A52G_MUTANT_BOUND_TO_HYPOXANTHINE_'_(PDB_01023)
GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa
>__'SOLUTION_STRUCTURE_OF_THE_P2B-P3_PSEUDOKNOT_FROM_HUMAN_TELOMERASE_RNA_'_(PDB_00857)
GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA
>test_CRYSTAL_STRUCTURE_OF_A_TIGHT-BINDING_GLUTAMINE_TRNA_BOUND_TO_GLUTAMINE_AMINOACYL_TRNA_SYNTHETASE__PDB_00376
GGGGUAUCGCCAAGCGGUAAGGCACCGGAUUCUGAUUCCGGAGGUCGAGGUUCGAAUCCUCGUACCCCAGCCA
>test_GUANINE_RIBOSWITCH_U22C,_A52G_MUTANT_BOUND_TO_HYPOXANTHINE__PDB_01023
GGACAUACAAUCGCGUGGAUAUGGCACGCAAGUUUCUGCCGGGCACCGUAAAUGUCCGACUAUGUCCa
>test_SOLUTION_STRUCTURE_OF_THE_P2B-P3_PSEUDOKNOT_FROM_HUMAN_TELOMERASE_RNA__PDB_00857
GGGCUGUUUUUCUCGCUGACUUUCAGCCCCAAACAAAAAAGUCAGCA
\ No newline at end of file
......
File mode changed
> JSON1000_extended
AAUAUCCGGGCGUUUAAUCCCGGGAUAAA
\ No newline at end of file
The motif library used with --contacts is particular. It was provided by Isaure Chauvot de Beauchêne from the LORIA
laboratory. These motifs are made up of RNA fragments linked to proteins.
==================================================================================================================
Several versions of these designs have been provided, but the most complete is the latest:'motifs_06-06-2021.json'
The current scripts were created based on this file, and doesn't work with the other older libraries.
There is also 2 benchmarks files also in json format : 'benchmark_16-06-2021.json' and 'benchmark_16-07-2021.json'.
It contains complete RNA sequences that bind to a protein, the first one contains only 33 RNA, and the second one
contains 130 RNA.
The benchmark.dbn and benchmark.txt were created based on the 'benchmark_16-07-2021.json'.
They are mostly used for the Isaure_benchmark.py script and scripts from the 'scripts' directory.
The motifs_final.json it obtains after executing the count_pattern.cpp script in 'script' directory on
the 'motifs_06-06-2021.json' motifs file.
This script count the number of "occurrences" of the motif. So we consider that if the sequence of motif A
is included in motif B, then for each inclusion of B we also have an inclusion of A. And vice versa.
The motif library used by BiORSEO is the one in the 'bibliotheque_a_lire' directory. There should only be
the json file we wish to be used by BiORSEO for it's prediction. That's why you shouldn't put other type of file!
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
File mode changed
File mode changed
File mode changed
File mode changed
No preview for this file type
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
from math import sqrt, ceil
import numpy as np
import matplotlib.pyplot as plt
import re
import seaborn as sns
import pandas as pd
import matplotlib.pylab as plt
# Retrieve for each rna the best value for MEA and compare this energy value with the one obtains with
# RNAeval and RNAfold from the ViennaRNA Package 2.0 (Ronny Lorentz et al., 2011)
# After getting those values, it will creates a figure.
def get_result_MEA(filename):
ext = "json_pmE"
file2 = open( "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/results/" + filename + ext, "r")
name = file2.readline()
rna = file2.readline()
twod = file2.readline()
pred = re.findall(r'\S+', twod)
score = '-' + pred[len(pred)-1]
min = float(score)
contacts = file2.readline()
while twod:
twod = file2.readline()
pred = re.findall(r'\S+', twod)
if len(pred) > 0:
score = '-' + pred[len(pred) - 1]
if float(score) < min:
min = float(score)
contacts = file2.readline()
file2.close()
return min
fileMFE = open( "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/RNAfold_bm.log", "r")
lineRna = fileMFE.readline()
lineStruct = fileMFE.readline()
fileEval = open( "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/RNAeval_bm.log", "r")
lineRna2 = fileEval.readline()
lineStruct2 = fileEval.readline()
file = open("/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn", "r")
name = file.readline().strip()
rna = file.readline()
twod = file.readline()
contacts = file.readline()
list_name = []
list_score = []
list_type = []
print(np)
while name:
#print(name)
if lineRna != rna:
while lineRna != rna:
lineRna = fileMFE.readline()
lineStruct = fileMFE.readline()
MFE = float(lineStruct[len(lineStruct)-8:len(lineStruct)-2])
list_name.append(name[5:len(name)-1])
list_score.append(MFE)
list_type.append('MFE')
#print("MFE:" + str(MFE))
lineRna = fileMFE.readline()
lineStruct = fileMFE.readline()
if lineRna2 != rna:
while lineRna2 != rna:
lineRna2 = fileEval.readline()
lineStruct2 = fileEval.readline()
eval = float(lineStruct2[len(lineStruct2)-8:len(lineStruct2)-2])
list_name.append(name[5:len(name) - 1])
list_score.append(eval)
list_type.append('eval')
#print("Eval:" + str(eval))
lineRna2 = fileEval.readline()
lineStruct2 = fileEval.readline()
best_mea = get_result_MEA(name)
#print("MEA: " + str(best_mea) + "\n")
list_name.append(name[5:len(name) - 1])
list_score.append(best_mea)
list_type.append('MEA')
name = file.readline().strip()
rna = file.readline()
twod = file.readline()
contacts = file.readline()
file.close()
fileMFE.close()
fileEval.close()
'''print(list_MFE)
print(list_MEA)
print(list_eval)'''
#np = [["rna", "type_score", "score"]]
d = {'rna':list_name,'score':list_score, 'type_score':list_type}
df = pd.DataFrame(d, columns=['rna','type_score','score'])
sns.stripplot(x="rna",y="score",data=df,jitter=True,hue='type_score',palette='Set1')
plt.xticks(rotation=90)
plt.savefig("compare_BiORSEOMEA_RNAeval_RNAfold.png")
#include <iostream>
#include <sstream>
#include <fstream>
#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
#include <typeinfo>
#include <set>
#include <algorithm>
#include <cstdio>
#include <vector>
using namespace std;
using json = nlohmann::json;
//Count the number of '&' in the motif sequence
size_t count_delimiter(string& seq) {
size_t count = 0;
for(uint i = 0; i < seq.size(); i++) {
char c = seq.at(i);
if (c == '&') {
count++;
}
}
return count;
}
/*
If there is a '&' in the motif sequence in the field 'sequence' but not in the field 'contacts',
th script put a '&' in the same position in the field 'contacts' than in the field 'sequence'.
*/
void add_delimiter(const string& jsonfile, const string& jsonoutfile) {
std::ifstream lib(jsonfile);
std::ofstream outfile (jsonoutfile);
json new_motif;
json new_id;
json js = json::parse(lib);
//the list of pfam lists of the motif we want to count the inclusion in other motif
for (auto it = js.begin(); it != js.end(); ++it) {
string id = it.key();
string test;
string sequence;
string contacts;
bool is_change = false;
//cout << "id: " << id << endl;
for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
test = it2.key();
if (!test.compare("sequence")) {
//cout << "sequence: " << it2.value() << endl;
sequence = it2.value();
new_id[test] = it2.value();
} else if (!test.compare("contacts") ) {
contacts = it2.value();
} else {
new_id[test] = it2.value();
}
}
string tmp = "";
if (count_delimiter(contacts) != count_delimiter(sequence) && contacts.size() == sequence.size()) {
for (uint i = 0; i < sequence.size(); i++) {
if (sequence.at(i) == '&') {
tmp += "&";
} else {
tmp += contacts.at(i);
}
}
} else {
tmp = contacts;
}
new_id["contacts"] = tmp;
new_motif[id] = new_id;
new_id.clear();
}
outfile << new_motif.dump(4) << endl;
outfile.close();
}
int main()
{
string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json";
string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_tmp.json";
add_delimiter(jsonfile, out);
return 0;
}
......@@ -29,7 +29,7 @@ import pickle
# ================== DEFINITION OF THE PATHS ==============================
biorseoDir = path.realpath(".")
jar3dexec = "/home/persalteas/Software/jar3dbin/jar3d_2014-12-11.jar"
jar3dexec = "/local/local/localopt/jar3d_2014-12-11.jar"
bypdir = biorseoDir + "/BayesPairing/bayespairing/src"
byp2dir = biorseoDir + "/BayesPairing2/bayespairing/src"
moipdir = "/home/persalteas/Software/RNAMoIP/Src/RNAMoIP.py"
......@@ -803,7 +803,7 @@ class Method:
else:
results_file = outputDir+f"{'' if self.allow_pk else 'no'}PK/"+basename+f".biorseo_{self.data_source.lower()}_{self.placement_method.lower()}_{self.func}"
c += ["--bayespaircsv", outputDir+basename+f".{self.data_source.lower()}_{self.placement_method.lower()}.csv"]
c += ["-o", results_file, "--func", self.func]
c += ["-o", results_file, "--func", self.func, "--MFE"]
if not self.allow_pk:
c += ["-n"]
self.joblist.append(Job(command=c, priority=4, timeout=3600,
......
......@@ -11,6 +11,12 @@
using namespace std;
using json = nlohmann::json;
/*
This script count the number of "occurrences" of the motif.
So we consider that if the sequence of pattern A is included in pattern B,
then for each inclusion of B we also have an inclusion of A. And vice versa.
*/
//Return true if the first sequence seq1 is included in the second sequence seq2
//if not return false
int is_contains(string& seq1, string& seq2) {
......@@ -38,6 +44,8 @@ int is_contains(string& seq1, string& seq2) {
//If we find the sequence and structure of pattern A in pattern B, we have to concatenate the pfam lists of A and B,
//remove the duplicates, assign this new list of pfam lists to A, and assign as occurrence to A the size of this list.
//The pattern A is counted only once in every other pattern, i.e. even if the sequence of A is found several times in B,
// it will be added only once in the occurrences of A.
void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
std::ifstream lib(jsonfile);
std::ifstream lib2(jsonfile);
......@@ -73,14 +81,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
if (!test.compare("pfam")) {
vector<vector<string>> tab = it2.value();
list_pfams = tab;
/*set<set<string>>::iterator iit;
set<string>::iterator iit2;
for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) {
for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) {
cout << *iit2 << endl;
}
cout << endl << endl;
}*/
} else if (!test.compare("sequence")) {
//cout << "sequence: " << it2.value() << endl;
sequence = it2.value();
......@@ -124,7 +124,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
new_id[test] = it2.value();
}
}
//cout << "-------begin---------" << endl;
for (auto it3 = js2.begin(); it3 != js2.end(); ++it3) {
string id2 = it3.key();
......@@ -142,22 +141,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
if (!test.compare("pfam")) {
vector<vector<string>> tab = it4.value();
list_pfams2 = tab;
/*for (uint k = 0; k < tab2.size(); k++) {
for (uint l = 0; l < tab2[k].size(); l++) {
pfams2.insert(tab2[k][l]);
}
list_pfams2.insert(pfams);
pfams2.clear();
}*/
/*set<set<string>>::iterator iit;
set<string>::iterator iit2;
for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) {
for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) {
cout << *iit2 << endl;
}
cout << endl << endl;
}*/
} else if (!test.compare("occurences")) {
occurences2 = it4.value();
//cout << "occurences2: "<< occurences2 << endl;
......@@ -216,7 +199,6 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
}
}
//cout << "----end----" << endl;
//}
}
if(flag) {
......@@ -242,23 +224,12 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
//cout << endl;*/
}
/*for(uint ii = 0; ii < list_pfams.size(); ii++) {
for (uint jj = 0; jj < list_pfams[ii].size(); jj++) {
cout << "[" << ii << "][" << jj << "]: " << list_pfams[ii][jj] << endl;
}
}*/
new_id["occurences"] = list_pfams.size();
new_id["pfam"] = list_pfams;
//cout << "-------ending---------" << endl;
new_id["pfam"] = list_pfams;
new_motif[id] = new_id;
new_id.clear();
//cout << "valeur: " << ite << endl;
/*for (uint i = 0; i < tab_struc.size() ; i++) {
cout << "tab_struc[" << i << "]: " << tab_struc[i] << endl << endl;
} */
}
outfile << new_motif.dump(4) << endl;
outfile.close();
......@@ -267,13 +238,11 @@ void counting_occurences(const string& jsonfile, const string& jsonoutfile) {
int main()
{
//183
//cout << "------------------BEGIN-----------------" << endl;
string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/motifs_06-06-2021.json";
string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_derniere_version/motifs_final.json";
string jsonfile = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_06-06-2021.json";
string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json";
counting_occurences(jsonfile, out);
//cout << "------------------END-----------------" << endl;
return 0;
}
......
#include <iostream>
#include <sstream>
#include <fstream>
#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
#include <typeinfo>
#include <set>
#include <algorithm>
#include <cstdio>
#include <vector>
using namespace std;
using json = nlohmann::json;
/*
Create a .fasta file for each of the sequence inside the benchmark in json format.
Also create a .dbn and .txt file that list the name, sequence, 2d structure and contacts for all sequence in the benchmark file.
Those files are useful for the Isaure_benchmark.py script.
*/
void create_files(const string& jsonmotifs) {
std::ifstream lib(jsonmotifs);
string fasta = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/fasta/";
string list = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.txt";
string dbn = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn";
std::ofstream outlist (list);
std::ofstream outdbn (dbn);
json js = json::parse(lib);
uint count = 0;
for (auto it = js.begin(); it != js.end(); ++it) {
string id = it.key();
string name, seq, contacts, structure;
for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
string chain = it2.key();
if (chain.compare("pfams") != 0) {
string name = id + "_" + chain;
string filename = fasta + name + ".fa";
std::ofstream outfasta (filename);
outfasta << ">test_" << name << endl;
for (auto it3 = js[id][chain].begin(); it3 != js[id][chain].end(); ++it3) {
string field = it3.key();
if (!field.compare("sequence")) {
seq = it3.value();
outfasta << seq.substr(0,seq.size()) << endl;
outfasta.close();
} else if (!field.compare("contacts")) {
contacts = it3.value();
} else if (!field.compare("struct2d")) {
structure = it3.value();
}
}
if(seq.find('&') == string::npos) {
outlist << ">test_" << name << endl;
outdbn << "test_" << name << "." << endl;
outlist << contacts << endl;
outdbn << seq << endl;
outdbn << structure << endl;
outdbn << contacts << endl;
outlist << seq << endl;
outlist << structure << endl;
count++;
}
}
}
}
cout << count << " sequences en tout" << endl;
lib.close();
outlist.close();
outdbn.close();
}
int main()
{
string path = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/";
string jsonbm = path + "modules/ISAURE/benchmark_16-07-2021.json";
create_files(jsonbm);
return 0;
}
#include <iostream>
#include <sstream>
#include <fstream>
#include "/local/local/BiorseoNath/cppsrc/json.hpp"
#include <typeinfo>
#include <set>
#include <algorithm>
#include <cstdio>
#include <vector>
#include <string>
using namespace std;
using json = nlohmann::json;
/*
This script is use to create a new motif library without a motif that contains the same pdb as the sequence used in input for prediction
with BiORSEO.
*/
void delete_redundant_pdb(const string& jsonlibrary, const string& name, const string& jsonoutfile) {
std::ifstream lib(jsonlibrary);
std::ofstream outfile (jsonoutfile);
json new_motif;
json new_id;
json js = json::parse(lib);
for (auto it = js.begin(); it != js.end(); ++it) {
string id = it.key();
vector<string> list_pdbs;
bool is_added = true;
for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
string field = it2.key();
if (!field.compare("pdb")) {
vector<string> tab = it2.value();
list_pdbs = tab;
} else {
new_id[field] = it2.value();
}
}
if (count(list_pdbs.begin(), list_pdbs.end(), name.substr(0, name.size()-2))) {
is_added = false;
}
if (is_added) {
new_id["pdb"] = list_pdbs;
new_motif[id] = new_id;
}
new_id.clear();
}
outfile << new_motif.dump(4) << endl;
outfile.close();
}
int main(int argc, char** argv)
{
string jsonlibrary = "/local/local/BiorseoNath/data/modules/ISAURE/motifs_final.json";
string out = "/local/local/BiorseoNath/data/modules/ISAURE/bibliotheque_a_lire/motifs_final.json";
string name = argv[1];
delete_redundant_pdb(jsonlibrary, name, out);
return 0;
}
......@@ -28,17 +28,18 @@
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import cm
import scipy.stats as st
import sys
import os
import subprocess
import getopt
class SecStruct:
def __init__(self, dot_bracket, obj1_value, obj2_value):
self.dbn = dot_bracket
self.objectives = [ obj1_value, obj2_value ]
self.objectives = [obj1_value, obj2_value]
self.basepair_list = self.get_basepairs()
self.length = len(dot_bracket)
......@@ -96,9 +97,9 @@ class SecStruct:
tn = reference_structure.length * (reference_structure.length - 1) * 0.5 - fp - fn - tp
# Compute MCC
if (tp+fp == 0):
if (tp + fp == 0):
print("We have an issue : no positives detected ! (linear structure)")
return (tp*tn-fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
class Pareto:
......@@ -106,16 +107,16 @@ class Pareto:
self.predictions = list_of_structs
self.true_structure = reference
self.n_pred = len(list_of_structs)
self.max_obj1 = max([ s.objectives[0] for s in self.predictions ])
self.max_obj2 = max([ s.objectives[1] for s in self.predictions ])
self.max_obj1 = max([s.objectives[0] for s in self.predictions])
self.max_obj2 = max([s.objectives[1] for s in self.predictions])
self.index_of_best = self.find_best_solution()
def find_best_solution(self):
# returns the index of the solution of the Pareto set which is the closest
# to the real 2D structure (the one with the max MCC)
max_i = -1
max_mcc = -1
for i,s in enumerate(self.predictions):
for i, s in enumerate(self.predictions):
mcc = s.get_MCC_with(self.true_structure)
if mcc > max_mcc:
max_mcc = mcc
......@@ -125,15 +126,15 @@ class Pareto:
def get_normalized_coords(self):
# retrieves the objective values of the best solution and normlizes them
coords = self.predictions[self.index_of_best].objectives
if self.max_obj1: # avoid divide by zero if all solutions are 0
x = coords[0]/self.max_obj1
if self.max_obj1: # avoid divide by zero if all solutions are 0
x = coords[0] / self.max_obj1
else:
x = 0.5
if self.max_obj2: # avoid divide by zero if all solutions are 0
y = coords[1]/self.max_obj2
if self.max_obj2: # avoid divide by zero if all solutions are 0
y = coords[1] / self.max_obj2
else:
y = 0.5
return ( x, y )
return (x, y)
class RNA:
......@@ -145,6 +146,8 @@ class RNA:
ignored_nt_dict = {}
def is_canonical_nts(seq):
for c in seq[:-1]:
if c not in "ACGU":
......@@ -155,6 +158,7 @@ def is_canonical_nts(seq):
return False
return True
def is_canonical_bps(struct):
if "()" in struct:
return False
......@@ -203,6 +207,7 @@ def load_from_dbn(file, header_style=3):
db.close()
return container, pkcounter
def parse_biokop(folder, basename, ext=".biok"):
solutions = []
err = 0
......@@ -243,6 +248,7 @@ def parse_biokop(folder, basename, ext=".biok"):
err = 1
return None, err
def parse_biorseo(folder, basename, ext):
solutions = []
err = 0
......@@ -266,6 +272,7 @@ def parse_biorseo(folder, basename, ext):
err = 1
return None, err
def prettify_biorseo(code):
name = ""
if "bgsu" in code:
......@@ -301,8 +308,8 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf
print("[%s] Loaded %d solutions in a Pareto set, max(obj1)=%f, max(obj2)=%f" % (rna.basename_, pset.n_pred, pset.max_obj1, pset.max_obj2))
print("Loaded %d points on %d." % (len(points), len(RNAcontainer)-skipped))
x = np.array([ p[0] for p in points ])
y = np.array([ p[1] for p in points ])
x = np.array([p[0] for p in points])
y = np.array([p[1] for p in points])
xmin, xmax = 0, 1
ymin, ymax = 0, 1
xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
......@@ -316,19 +323,21 @@ def process_extension(ax, pos, ext, nsolutions=False, xlabel="Best solution perf
ax[pos].axvline(x=1, alpha=0.2, color='black')
ax[pos].contourf(xx, yy, f, cmap=cm.Blues, alpha=0.5)
ax[pos].scatter(x, y, s=25, alpha=0.1)
ax[pos].set_xlim((-0.1,1.1))
ax[pos].set_ylim((-0.1,1.1))
ax[pos].annotate("("+str(len(points))+'/'+str(len(RNAcontainer)-skipped)+" RNAs)", (0.08,0.15))
ax[pos].set_xlim((-0.1, 1.1))
ax[pos].set_ylim((-0.1, 1.1))
ax[pos].set_title(prettify_biorseo(ext[1:]), fontsize=10)
ax[pos].annotate("(" + str(len(points)) + '/' + str(len(RNAcontainer)-skipped) + " RNAs)", (0.08, 0.15))
ax[pos].set_xlabel(xlabel)
ax[pos].set_ylabel(ylabel)
if nsolutions:
ax[pos+1].hist(sizes, bins=range(0, max(sizes)+1, 2), histtype='bar')
ax[pos+1].set_xlim((0,max(sizes)+2))
ax[pos+1].set_xticks(range(0, max(sizes), 10))
ax[pos+1].set_xticklabels(range(0, max(sizes), 10), rotation=90)
ax[pos+1].set_xlabel("# solutions")
ax[pos+1].set_ylabel("# RNAs")
ax[pos + 1].hist(sizes, bins=range(0, max(sizes) + 1, 2), histtype='bar')
ax[pos + 1].set_xlim((0, max(sizes) + 2))
ax[pos + 1].set_xticks(range(0, max(sizes), 10))
ax[pos + 1].set_xticklabels(range(0, max(sizes), 10), rotation=90)
ax[pos + 1].set_xlabel("# solutions")
ax[pos + 1].set_ylabel("# RNAs")
if __name__ == "__main__":
try:
......
This diff is collapsed. Click to expand it.
#include <iostream>
#include <sstream>
#include <fstream>
#include "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/cppsrc/json.hpp"
#include <typeinfo>
#include <set>
#include <algorithm>
#include <cstdio>
#include <vector>
using namespace std;
using json = nlohmann::json;
/*
That script will remove from the library all the pattern that match ONLY with the sequence from which it comes from (with the same pdb).
*/
//To store the pdb and the sequence in the benchmark file. Also stor the corresponding motif id and components based on this sequence.
struct data {
//the pdb code (in the name of the sequence)
string pdb;
//the complete sequence with this pdb code
string seq_pdb;
//the id of the motif corresponding to this pdb in the library
string id;
//the module sequence with the components of this motif with the above id
string cmp;
};
typedef struct data data;
//returns the list of pdb codes and the corresponding information from the benchmark file.
vector<data> get_list_pdb_benchmark(const string& benchmark) {
fstream bm(benchmark);
vector<data> list_pdb_seq;
if (bm.is_open()) {
string name;
string sequence;
string structure;
string contacts;
while (getline(bm, name)) {
data d;
int size = name.size();
name = name.substr(5,size-6);
getline(bm, sequence);
d.pdb = name;
d.seq_pdb = sequence;
list_pdb_seq.push_back(d);
getline(bm, structure);
getline(bm, contacts);
}
bm.close();
}
return list_pdb_seq;
}
string trim(string str) {
int size = str.size();
str = str.substr(1, size-2);
return str;
}
//store the corresponding id and motif to the sequence from the benchmark file
data find_id_pattern(string& pdb_pattern, const string& benchmark) {
vector<data> l = get_list_pdb_benchmark(benchmark);
int size = l.size();
for (data d : l) {
string cmp = d.pdb;
cmp = cmp.substr(0, d.pdb.size()-2);
if (!cmp.compare(pdb_pattern)) {
return d;
}
}
return data();
}
//Create an array of data ('association'), which consists of each pdb of the benchmark file
// with the associated pattern from this sequence.
vector<data> find_id(const string& bibli, const string& benchmark) {
ifstream lib(bibli);
json js = json::parse(lib);
//nam seq_bm et id seq_id
vector<data> association;
for (auto it = js.begin(); it != js.end(); ++it) {
string id = it.key();
data d;
for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
string field = it2.key();
string seq;
if (!field.compare("pdb")) {
int n = js[id][field].size();
for (int i = 0; i < n ; i++) {
ostringstream stream;
stream << js[id][field][i];
string pdb = trim(stream.str());
d = find_id_pattern(pdb, benchmark);
}
}
if (!field.compare("sequence")) {
seq = it2.value();
if (!(d.pdb.empty())) {
d.id = id;
d.cmp = seq;
association.push_back(d);
}
}
}
}
lib.close();
cout << association.size() << endl;
return association;
}
//check if the motif is found matching with a complete sequence from a benchmark file.
bool does_it_match(const string& seq, const string& seq_motif) {
size_t found = seq_motif.find("&");
size_t size = seq_motif.size();
vector<string> list_cmp;
if (found != std::string::npos) {
int count = 1;
string cmp = seq_motif.substr(0, found);
list_cmp.push_back(cmp);
while(found != std::string::npos) {
size_t begin = found;
found = seq_motif.find("&", found + 1);
cmp = seq_motif.substr(begin+1, found-begin-1);
list_cmp.push_back(cmp);
count++;
}
found = seq.find(list_cmp[0]);
int count2 = 1;
while((found != std::string::npos) && (count2 < count)) {
size_t begin = found;
found = seq.find(list_cmp[count2], found + 1);
count2++;
}
if(count == count2) {
return true;
}
} else {
found = seq.find(seq_motif);
if (found != std::string::npos) {
return true;
}
}
return false;
}
//return the list of motif id that didn't match with any other complete sequence than the one which it came from.
vector<string> select_not_motif(const string& bibli, const string& benchmark) {
vector<string> selection;
vector<data> association = find_id(bibli, benchmark);
for (data d : association) {
selection.push_back(d.id);
}
for (data d : association) {
for (data d2 : association) {
string seq = d.seq_pdb;
string seq2 = d2.cmp;
bool test = false;
if(d.pdb.substr(0, d.pdb.size()-2) != d2.pdb.substr(0, d2.pdb.size()-2)) {
test = does_it_match(seq, seq2);
if (test) {
cout << "pdb: " << d.pdb << " vs " << d2.pdb << " " << d2.cmp << " " << d2.id << endl;
auto position = find(selection.begin(), selection.end(), d.id);
if (position != selection.end()) {
int index = position - selection.begin();
selection.erase(selection.begin() + index);
}
}
}
}
}
sort(selection.begin(), selection.end() );
selection.erase(unique(selection.begin(), selection.end() ), selection.end() );
cout << "size: " << selection.size() << endl;
return selection;
}
int main()
{
string bibli = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_final.json";
string benchmark = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/benchmark.dbn";
/*vector<data> v = get_list_pdb_benchmark(benchmark);
for (data d : v) {
cout << d.pdb << ", " << d.seq_pdb << endl;
}*/
/*string name = "1U6P_B";
data d = find_id_pattern(name, benchmark);
cout << "name: " << d.pdb << ", seq: " << d.seq_pdb << endl;*/
/*vector<data> association = find_id(bibli, benchmark);
for (data d : association) {
cout << "<" << d.pdb << ", " << d.seq_pdb << ">, " << "<" << d.id << ", " << d.cmp << ">" << endl;
}*/
/*string seq = "UGCGCUUGGCGUUUUAGAGCUAGAAAUAGCAAGUUAAAAUAAGGCUAGUCCGUUAUCAACUUGAAAAAGUGGCACCGAGUCGGUGCUU";
string seq_motif = "UGCGCUUGGCGUUUUAGAGC&GCAAGUUAAAAUAAGGCUAGUCCGUUAUCAA&UGGCACCGAGUCG&U";
bool test = does_it_match(seq, seq_motif);
cout << test << endl;*/
vector<string> selection = select_not_motif(bibli, benchmark);
for (string str : selection) {
cout << str << ", ";
}
cout << endl;
return 0;
}
\ No newline at end of file
This diff is collapsed. Click to expand it.
>test
CCGGGACCUCUAACCGGGUUCCCGGGCAGUCACUG