Nettoyage + Update Doc

Louis BECQUEY
Commit 5f6cb56ffdb75fa503ecde0cc5704b1ababb3c39 5f6cb56f 1 parent a5f7e06c
Showing 12 changed files with 76 additions and 78 deletions
Makefile
README.md
cppsrc/MOIP.cpp
cppsrc/MOIP.h
cppsrc/biominserter.cpp
cppsrc/rna.cpp
cppsrc/rna.h
doc/RNA.bib
doc/graph_abstract.png
doc/motif_forcing_constraints.pdf
doc/motif_forcing_constraints.tex
searchForMotifSites.py
--- a/Makefile
View file @5f6cb56
+++ b/Makefile
View file @5f6cb56
 include EditMe
 ICONCERT=$(CPLEXDir)/concert/include
 ICPLEX=$(CPLEXDir)/cplex/include
- INUPACK=/usr/local/include/nupack
 LCONCERT=$(CPLEXDir)/concert/lib/x86-64_linux/static_pic/
 LCPLEX=$(CPLEXDir)/cplex/lib/x86-64_linux/static_pic/
 
@@ -10,12 +9,12 @@ TARGET   = biominserter
 
 CC	   = clang++
 # compiling flags here
- CFLAGS   = -Icppsrc/ -I/usr/local/include -I$(ICONCERT) -I$(ICPLEX) -I$(INUPACK) -I$(IEIGEN) -I$(IEIGEN)/unsupported -g -fopenmp=libomp
+ CFLAGS   = -Icppsrc/ -I/usr/local/include -I$(ICONCERT) -I$(ICPLEX) -I$(INUPACK) -I$(IEIGEN) -O3
 CXXFLAGS = --std=c++17 -Wall -Wpedantic -Wextra -Wno-ignored-attributes -Wno-unused-variable
 
 LINKER   = clang++
 # linking flags here
- LDFLAGS   = -lRNA -lconcert -lilocplex -lcplex -lm -lomp -lpthread -ldl -lboost_system -lboost_filesystem -lboost_program_options -L$(LCONCERT) -L$(LCPLEX) -lnupackpfunc -lnupackutils -lnupackconc
+ LDFLAGS   = -L$(LCONCERT) -L$(LCPLEX) -lconcert -lilocplex -lcplex -lpthread -ldl -lnupackpfunc -lnupackutils
 
 # change these to proper directories where each file should be
 SRCDIR   = cppsrc
--- a/README.md
View file @5f6cb56
+++ b/README.md
View file @5f6cb56
 This is a bi-objective integer programming algorithm.
 It predicts the secondary structure of a RNA sequence with pieces of 3D information (non-canonical contacts) at some places, 
- by identifying zones that can fold like known motifs.
+ by identifying zones that can fold like known motifs from the RNA 3D Motif Atlas.
 
 1/ How it works
 ===================================
@@ -9,7 +9,7 @@ INPUT:
 
 THEN
 - Identifies possible 2D folds with RNAsubopt.
- - Knowing possible 2D folds, locate every possibly unpaired loop (hairpin loop, internal loop, multiple junction...)
+ - Knowing possible 2D folds, locate every possibly unpaired loop (hairpin loop, internal loop, multiple loop...)
 - align each unpaired loop to the catalogue of models of known RNA motifs (The 3D Motif Atlas of the BGSU RNA group)
 - retrieve a list of potential motif-insertion-sites in the RNA sequence. Use them to define the constraints for the IP problem.
 - Solve a bi-objective IP problem: 
@@ -19,7 +19,7 @@ THEN
 OUTPUT:
 - A set of secondary structures from the pareto front,
 - The list of known motif inserted in the corresponding structures (and the non-canonical contacts)
- - (Optionally, lower score structures from k-Pareto sets.)
+ - (lower score structures from k-Pareto sets, not implemented yet.)
 
 2/ Installation
 ==================================
--- a/cppsrc/MOIP.cpp
View file @5f6cb56
+++ b/cppsrc/MOIP.cpp
View file @5f6cb56
 #include "MOIP.h"
- #include "SecondaryStructure.h"
- #include "rna.h"
 #include <algorithm>
 #include <boost/format.hpp>
 #include <cfloat>
@@ -19,8 +17,6 @@ using std::endl;
 using std::make_pair;
 using std::vector;
 
- uint MOIP::ncores = 0;
- 
 MOIP::MOIP(const RNA& rna, const vector<Motif>& insertionSites, float pthreshold)
 : rna_(rna), insertion_sites_(insertionSites), theta_{pthreshold}
 {
@@ -186,7 +182,7 @@ void MOIP::define_problem_constraints(void)
     cout << "\t>ensuring there are at most 1 pairing by nucleotide..." << endl;
     uint u, v, count;
     uint n = rna_.get_RNA_length();
-     for (u = 0; u < n - 6; u++) {
+     for (u = 0; u < n; u++) {
         count = 0;
         IloExpr c1(env_);
         for (v = 0; v < u; v++)
@@ -206,7 +202,7 @@ void MOIP::define_problem_constraints(void)
     }
     // forbid lonely basepairs
     cout << "\t>forbidding lonely basepairs..." << endl;
-     for (u = 0; u < n - 6; u++) {
+     for (u = 0; u < n; u++) {
         IloExpr c2(env_);    // for the case where s[u] is paired to s[v], v>u
         count = 0;
         for (v = u; v < n; v++)
@@ -223,7 +219,7 @@ void MOIP::define_problem_constraints(void)
             cout << "\t\t" << (c2 >= 0) << endl;
         }
     }
-     for (v = 5; v < n; v++) {
+     for (v = 0; v < n; v++) {
         IloExpr c2p(env_);    // for the case where s[u] is paired to s[v], v<u
         count = 0;
         for (u = 0; u <= v - 2; u++)
--- a/cppsrc/MOIP.h
View file @5f6cb56
+++ b/cppsrc/MOIP.h
View file @5f6cb56
@@ -15,8 +15,6 @@ const double PRECISION = 0.0001;
 class MOIP
 {
     public:
-     static uint ncores;
- 
     MOIP(const RNA& rna, const vector<Motif>& motifSites, float pthreshold);
     ~MOIP(void);
     SecondaryStructure        solve_objective(int o, double min, double max);
--- a/cppsrc/biominserter.cpp
View file @5f6cb56
+++ b/cppsrc/biominserter.cpp
View file @5f6cb56
 /***
         Biominserter, Louis Becquey, nov 2018
-         inspired but not copied from Biokop, IPKnot and Nupack
 ***/
 
 #include <algorithm>
@@ -32,12 +31,6 @@ Motif parse_csv_line(string line)
 
 int main(int argc, char* argv[])
 {
-     // float   time;
-     // clock_t t1, t2;
-     // t1 = clock();
- 
-     MOIP::ncores = thread::hardware_concurrency() - 1;
- 
     if (argc != 4) {
         cerr << argc << " arguments specified !" << endl;
         cerr << "Please specify the following input files:" << endl;
--- a/cppsrc/rna.cpp
View file @5f6cb56
+++ b/cppsrc/rna.cpp
View file @5f6cb56
 #include <iostream>
- #include <string>
- #include <vector>
- #include <iostream>
 // extern "C" {
 // #include <ViennaRNA/fold.h>
 // #include <ViennaRNA/part_func.h>
@@ -16,7 +13,7 @@ extern "C" {
 
 extern DBL_TYPE* pairPrPbg;    // for pseudoknots
 extern DBL_TYPE* pairPrPb;     // for pseudoknots
- extern double CUTOFF;
+ extern double    CUTOFF;
 
 using namespace Eigen;
 using std::cerr;
@@ -27,9 +24,8 @@ using std::string;
 using std::vector;
 
 RNA::RNA(string name, string seq)
- : pair_map(Matrix<pair_t, 5, 5>::Constant(PAIR_OTHER)), name_(name), seq_(seq), n_(seq.size()), pij_(MatrixXf::Zero(n_, n_))
+ : name_(name), seq_(seq), n_(seq.size()), pij_(MatrixXf::Zero(n_, n_))    // pair_map(Matrix<pair_t, 5, 5>::Constant(PAIR_OTHER)),
 {
-     bseq_.reserve(n_);
     vector<char> unknown_chars;
     bool         contains_T = false;
     for (char c : seq) {
@@ -40,7 +36,6 @@ RNA::RNA(string name, string seq)
         if (base_type(c) == BASE_N) {
             unknown_chars.push_back(c);
         }
-         bseq_.push_back(base_type(c));
     }
     if (contains_T) cout << "\tWARNING: Thymines automatically replaced by uraciles.";
     if (unknown_chars.size() > 0) {
@@ -48,12 +43,6 @@ RNA::RNA(string name, string seq)
         for (char c : unknown_chars) cout << c << " ";
         cout << endl;
     }
-     pair_map(BASE_A, BASE_U) = PAIR_AU;
-     pair_map(BASE_U, BASE_A) = PAIR_UA;
-     pair_map(BASE_C, BASE_G) = PAIR_CG;
-     pair_map(BASE_G, BASE_C) = PAIR_GC;
-     pair_map(BASE_G, BASE_U) = PAIR_GU;
-     pair_map(BASE_U, BASE_G) = PAIR_UG;
     cout << "\t>sequence formatted" << endl;
 
     // // Compute using ViennaRNA
@@ -75,11 +64,11 @@ RNA::RNA(string name, string seq)
 
     // Compute using Nupack
     cout << "\t>computing pairing probabilities..." << endl;
-     DBL_TYPE                 pf;
-     int                    length, tmpLength;
-     int                    i, j, q, r;
-     char                   seqChar[MAXSEQLENGTH];    // Complete sequence
-     int                    seqNum[MAXSEQLENGTH + 1];
+     DBL_TYPE pf;
+     int      length, tmpLength;
+     int      i, j, q, r;
+     char     seqChar[MAXSEQLENGTH];    // Complete sequence
+     int      seqNum[MAXSEQLENGTH + 1];
 
     // Init parameters
     DANGLETYPE                         = 1;
@@ -105,18 +94,19 @@ RNA::RNA(string name, string seq)
 
     // cout << "\t\t>Base pair probabilities:" << endl;
     for (i = 0; i < length; i++) {
-         for (j = i + 1; j < length; j++) { // upper diagonal
-             pij_(i,j) = pairPr[(length + 1) * i + j];
-             // printf("\t\t%d %d\t%.4Le + %.4Le = %.4Le\t%s\n",i + 1,j + 1, pairPrPb[(length + 1) * i + j], pairPrPbg[(length + 1) * i + j], pairPr[(length + 1) * i + j], pairPrPb[(length + 1) * i + j]+ pairPrPbg[(length + 1) * i + j] == pairPr[(length + 1) * i + j] ? "CHECK" : "ERROR");
+         for (j = i + 1; j < length; j++) {    // upper diagonal
+             pij_(i, j) = pairPr[(length + 1) * i + j];
+             // printf("\t\t%d %d\t%.4Le + %.4Le = %.4Le\t%s\n",i + 1,j + 1, pairPrPb[(length + 1) * i + j], pairPrPbg[(length
+             // + 1) * i + j], pairPr[(length + 1) * i + j], pairPrPb[(length + 1) * i + j]+ pairPrPbg[(length + 1) * i + j] == pairPr[(length + 1) * i + j] ? "CHECK" : "ERROR");
         }
     }
     cout << endl << "\t\t>Fast checking..." << endl;
     vector<double> p_unpaired = vector<double>(n_, 0.0);
     for (i = 0; i < length; i++) {
-         p_unpaired [i] = pairPr[(length + 1) * i + j];
-         double sum = 0.0;
-         for (j = 0; j<length; j++) sum += pij_(i,j);
-         printf("\t\t%d\tunpaired: %.4e\tpaired(pK+noPK): %.4e\tTotal: %f\n", i + 1, p_unpaired[i], sum, p_unpaired[i]+sum);
+         p_unpaired[i] = pairPr[(length + 1) * i + j];
+         double sum    = 0.0;
+         for (j = 0; j < length; j++) sum += pij_(i, j);
+         printf("\t\t%d\tunpaired: %.4e\tpaired(pK+noPK): %.4e\tTotal: %f\n", i + 1, p_unpaired[i], sum, p_unpaired[i] + sum);
     }
     free(pairPr);
     free(pairPrPbg);
--- a/cppsrc/rna.h
View file @5f6cb56
+++ b/cppsrc/rna.h
View file @5f6cb56
@@ -15,7 +15,7 @@ using std::string;
 using std::vector;
 
 #ifndef NUPACK_SHARED_CONSTANTS_H__
- enum base_t { BASE_N = 0, BASE_A, BASE_C, BASE_G, BASE_U }; // Comment if you include nupack/shared.h
+ enum base_t { BASE_N = 0, BASE_A, BASE_C, BASE_G, BASE_U };    // Comment if you include nupack/shared.h
 #else
 typedef int base_t;
 #endif
@@ -30,24 +30,18 @@ class RNA
     string get_seq(void) const;
     uint   get_RNA_length(void) const;
     void   print_basepair_p_matrix(float theta) const;
-     bool   is_canonical_basepair(size_t u, size_t v) const;
 
     private:
-     base_t                                   base_type(char x) const;
-     pair_t                                   pair_type(int i, int j) const;
- 
-     Matrix<pair_t, 5, 5> pair_map;
-     string               name_;    // name of the rna
-     string               seq_;     // sequence of the rna with chars
-     vector<base_t>       bseq_;    // sequence of the rna with base_ts
-     uint                 n_;       // length of the rna
-     MatrixXf             pij_;     // matrix of basepair probabilities
+     base_t base_type(char x) const;
+ 
+     string   name_;    // name of the rna
+     string   seq_;     // sequence of the rna with chars
+     uint     n_;       // length of the rna
+     MatrixXf pij_;     // matrix of basepair probabilities
 };
 
 inline float  RNA::get_pij(int i, int j) { return pij_(i, j); }
 inline uint   RNA::get_RNA_length() const { return n_; }
- inline pair_t RNA::pair_type(int i, int j) const { return pair_map(bseq_[i], bseq_[j]); }
 inline string RNA::get_seq(void) const { return seq_; }
- inline bool   RNA::is_canonical_basepair(size_t u, size_t v) const { return pair_type(u, v) != PAIR_OTHER; }
 
 #endif
--- a/doc/RNA.bib
View file @5f6cb56
+++ b/doc/RNA.bib
View file @5f6cb56
 
+ 
+ @article{cruz2011sequence,
+   title={Sequence-based identification of 3D structural modules in RNA with RMDetect},
+   author={Cruz, Jos{\'e} Almeida and Westhof, Eric},
+   journal={Nature methods},
+   volume={8},
+   number={6},
+   pages={513},
+   year={2011},
+   publisher={Nature Publishing Group}
+ }
+ 
 @article{djelloul_automated_2008,
 	title = {Automated motif extraction and classification in {RNA} tertiary structures},
 	volume = {14},
@@ -340,23 +352,6 @@
 	file = {Full Text PDF:/nhome/siniac/lbecquey/Zotero/storage/4YMW5M4S/Legendre et al. - 2018 - Bi-objective integer programming for RNA secondary.pdf:application/pdf;Snapshot:/nhome/siniac/lbecquey/Zotero/storage/XP46BMHH/s12859-018-2007-7.html:text/html}
 }
 
- @article{legendre_bi-objective_2018-1,
- 	title = {Bi-objective integer programming for {RNA} secondary structure prediction with pseudoknots},
- 	volume = {19},
- 	issn = {1471-2105},
- 	url = {https://doi.org/10.1186/s12859-018-2007-7},
- 	doi = {10.1186/s12859-018-2007-7},
- 	abstract = {RNA structure prediction is an important field in bioinformatics, and numerous methods and tools have been proposed. Pseudoknots are specific motifs of RNA secondary structures that are difficult to predict. Almost all existing methods are based on a single model and return one solution, often missing the real structure. An alternative approach would be to combine different models and return a (small) set of solutions, maximizing its quality and diversity in order to increase the probability that it contains the real structure.},
- 	number = {1},
- 	urldate = {2018-10-08},
- 	journal = {BMC Bioinformatics},
- 	author = {Legendre, Audrey and Angel, Eric and Tahi, Fariza},
- 	month = jan,
- 	year = {2018},
- 	pages = {13},
- 	file = {Full Text PDF:/nhome/siniac/lbecquey/Zotero/storage/J36PRAPX/Legendre et al. - 2018 - Bi-objective integer programming for RNA secondary.pdf:application/pdf;Snapshot:/nhome/siniac/lbecquey/Zotero/storage/BZCRBGLK/s12859-018-2007-7.html:text/html}
- }
- 
 @article{sarver_fr3d:_2008,
 	title = {{FR}3D: finding local and composite recurrent structural motifs in {RNA} 3D structures},
 	volume = {56},
@@ -450,6 +445,25 @@
 	file = {Full Text PDF:/nhome/siniac/lbecquey/Zotero/storage/RWXNJQH6/Antczak et al. - 2014 - RNApdbee—a webserver to derive secondary structure.pdf:application/pdf;Snapshot:/nhome/siniac/lbecquey/Zotero/storage/H6CZVH4F/2435287.html:text/html}
 }
 
+ 
+ @article{dirksAlgorithmComputingNucleic2004,
+   title = {An Algorithm for Computing Nucleic Acid Base-Pairing Probabilities Including Pseudoknots},
+   volume = {25},
+   copyright = {Copyright \textcopyright{} 2004 Wiley Periodicals, Inc.},
+   issn = {1096-987X},
+   doi = {10.1002/jcc.20057},
+   abstract = {Given a nucleic acid sequence, a recent algorithm allows the calculation of the partition function over secondary structure space including a class of physically relevant pseudoknots. Here, we present a method for computing base-pairing probabilities starting from the output of this partition function algorithm. The approach relies on the calculation of recursion probabilities that are computed by backtracking through the partition function algorithm, applying a particular transformation at each step. This transformation is applicable to any partition function algorithm that follows the same basic dynamic programming paradigm. Base-pairing probabilities are useful for analyzing the equilibrium ensemble properties of natural and engineered nucleic acids, as demonstrated for a human telomerase RNA and a synthetic DNA nanostructure. \textcopyright{} 2004 Wiley Periodicals, Inc. J Comput Chem 25: 1295\textendash{}1304, 2004},
+   language = {en},
+   number = {10},
+   journal = {Journal of Computational Chemistry},
+   author = {Dirks, Robert M. and Pierce, Niles A.},
+   year = {2004},
+   keywords = {pseudoknots,RNA,DNA,base-pairing probabilities,partition function},
+   pages = {1295-1304},
+   file = {/nhome/siniac/lbecquey/Zotero/storage/LA8RHBVN/jcc.html}
+ }
+ 
+ 
 @article{laing_computational_2010,
 	title = {Computational approaches to 3D modeling of {RNA}},
 	volume = {22},
@@ -500,6 +514,18 @@
 	file = {ScienceDirect Snapshot:/nhome/siniac/lbecquey/Zotero/storage/LWS5RU5Z/S0959440X11000674.html:text/html}
 }
 
+ 
+ @article{reinharz2018mining,
+   title={Mining for recurrent long-range interactions in RNA structures reveals embedded hierarchies in network families},
+   author={Reinharz, Vladimir and Soul{\'e}, Antoine and Westhof, Eric and Waldisp{\"u}hl, J{\'e}r{\^o}me and Denise, Alain},
+   journal={Nucleic Acids Research},
+   volume={46},
+   number={8},
+   pages={3841--3851},
+   year={2018},
+   publisher={Oxford University Press}
+ }
+ 
 @article{sato_ipknot:_2011,
 	title = {{IPknot}: fast and accurate prediction of {RNA} secondary structures with pseudoknots using integer programming},
 	volume = {27},
@@ -520,6 +546,7 @@
 	file = {Texte intégral:/nhome/siniac/lbecquey/Zotero/storage/EEWT77EA/Sato et al. - 2011 - IPknot fast and accurate prediction of RNA second.pdf:application/pdf}
 }
 
+ 
 @article{zirbel_identifying_2015,
 	title = {Identifying novel sequence variants of {RNA} 3D motifs},
 	volume = {43},
@@ -537,4 +564,4 @@
 	pmcid = {PMC4551918},
 	pages = {7504--7520},
 	file = {PubMed Central Full Text PDF:/nhome/siniac/lbecquey/Zotero/storage/C68JKL5J/Zirbel et al. - 2015 - Identifying novel sequence variants of RNA 3D moti.pdf:application/pdf}
- }
\ No newline at end of file
+ }
--- a/doc/graph_abstract.png 0 → 100644
View file @5f6cb56
+++ b/doc/graph_abstract.png 0 → 100644
View file @5f6cb56
--- a/doc/motif_forcing_constraints.pdf
View file @5f6cb56
+++ b/doc/motif_forcing_constraints.pdf
View file @5f6cb56
--- a/doc/motif_forcing_constraints.tex
View file @5f6cb56
+++ b/doc/motif_forcing_constraints.tex
View file @5f6cb56
--- a/searchForMotifSites.py
View file @5f6cb56
+++ b/searchForMotifSites.py
View file @5f6cb56
@@ -3,6 +3,7 @@
 from sys import argv
 import subprocess
 import inspect
+ import RNA
 from multiprocessing import Pool, TimeoutError, cpu_count
 from os import path, makedirs, getcwd, chdir, devnull