Louis BECQUEY

Nettoyage + Update Doc

include EditMe
ICONCERT=$(CPLEXDir)/concert/include
ICPLEX=$(CPLEXDir)/cplex/include
INUPACK=/usr/local/include/nupack
LCONCERT=$(CPLEXDir)/concert/lib/x86-64_linux/static_pic/
LCPLEX=$(CPLEXDir)/cplex/lib/x86-64_linux/static_pic/
......@@ -10,12 +9,12 @@ TARGET = biominserter
CC = clang++
# compiling flags here
CFLAGS = -Icppsrc/ -I/usr/local/include -I$(ICONCERT) -I$(ICPLEX) -I$(INUPACK) -I$(IEIGEN) -I$(IEIGEN)/unsupported -g -fopenmp=libomp
CFLAGS = -Icppsrc/ -I/usr/local/include -I$(ICONCERT) -I$(ICPLEX) -I$(INUPACK) -I$(IEIGEN) -O3
CXXFLAGS = --std=c++17 -Wall -Wpedantic -Wextra -Wno-ignored-attributes -Wno-unused-variable
LINKER = clang++
# linking flags here
LDFLAGS = -lRNA -lconcert -lilocplex -lcplex -lm -lomp -lpthread -ldl -lboost_system -lboost_filesystem -lboost_program_options -L$(LCONCERT) -L$(LCPLEX) -lnupackpfunc -lnupackutils -lnupackconc
LDFLAGS = -L$(LCONCERT) -L$(LCPLEX) -lconcert -lilocplex -lcplex -lpthread -ldl -lnupackpfunc -lnupackutils
# change these to proper directories where each file should be
SRCDIR = cppsrc
......
This is a bi-objective integer programming algorithm.
It predicts the secondary structure of a RNA sequence with pieces of 3D information (non-canonical contacts) at some places,
by identifying zones that can fold like known motifs.
by identifying zones that can fold like known motifs from the RNA 3D Motif Atlas.
1/ How it works
===================================
......@@ -9,7 +9,7 @@ INPUT:
THEN
- Identifies possible 2D folds with RNAsubopt.
- Knowing possible 2D folds, locate every possibly unpaired loop (hairpin loop, internal loop, multiple junction...)
- Knowing possible 2D folds, locate every possibly unpaired loop (hairpin loop, internal loop, multiple loop...)
- align each unpaired loop to the catalogue of models of known RNA motifs (The 3D Motif Atlas of the BGSU RNA group)
- retrieve a list of potential motif-insertion-sites in the RNA sequence. Use them to define the constraints for the IP problem.
- Solve a bi-objective IP problem:
......@@ -19,7 +19,7 @@ THEN
OUTPUT:
- A set of secondary structures from the pareto front,
- The list of known motif inserted in the corresponding structures (and the non-canonical contacts)
- (Optionally, lower score structures from k-Pareto sets.)
- (lower score structures from k-Pareto sets, not implemented yet.)
2/ Installation
==================================
......
#include "MOIP.h"
#include "SecondaryStructure.h"
#include "rna.h"
#include <algorithm>
#include <boost/format.hpp>
#include <cfloat>
......@@ -19,8 +17,6 @@ using std::endl;
using std::make_pair;
using std::vector;
uint MOIP::ncores = 0;
MOIP::MOIP(const RNA& rna, const vector<Motif>& insertionSites, float pthreshold)
: rna_(rna), insertion_sites_(insertionSites), theta_{pthreshold}
{
......@@ -186,7 +182,7 @@ void MOIP::define_problem_constraints(void)
cout << "\t>ensuring there are at most 1 pairing by nucleotide..." << endl;
uint u, v, count;
uint n = rna_.get_RNA_length();
for (u = 0; u < n - 6; u++) {
for (u = 0; u < n; u++) {
count = 0;
IloExpr c1(env_);
for (v = 0; v < u; v++)
......@@ -206,7 +202,7 @@ void MOIP::define_problem_constraints(void)
}
// forbid lonely basepairs
cout << "\t>forbidding lonely basepairs..." << endl;
for (u = 0; u < n - 6; u++) {
for (u = 0; u < n; u++) {
IloExpr c2(env_); // for the case where s[u] is paired to s[v], v>u
count = 0;
for (v = u; v < n; v++)
......@@ -223,7 +219,7 @@ void MOIP::define_problem_constraints(void)
cout << "\t\t" << (c2 >= 0) << endl;
}
}
for (v = 5; v < n; v++) {
for (v = 0; v < n; v++) {
IloExpr c2p(env_); // for the case where s[u] is paired to s[v], v<u
count = 0;
for (u = 0; u <= v - 2; u++)
......
......@@ -15,8 +15,6 @@ const double PRECISION = 0.0001;
class MOIP
{
public:
static uint ncores;
MOIP(const RNA& rna, const vector<Motif>& motifSites, float pthreshold);
~MOIP(void);
SecondaryStructure solve_objective(int o, double min, double max);
......
/***
Biominserter, Louis Becquey, nov 2018
inspired but not copied from Biokop, IPKnot and Nupack
***/
#include <algorithm>
......@@ -32,12 +31,6 @@ Motif parse_csv_line(string line)
int main(int argc, char* argv[])
{
// float time;
// clock_t t1, t2;
// t1 = clock();
MOIP::ncores = thread::hardware_concurrency() - 1;
if (argc != 4) {
cerr << argc << " arguments specified !" << endl;
cerr << "Please specify the following input files:" << endl;
......
#include <iostream>
#include <string>
#include <vector>
#include <iostream>
// extern "C" {
// #include <ViennaRNA/fold.h>
// #include <ViennaRNA/part_func.h>
......@@ -16,7 +13,7 @@ extern "C" {
extern DBL_TYPE* pairPrPbg; // for pseudoknots
extern DBL_TYPE* pairPrPb; // for pseudoknots
extern double CUTOFF;
extern double CUTOFF;
using namespace Eigen;
using std::cerr;
......@@ -27,9 +24,8 @@ using std::string;
using std::vector;
RNA::RNA(string name, string seq)
: pair_map(Matrix<pair_t, 5, 5>::Constant(PAIR_OTHER)), name_(name), seq_(seq), n_(seq.size()), pij_(MatrixXf::Zero(n_, n_))
: name_(name), seq_(seq), n_(seq.size()), pij_(MatrixXf::Zero(n_, n_)) // pair_map(Matrix<pair_t, 5, 5>::Constant(PAIR_OTHER)),
{
bseq_.reserve(n_);
vector<char> unknown_chars;
bool contains_T = false;
for (char c : seq) {
......@@ -40,7 +36,6 @@ RNA::RNA(string name, string seq)
if (base_type(c) == BASE_N) {
unknown_chars.push_back(c);
}
bseq_.push_back(base_type(c));
}
if (contains_T) cout << "\tWARNING: Thymines automatically replaced by uraciles.";
if (unknown_chars.size() > 0) {
......@@ -48,12 +43,6 @@ RNA::RNA(string name, string seq)
for (char c : unknown_chars) cout << c << " ";
cout << endl;
}
pair_map(BASE_A, BASE_U) = PAIR_AU;
pair_map(BASE_U, BASE_A) = PAIR_UA;
pair_map(BASE_C, BASE_G) = PAIR_CG;
pair_map(BASE_G, BASE_C) = PAIR_GC;
pair_map(BASE_G, BASE_U) = PAIR_GU;
pair_map(BASE_U, BASE_G) = PAIR_UG;
cout << "\t>sequence formatted" << endl;
// // Compute using ViennaRNA
......@@ -75,11 +64,11 @@ RNA::RNA(string name, string seq)
// Compute using Nupack
cout << "\t>computing pairing probabilities..." << endl;
DBL_TYPE pf;
int length, tmpLength;
int i, j, q, r;
char seqChar[MAXSEQLENGTH]; // Complete sequence
int seqNum[MAXSEQLENGTH + 1];
DBL_TYPE pf;
int length, tmpLength;
int i, j, q, r;
char seqChar[MAXSEQLENGTH]; // Complete sequence
int seqNum[MAXSEQLENGTH + 1];
// Init parameters
DANGLETYPE = 1;
......@@ -105,18 +94,19 @@ RNA::RNA(string name, string seq)
// cout << "\t\t>Base pair probabilities:" << endl;
for (i = 0; i < length; i++) {
for (j = i + 1; j < length; j++) { // upper diagonal
pij_(i,j) = pairPr[(length + 1) * i + j];
// printf("\t\t%d %d\t%.4Le + %.4Le = %.4Le\t%s\n",i + 1,j + 1, pairPrPb[(length + 1) * i + j], pairPrPbg[(length + 1) * i + j], pairPr[(length + 1) * i + j], pairPrPb[(length + 1) * i + j]+ pairPrPbg[(length + 1) * i + j] == pairPr[(length + 1) * i + j] ? "CHECK" : "ERROR");
for (j = i + 1; j < length; j++) { // upper diagonal
pij_(i, j) = pairPr[(length + 1) * i + j];
// printf("\t\t%d %d\t%.4Le + %.4Le = %.4Le\t%s\n",i + 1,j + 1, pairPrPb[(length + 1) * i + j], pairPrPbg[(length
// + 1) * i + j], pairPr[(length + 1) * i + j], pairPrPb[(length + 1) * i + j]+ pairPrPbg[(length + 1) * i + j] == pairPr[(length + 1) * i + j] ? "CHECK" : "ERROR");
}
}
cout << endl << "\t\t>Fast checking..." << endl;
vector<double> p_unpaired = vector<double>(n_, 0.0);
for (i = 0; i < length; i++) {
p_unpaired [i] = pairPr[(length + 1) * i + j];
double sum = 0.0;
for (j = 0; j<length; j++) sum += pij_(i,j);
printf("\t\t%d\tunpaired: %.4e\tpaired(pK+noPK): %.4e\tTotal: %f\n", i + 1, p_unpaired[i], sum, p_unpaired[i]+sum);
p_unpaired[i] = pairPr[(length + 1) * i + j];
double sum = 0.0;
for (j = 0; j < length; j++) sum += pij_(i, j);
printf("\t\t%d\tunpaired: %.4e\tpaired(pK+noPK): %.4e\tTotal: %f\n", i + 1, p_unpaired[i], sum, p_unpaired[i] + sum);
}
free(pairPr);
free(pairPrPbg);
......
......@@ -15,7 +15,7 @@ using std::string;
using std::vector;
#ifndef NUPACK_SHARED_CONSTANTS_H__
enum base_t { BASE_N = 0, BASE_A, BASE_C, BASE_G, BASE_U }; // Comment if you include nupack/shared.h
enum base_t { BASE_N = 0, BASE_A, BASE_C, BASE_G, BASE_U }; // Comment if you include nupack/shared.h
#else
typedef int base_t;
#endif
......@@ -30,24 +30,18 @@ class RNA
string get_seq(void) const;
uint get_RNA_length(void) const;
void print_basepair_p_matrix(float theta) const;
bool is_canonical_basepair(size_t u, size_t v) const;
private:
base_t base_type(char x) const;
pair_t pair_type(int i, int j) const;
Matrix<pair_t, 5, 5> pair_map;
string name_; // name of the rna
string seq_; // sequence of the rna with chars
vector<base_t> bseq_; // sequence of the rna with base_ts
uint n_; // length of the rna
MatrixXf pij_; // matrix of basepair probabilities
base_t base_type(char x) const;
string name_; // name of the rna
string seq_; // sequence of the rna with chars
uint n_; // length of the rna
MatrixXf pij_; // matrix of basepair probabilities
};
inline float RNA::get_pij(int i, int j) { return pij_(i, j); }
inline uint RNA::get_RNA_length() const { return n_; }
inline pair_t RNA::pair_type(int i, int j) const { return pair_map(bseq_[i], bseq_[j]); }
inline string RNA::get_seq(void) const { return seq_; }
inline bool RNA::is_canonical_basepair(size_t u, size_t v) const { return pair_type(u, v) != PAIR_OTHER; }
#endif
......
@article{cruz2011sequence,
title={Sequence-based identification of 3D structural modules in RNA with RMDetect},
author={Cruz, Jos{\'e} Almeida and Westhof, Eric},
journal={Nature methods},
volume={8},
number={6},
pages={513},
year={2011},
publisher={Nature Publishing Group}
}
@article{djelloul_automated_2008,
title = {Automated motif extraction and classification in {RNA} tertiary structures},
volume = {14},
......@@ -340,23 +352,6 @@
file = {Full Text PDF:/nhome/siniac/lbecquey/Zotero/storage/4YMW5M4S/Legendre et al. - 2018 - Bi-objective integer programming for RNA secondary.pdf:application/pdf;Snapshot:/nhome/siniac/lbecquey/Zotero/storage/XP46BMHH/s12859-018-2007-7.html:text/html}
}
@article{legendre_bi-objective_2018-1,
title = {Bi-objective integer programming for {RNA} secondary structure prediction with pseudoknots},
volume = {19},
issn = {1471-2105},
url = {https://doi.org/10.1186/s12859-018-2007-7},
doi = {10.1186/s12859-018-2007-7},
abstract = {RNA structure prediction is an important field in bioinformatics, and numerous methods and tools have been proposed. Pseudoknots are specific motifs of RNA secondary structures that are difficult to predict. Almost all existing methods are based on a single model and return one solution, often missing the real structure. An alternative approach would be to combine different models and return a (small) set of solutions, maximizing its quality and diversity in order to increase the probability that it contains the real structure.},
number = {1},
urldate = {2018-10-08},
journal = {BMC Bioinformatics},
author = {Legendre, Audrey and Angel, Eric and Tahi, Fariza},
month = jan,
year = {2018},
pages = {13},
file = {Full Text PDF:/nhome/siniac/lbecquey/Zotero/storage/J36PRAPX/Legendre et al. - 2018 - Bi-objective integer programming for RNA secondary.pdf:application/pdf;Snapshot:/nhome/siniac/lbecquey/Zotero/storage/BZCRBGLK/s12859-018-2007-7.html:text/html}
}
@article{sarver_fr3d:_2008,
title = {{FR}3D: finding local and composite recurrent structural motifs in {RNA} 3D structures},
volume = {56},
......@@ -450,6 +445,25 @@
file = {Full Text PDF:/nhome/siniac/lbecquey/Zotero/storage/RWXNJQH6/Antczak et al. - 2014 - RNApdbee—a webserver to derive secondary structure.pdf:application/pdf;Snapshot:/nhome/siniac/lbecquey/Zotero/storage/H6CZVH4F/2435287.html:text/html}
}
@article{dirksAlgorithmComputingNucleic2004,
title = {An Algorithm for Computing Nucleic Acid Base-Pairing Probabilities Including Pseudoknots},
volume = {25},
copyright = {Copyright \textcopyright{} 2004 Wiley Periodicals, Inc.},
issn = {1096-987X},
doi = {10.1002/jcc.20057},
abstract = {Given a nucleic acid sequence, a recent algorithm allows the calculation of the partition function over secondary structure space including a class of physically relevant pseudoknots. Here, we present a method for computing base-pairing probabilities starting from the output of this partition function algorithm. The approach relies on the calculation of recursion probabilities that are computed by backtracking through the partition function algorithm, applying a particular transformation at each step. This transformation is applicable to any partition function algorithm that follows the same basic dynamic programming paradigm. Base-pairing probabilities are useful for analyzing the equilibrium ensemble properties of natural and engineered nucleic acids, as demonstrated for a human telomerase RNA and a synthetic DNA nanostructure. \textcopyright{} 2004 Wiley Periodicals, Inc. J Comput Chem 25: 1295\textendash{}1304, 2004},
language = {en},
number = {10},
journal = {Journal of Computational Chemistry},
author = {Dirks, Robert M. and Pierce, Niles A.},
year = {2004},
keywords = {pseudoknots,RNA,DNA,base-pairing probabilities,partition function},
pages = {1295-1304},
file = {/nhome/siniac/lbecquey/Zotero/storage/LA8RHBVN/jcc.html}
}
@article{laing_computational_2010,
title = {Computational approaches to 3D modeling of {RNA}},
volume = {22},
......@@ -500,6 +514,18 @@
file = {ScienceDirect Snapshot:/nhome/siniac/lbecquey/Zotero/storage/LWS5RU5Z/S0959440X11000674.html:text/html}
}
@article{reinharz2018mining,
title={Mining for recurrent long-range interactions in RNA structures reveals embedded hierarchies in network families},
author={Reinharz, Vladimir and Soul{\'e}, Antoine and Westhof, Eric and Waldisp{\"u}hl, J{\'e}r{\^o}me and Denise, Alain},
journal={Nucleic Acids Research},
volume={46},
number={8},
pages={3841--3851},
year={2018},
publisher={Oxford University Press}
}
@article{sato_ipknot:_2011,
title = {{IPknot}: fast and accurate prediction of {RNA} secondary structures with pseudoknots using integer programming},
volume = {27},
......@@ -520,6 +546,7 @@
file = {Texte intégral:/nhome/siniac/lbecquey/Zotero/storage/EEWT77EA/Sato et al. - 2011 - IPknot fast and accurate prediction of RNA second.pdf:application/pdf}
}
@article{zirbel_identifying_2015,
title = {Identifying novel sequence variants of {RNA} 3D motifs},
volume = {43},
......@@ -537,4 +564,4 @@
pmcid = {PMC4551918},
pages = {7504--7520},
file = {PubMed Central Full Text PDF:/nhome/siniac/lbecquey/Zotero/storage/C68JKL5J/Zirbel et al. - 2015 - Identifying novel sequence variants of RNA 3D moti.pdf:application/pdf}
}
\ No newline at end of file
}
......
This diff is collapsed. Click to expand it.
......@@ -3,6 +3,7 @@
from sys import argv
import subprocess
import inspect
import RNA
from multiprocessing import Pool, TimeoutError, cpu_count
from os import path, makedirs, getcwd, chdir, devnull
......