Louis BECQUEY

Builtin option to allow/forbid pseudoknots

EditMe
rna1999.dG
.vscode/*
.vscode
# LaTeX temporary files
doc/*.toc
......
......@@ -20,6 +20,7 @@ using std::vector;
uint MOIP::obj_function_nbr_ = 1;
uint MOIP::obj_to_solve_ = 1;
double MOIP::precision_ = 1e-5;
bool MOIP::allow_pk_ = true;
unsigned getNumConstraints(IloModel& m)
{
......@@ -390,19 +391,21 @@ void MOIP::define_problem_constraints(void)
if (verbose_) cout << "\t\t>motif " << i << " : " << (c5 == jm1 * C(i, 0)) << endl;
}
// Forbid pseudoknots
if (verbose_) cout << "\t>forbidding pseudoknots..." << endl;
for (size_t u = 0; u < n - 6; u++)
for (size_t v = u + 4; v < n - 1; v++)
if (allowed_basepair(u, v))
for (size_t k = u + 1; k < v; ++k)
for (size_t l = v + 1; l < n; ++l)
if (allowed_basepair(k, l)) {
IloExpr c(env_);
c += y(u, v);
c += y(k, l);
model_.add(c <= 1);
if (verbose_) cout << "\t\t" << (c <= 1) << endl;
}
if (!this->allow_pk_) {
if (verbose_) cout << "\t>forbidding pseudoknots..." << endl;
for (size_t u = 0; u < n - 6; u++)
for (size_t v = u + 4; v < n - 1; v++)
if (allowed_basepair(u, v))
for (size_t k = u + 1; k < v; ++k)
for (size_t l = v + 1; l < n; ++l)
if (allowed_basepair(k, l)) {
IloExpr c(env_);
c += y(u, v);
c += y(k, l);
model_.add(c <= 1);
if (verbose_) cout << "\t\t" << (c <= 1) << endl;
}
}
}
void MOIP::search_between(double lambdaMin, double lambdaMax)
......
......@@ -27,8 +27,9 @@ class MOIP
void forbid_solutions_between(double min, double max);
IloEnv& get_env(void);
static uint obj_function_nbr_; // On what criteria do you want to insert motifs ?
static uint obj_to_solve_; // What objective do you prefer to solve in mono-objective portions of the algorithm ?
static double precision_; // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =(
static uint obj_to_solve_; // What objective do you prefer to solve in mono-objective portions of the algorithm ?
static double precision_; // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =(
static bool allow_pk_; // Wether we forbid pseudoknots (false) or allow them (true)
private:
bool is_undominated_yet(const SecondaryStructure& s);
......
/***
Biominserter, Louis Becquey, nov 2018
louis.becquey@univ-evry.fr
***/
#include <algorithm>
......@@ -67,27 +68,21 @@ int main(int argc, char* argv[])
/* ARGUMENT CHECKING */
po::options_description desc("Options");
desc.add_options()("help,h", "Print the help message")("version", "Print the program version")(
"seq,s", po::value<string>(&inputName)->required(), "Fasta file containing the RNA sequence")(
"descfolder,d",
po::value<string>(&motifs_path_name),
"A folder containing modules in .desc format, as produced by Djelloul & Denise's catalog program")(
"jar3dcsv",
po::value<string>(&motifs_path_name),
"A file containing the output of JAR3D's search for motifs in the sequence, as produced by test_on_RNAstrand.py")(
"bayespaircsv",
po::value<string>(&motifs_path_name),
"A file containing the output of BayesPairing's search for motifs in the sequence, as produced by "
"test_on_RNAstrand.py")("first-objective,c", po::value<unsigned int>(&MOIP::obj_to_solve_)->default_value(1), "Objective to solve in the mono-objective portions of the algorithm")(
"output,o", po::value<string>(&outputName), "A file to summarize the computation results")(
"theta,t",
po::value<float>(&theta_p_threshold)->default_value(0.001),
"Pairing probability threshold to consider or not the possibility of pairing")(
"type",
po::value<int>(&obj_function_nbr),
"If -f is provided, what objective function to use to include motifs: square of motif size in nucleotides like "
desc.add_options()
("help,h", "Print the help message")
("version", "Print the program version")
("seq,s", po::value<string>(&inputName)->required(), "Fasta file containing the RNA sequence")
("descfolder,d", po::value<string>(&motifs_path_name), "A folder containing modules in .desc format, as produced by Djelloul & Denise's catalog program")
("jar3dcsv", po::value<string>(&motifs_path_name), "A file containing the output of JAR3D's search for motifs in the sequence, as produced by test_on_RNAstrand.py")
("bayespaircsv", po::value<string>(&motifs_path_name), "A file containing the output of BayesPairing's search for motifs in the sequence, as produced by test_on_RNAstrand.py")
("first-objective,c", po::value<unsigned int>(&MOIP::obj_to_solve_)->default_value(1), "Objective to solve in the mono-objective portions of the algorithm")
("output,o", po::value<string>(&outputName), "A file to summarize the computation results")
("theta,t", po::value<float>(&theta_p_threshold)->default_value(0.001), "Pairing probability threshold to consider or not the possibility of pairing")
("type,f", po::value<int>(&obj_function_nbr), "What objective function to use to include motifs: square of motif size in nucleotides like "
"RNA-MoIP (1), jar3d score (2), motif size + jar3d score + number of components (3), motif size + number of "
"components (4)")("verbose,v", "Print what is happening to stdout");
"components (4)")
("disable-pseudoknots,n", "Add constraints forbidding the formation of pseudoknots")
("verbose,v", "Print what is happening to stdout");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
basename = remove_ext(inputName.c_str(), '.', '/');
......@@ -100,7 +95,6 @@ int main(int argc, char* argv[])
"structures with included known RNA modules."
<< endl
<< "developped by Louis Becquey (louis.becquey@univ-evry.fr), 2019" << endl
<< "NOTE THIS VERSION IS SUPPOSED TO MIMIC RNA MoIP (input data, objective functions, constraints)" << endl
<< endl
<< desc << endl;
return EXIT_SUCCESS;
......@@ -110,6 +104,7 @@ int main(int argc, char* argv[])
return EXIT_SUCCESS;
}
if (vm.count("verbose")) verbose = true;
if (vm.count("disable-pseudoknots")) MOIP::allow_pk_ = false;
if (!vm.count("jar3dcsv") and !vm.count("bayespaircsv") and !vm.count("descfolder")) {
cerr << "\033[31mYou must provide at least --jar3dcsv, --bayespaircsv or --descfolder.\033[0m See --help "
"for more "
......
{
"files.associations": {
"array": "cpp",
"initializer_list": "cpp",
"string_view": "cpp",
"utility": "cpp"
}
}
\ No newline at end of file
\relax
\citation{djelloul_automated_2008}
\citation{petrov_automated_2013}
\citation{reinharz2018mining}
\citation{dirksAlgorithmComputingNucleic2004}
\citation{zirbel_identifying_2015}
\citation{cruz2011sequence}
\citation{petrov_automated_2013}
\@writefile{toc}{\contentsline {section}{\numberline {1}Motivation}{2}}
\@writefile{toc}{\contentsline {section}{\numberline {2}Computation of basepair probabilities}{2}}
\@writefile{toc}{\contentsline {paragraph}{Discussion of that choice}{2}}
\@writefile{toc}{\contentsline {section}{\numberline {3}Detection of potential modules in the sequence input}{2}}
\@writefile{toc}{\contentsline {paragraph}{\textbf {Choice of a module model, choice of a sequence probabilistic model}}{2}}
\citation{lorenz_viennarna_2011}
\citation{sato_ipknot:_2011}
\citation{reinharz_towards_2012}
\citation{legendre_bi-objective_2018}
\@writefile{toc}{\contentsline {paragraph}{\textbf {Detection of the loops in an RNA sequence}}{3}}
\@writefile{toc}{\contentsline {section}{\numberline {4}Formulation of the IP problem}{3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Variables}{3}}
\citation{reinharz_towards_2012}
\newlabel{objectives}{{4.2}{4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Objectives }{4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}9 Constraints to bind them all}{4}}
\@writefile{toc}{\contentsline {paragraph}{Constraint to ensure there only is 0 or 1 canonical pairing by nucleotide}{4}}
\newlabel{constraint:1}{{1}{4}}
\@writefile{toc}{\contentsline {paragraph}{Constraints to forbid lonely base pairs}{4}}
\newlabel{constraint:2}{{2}{4}}
\newlabel{constraint:3}{{3}{4}}
\@writefile{toc}{\contentsline {paragraph}{Constraint to forbid pairings inside a module component}{4}}
\newlabel{constraint:4}{{4}{4}}
\@writefile{toc}{\contentsline {paragraph}{Constraint to forbid component to overlap}{4}}
\newlabel{constraint:5}{{5}{4}}
\citation{legendre_bi-objective_2018}
\citation{cplex}
\@writefile{toc}{\contentsline {paragraph}{Constraints to respect the structure of large motives ($\{ x\in M \tmspace +\thickmuskip {.2777em} | \tmspace +\thickmuskip {.2777em} \delimiter "026B30D x\delimiter "026B30D \geq 2\}$)}{5}}
\newlabel{constraint:6}{{6}{5}}
\newlabel{constraint:7}{{7}{5}}
\newlabel{constraint:8}{{8}{5}}
\@writefile{toc}{\contentsline {paragraph}{Constraint to forbid a previously found solution}{5}}
\newlabel{constraint:9}{{9}{5}}
\newlabel{methods}{{5}{5}}
\@writefile{toc}{\contentsline {section}{\numberline {5}Methods }{5}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Bi-objective algorithm}{5}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Solving the IP problem}{5}}
\citation{andronescu2008rna}
\bibstyle{unsrt}
\bibdata{RNA}
\bibcite{djelloul_automated_2008}{1}
\bibcite{petrov_automated_2013}{2}
\bibcite{reinharz2018mining}{3}
\bibcite{dirksAlgorithmComputingNucleic2004}{4}
\bibcite{zirbel_identifying_2015}{5}
\bibcite{cruz2011sequence}{6}
\bibcite{lorenz_viennarna_2011}{7}
\bibcite{sato_ipknot:_2011}{8}
\bibcite{reinharz_towards_2012}{9}
\bibcite{legendre_bi-objective_2018}{10}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Benchmarking of the module inclusion objective functions}{6}}
\newlabel{results}{{6}{6}}
\@writefile{toc}{\contentsline {section}{\numberline {6}Results }{6}}
\@writefile{toc}{\contentsline {subsection}{\numberline {6.1}Comparison of the 3 objective functions for motif insertion}{6}}
\newlabel{discussion}{{7}{6}}
\@writefile{toc}{\contentsline {section}{\numberline {7}Discussion }{6}}
\bibcite{cplex}{11}
\bibcite{andronescu2008rna}{12}
This is BibTeX, Version 0.99d (TeX Live 2017/Debian)
Capacity: max_strings=100000, hash_size=100000, hash_prime=85009
The top-level auxiliary file: motif_forcing_constraints.aux
The style file: unsrt.bst
Database file #1: RNA.bib
Warning--entry type for "cplex" isn't style-file defined
--line 570 of file RNA.bib
You've used 12 entries,
1791 wiz_defined-function locations,
524 strings with 6041 characters,
and the built_in function-call counts, 2939 in all, are:
= -- 250
> -- 152
< -- 0
+ -- 55
- -- 43
* -- 257
:= -- 487
add.period$ -- 36
call.type$ -- 12
change.case$ -- 12
chr.to.int$ -- 0
cite$ -- 12
duplicate$ -- 104
empty$ -- 275
format.name$ -- 43
if$ -- 622
int.to.chr$ -- 0
int.to.str$ -- 12
missing$ -- 11
newline$ -- 63
num.names$ -- 12
pop$ -- 17
preamble$ -- 1
purify$ -- 0
quote$ -- 0
skip$ -- 38
stack$ -- 0
substring$ -- 237
swap$ -- 12
text.length$ -- 0
text.prefix$ -- 0
top$ -- 0
type$ -- 0
warning$ -- 0
while$ -- 29
width$ -- 14
write$ -- 133
(There was 1 warning)
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
......@@ -57,14 +57,14 @@
\vspace{5cm}
\hrulefill
\flushright{
\flushright{
\textbf{under the supervision of:\\}
Fariza Tahi, HDR \\
Eric Angel, HDR \\
\textit{AROBAS, IBISC, Paris-Saclay University}\\
~ \\
\includegraphics[height=1cm]{logoIBISC.png} ~ ~
\includegraphics[height=1cm]{minisaclay.png}
Fariza Tahi, HDR \\
Eric Angel, HDR \\
\textit{AROBAS, IBISC, Paris-Saclay University}\\
~ \\
\includegraphics[height=1cm]{logoIBISC.png} ~ ~
\includegraphics[height=1cm]{minisaclay.png}
}
\flushleft
\hrulefill
......@@ -83,13 +83,13 @@
We now have quite consistent RNA module databases (RNA3Dmodule \cite{djelloul_automated_2008}, the RNA Motif Atlas \cite{petrov_automated_2013}, or CaRNAval \cite{reinharz2018mining}).
We call a RNA module the combined description of these points:
\begin{itemize}
\item A particular base-pairing pattern of canonical \& wobble pairing (bi-dimensional information),
\item A particular base-pairing pattern of canonical \& wobble pairing (bi-dimensional information),
\item A particular organisation of non-canonical contacts in space (tri-dimensional information).
\item A sequence or consensus sequence that we know to adopt that particular base-pairing organisation (sequence information), or at least a probabilistic model to predict if a given sequence will fold according to the module.
\end{itemize}
Then, a RNA module is direct data about how a sequence can fold in space. Here, we are interested in using them to predict the bi or tri dimensional structure of an RNA sequence from scratch.
Then, the algorithm we plan to use is the following:
\begin{itemize}
\item Find all possible occurences of known RNA modules in the query sequence by using the sequence probabilistic models of the modules against the query bases,
\item Define constraints on the secondary structure imposed by motives if they would be included
......@@ -130,7 +130,7 @@ This computation should be avoided in the future : It requires the computation o
Unfortunately, to avoid computing it twice, we should reimplement ourselves the algorithms instead of using the actual implementations. We have programs that output the basepair matrix but no structures, and programs that output structures but not the basepair matrix, and we don't have a program that can do both.
\section{Formulation of the IP problem}
This formulation is an improved merge between the IPknot \cite{sato_ipknot:_2011} formulation to fold RNAs with pseudoknots (but without "levels") and RNAMoIP's one \cite{reinharz_towards_2012} to include modules.
The bi-objective algorithm is inspired from BiokoP \cite{legendre_bi-objective_2018} and detailed later in section \ref{methods}.
% This problem has been inspired by IPknot's model \cite{sato_ipknot:_2011}, but without the use of what they call \textit{levels}, which are pseudoknot-free
......@@ -138,10 +138,10 @@ The bi-objective algorithm is inspired from BiokoP \cite{legendre_bi-objective_2
% Levels are mutually exclusive (a base cannot be paired in two different levels). The more levels you use, the more decision variables you have,
% but the more complex pseudoknots you are able to predict. IPknot and BiokoP use 2 levels, to predict simple knots.\\
% \begin{figure}[h]\centering \includegraphics[height=5cm]{levels.png}\end{figure}
\subsection{Variables}
\subsection{Variables}
% Let $m$ be the number of levels that our program uses.\\
Let $n$ be the number of nucleotides in the query RNA sequence $s$.\\
Let $M$ be the set of modules that could be inserted in $s$ (a loop from $s$ has a good JAR3D score against one of the modules).\\
......@@ -151,8 +151,8 @@ As the same module model can be inserted several times in $s$, several different
Let $k_{x,i}$ be the size in nucleotides of that $i$th component of $x$.\\
Let $y^u_v$ be the \textbf{decision boolean variable} indicating that $s[u]$ and $s[v]$ form a canonical base pairing. According to the standard loop model, we always have $v > u + 3$.\\
Let $C^x_i$ be the \textbf{decision boolean variable} indicating that we do insert the $i$th component of module $x$ at position $P_{x,i}$.
Note that a base pair $y^u_v$ is possible if and only if $v>u+3$, and that we do not need to use two variables $y^u_v$ and $y_{vu}$ for the same pair.
Then, we have $\sum_{i=4}^n (n-i)$ decision variables ($\approx \frac{1}{2}n^2$ decision variables) of the form $y^u_v$.
Regarding the $C^x_i$, if we have an average insertion of $\nu$ motives by RNA sequence, the motives having in average $\mu$ components, components that can be inserted in average at $\pi$ different positions in $s$,
......@@ -168,7 +168,7 @@ Let $X$ be the vector of all our decision variables, we define the following obj
\[ f_{1B}(X) = \sum_{x \in M} p(x) \times C^x_1 \]
\[ f_{1C}(X) = \sum_{x \in M} \left[ \frac{\|x\|}{\log_2(\sum_{i=1}^{\|x\|}k_{x,i})} \times p(x) \times C^x_1 \right]\]
$$ f_2(X) = \sum_{u<v} p_{uv}\times y^u_v \times I[p_{uv}>\theta], \qquad \qquad
p_{uv} = \sum_{\sigma \in S(s)} y^u_v.p(\sigma | s)$$
p_{uv} = \sum_{\sigma \in S(s)} y^u_v.p(\sigma | s)$$
The different $f_1$ objectives are supposed to maximize the number of inserted motives in $s$,
weighted by their number of components (squared for 1A), or the JAR3D score (B and C).
In $1C$, a penalty is added on the number of nucleotides involved in the looped zone (sum of $k_{x,i}$) to avoid long unpaired zones.
......@@ -183,7 +183,7 @@ Note that \(f_{1A}\) is taken from RNA MoIP \cite{reinharz_towards_2012}, to com
\begin{equation} \label{constraint:1}
\sum_{v<u} y^v_u + \sum_{v>u} y^u_v \leq 1 \qquad\qquad \forall u \in \llbracket 1,n \rrbracket
\end{equation}
\paragraph{Constraints to forbid lonely base pairs} ~
% \begin{equation} \label{constraint:2}
% \sum_{v=u}^n y^{u-1}_v - \sum_{v=u+1}^n y^u_v + \sum_{v=u+2}^n y^{u+1}_v \geq 0 \qquad \qquad \forall u \in \llbracket 1,n\rrbracket
......@@ -208,14 +208,14 @@ Then, this condition sets to zero "lonely decision variables" who have no neighb
(k_{x,i}-2) \; C^x_i + \sum_{u=P_{x,i}+1}^{P_{x,i}+k_{x,i}-2}\left[ \sum_{v>u} y^u_v + \sum_{v<u} y^v_u \right] \leq (k_{x,i} - 2)
\qquad \qquad \forall x \in M, i \in \llbracket 1,\|x\| \rrbracket
\end{equation}
\paragraph{Constraint to forbid component to overlap} ~
\begin{equation} \label{constraint:5}
\sum_{x \in M} \sum_{i=1}^{\|x\|} C^x_i \times I( u \in [ P_{x,i} ; P_{x,i}+k_{x,i}-1]) \leq 1 \qquad \qquad \forall u \in \llbracket 1,n \rrbracket
\end{equation}
$I( u \in [ P_{x,i} ; P_{x,i}+k_{x,i}-1])$ is a booleean value depending on the condition's truth. Then, whatever the nucleotide $u$, it can be part of a module component only once.
% \begin{center}\includegraphics[height=3cm]{component.png}\end{center}
\paragraph{Constraints to respect the structure of large motives ($\{ x\in M \; | \; \|x\| \geq 2\}$)} ~
This constraint ensures that none or all the components of a motif are inserted.
......