Louis BECQUEY

Updated documentation

......@@ -308,13 +308,13 @@ void MOIP::define_problem_constraints(void)
model_.add(C(i, 0) <= c6p);
if (x.comp.size() == 1) // This constraint is for multi-component motives.
continue;
for (size_t j = 1; j < x.comp.size(); j++) {
for (size_t j = 0; j < x.comp.size()-1; j++) {
IloExpr c6 = IloExpr(env_);
if (allowed_basepair(x.comp[j - 1].pos.second, x.comp[j].pos.first))
c6 += y(x.comp[j - 1].pos.second, x.comp[j].pos.first);
if (allowed_basepair(x.comp[j].pos.second, x.comp[j+1].pos.first))
c6 += y(x.comp[j].pos.second, x.comp[j+1].pos.first);
model_.add(C(i, j) <= c6);
cout << "\t\t" << (C(i, j) <= c6) << "\t(" << x.comp[j - 1].pos.second << ',' << x.comp[j].pos.first
<< (allowed_basepair(x.comp[j - 1].pos.second, x.comp[j].pos.first) > 0 ? ") is allowed" : ") is not allowed")
cout << "\t\t" << (C(i, j) <= c6) << "\t(" << x.comp[j].pos.second << ',' << x.comp[j+1].pos.first
<< (allowed_basepair(x.comp[j].pos.second, x.comp[j+1].pos.first) > 0 ? ") is allowed" : ") is not allowed")
<< endl;
}
}
......
{
"files.associations": {
"array": "cpp",
"initializer_list": "cpp",
"string_view": "cpp",
"utility": "cpp"
}
}
\ No newline at end of file
......@@ -565,3 +565,23 @@
pages = {7504--7520},
file = {PubMed Central Full Text PDF:/nhome/siniac/lbecquey/Zotero/storage/C68JKL5J/Zirbel et al. - 2015 - Identifying novel sequence variants of RNA 3D moti.pdf:application/pdf}
}
@software{cplex,
author = {IBM ILOG},
title = {{CPLEX}: {CPLEX} {Optimizer} (Academic license)},
version = {12.8},
howpublished = {\url{https://www.ibm.com/analytics/optimization-modeling-interfaces}},
year = {2018}
}
@article{andronescu2008rna,
title={RNA STRAND: the RNA secondary structure and statistical analysis database},
author={Andronescu, Mirela and Bereg, Vera and Hoos, Holger H and Condon, Anne},
journal={BMC bioinformatics},
volume={9},
number={1},
pages={340},
year={2008},
publisher={BioMed Central}
}
\ No newline at end of file
......
\relax
\citation{djelloul_automated_2008}
\citation{petrov_automated_2013}
\citation{reinharz2018mining}
\citation{dirksAlgorithmComputingNucleic2004}
\citation{zirbel_identifying_2015}
\citation{cruz2011sequence}
\citation{petrov_automated_2013}
\@writefile{toc}{\contentsline {section}{\numberline {1}Motivation}{2}}
\@writefile{toc}{\contentsline {section}{\numberline {2}Computation of basepair probabilities}{2}}
\@writefile{toc}{\contentsline {paragraph}{Discussion of that choice}{2}}
\@writefile{toc}{\contentsline {section}{\numberline {3}Detection of potential modules in the sequence input}{2}}
\@writefile{toc}{\contentsline {paragraph}{\textbf {Choice of a module model, choice of a sequence probabilistic model}}{2}}
\citation{lorenz_viennarna_2011}
\citation{sato_ipknot:_2011}
\citation{reinharz_towards_2012}
\citation{legendre_bi-objective_2018}
\@writefile{toc}{\contentsline {paragraph}{\textbf {Detection of the loops in an RNA sequence}}{3}}
\@writefile{toc}{\contentsline {section}{\numberline {4}Formulation of the IP problem}{3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Variables}{3}}
\citation{reinharz_towards_2012}
\newlabel{objectives}{{4.2}{4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Objectives }{4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}9 Constraints to bind them all}{4}}
\@writefile{toc}{\contentsline {paragraph}{Constraint to ensure there only is 0 or 1 canonical pairing by nucleotide}{4}}
\newlabel{constraint:1}{{1}{4}}
\@writefile{toc}{\contentsline {paragraph}{Constraints to forbid lonely base pairs}{4}}
\newlabel{constraint:2}{{2}{4}}
\newlabel{constraint:3}{{3}{4}}
\@writefile{toc}{\contentsline {paragraph}{Constraint to forbid pairings inside a module component}{4}}
\newlabel{constraint:4}{{4}{4}}
\@writefile{toc}{\contentsline {paragraph}{Constraint to forbid component to overlap}{4}}
\newlabel{constraint:5}{{5}{4}}
\citation{legendre_bi-objective_2018}
\citation{cplex}
\@writefile{toc}{\contentsline {paragraph}{Constraints to respect the structure of large motives ($\{ x\in M \tmspace +\thickmuskip {.2777em} | \tmspace +\thickmuskip {.2777em} \delimiter "026B30D x\delimiter "026B30D \geq 2\}$)}{5}}
\newlabel{constraint:6}{{6}{5}}
\newlabel{constraint:7}{{7}{5}}
\newlabel{constraint:8}{{8}{5}}
\@writefile{toc}{\contentsline {paragraph}{Constraint to forbid a previously found solution}{5}}
\newlabel{constraint:9}{{9}{5}}
\newlabel{methods}{{5}{5}}
\@writefile{toc}{\contentsline {section}{\numberline {5}Methods }{5}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Bi-objective algorithm}{5}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Solving the IP problem}{5}}
\citation{andronescu2008rna}
\bibstyle{unsrt}
\bibdata{RNA}
\bibcite{djelloul_automated_2008}{1}
\bibcite{petrov_automated_2013}{2}
\bibcite{reinharz2018mining}{3}
\bibcite{dirksAlgorithmComputingNucleic2004}{4}
\bibcite{zirbel_identifying_2015}{5}
\bibcite{cruz2011sequence}{6}
\bibcite{lorenz_viennarna_2011}{7}
\bibcite{sato_ipknot:_2011}{8}
\bibcite{reinharz_towards_2012}{9}
\bibcite{legendre_bi-objective_2018}{10}
\bibcite{cplex}{11}
\bibcite{andronescu2008rna}{12}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Benchmarking of the module inclusion objective functions}{6}}
\newlabel{results}{{6}{6}}
\@writefile{toc}{\contentsline {section}{\numberline {6}Results }{6}}
\newlabel{discussion}{{7}{6}}
\@writefile{toc}{\contentsline {section}{\numberline {7}Discussion }{6}}
This is BibTeX, Version 0.99d (TeX Live 2017/Debian)
Capacity: max_strings=100000, hash_size=100000, hash_prime=85009
The top-level auxiliary file: motif_forcing_constraints.aux
The style file: unsrt.bst
Database file #1: RNA.bib
Warning--entry type for "cplex" isn't style-file defined
--line 570 of file RNA.bib
You've used 12 entries,
1791 wiz_defined-function locations,
524 strings with 6041 characters,
and the built_in function-call counts, 2939 in all, are:
= -- 250
> -- 152
< -- 0
+ -- 55
- -- 43
* -- 257
:= -- 487
add.period$ -- 36
call.type$ -- 12
change.case$ -- 12
chr.to.int$ -- 0
cite$ -- 12
duplicate$ -- 104
empty$ -- 275
format.name$ -- 43
if$ -- 622
int.to.chr$ -- 0
int.to.str$ -- 12
missing$ -- 11
newline$ -- 63
num.names$ -- 12
pop$ -- 17
preamble$ -- 1
purify$ -- 0
quote$ -- 0
skip$ -- 38
stack$ -- 0
substring$ -- 237
swap$ -- 12
text.length$ -- 0
text.prefix$ -- 0
top$ -- 0
type$ -- 0
warning$ -- 0
while$ -- 29
width$ -- 14
write$ -- 133
(There was 1 warning)
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
......@@ -131,7 +131,8 @@ Unfortunately, to avoid computing it twice, we should reimplement ourselves the
\section{Formulation of the IP problem}
This formulation is an improved merge between the IPknot \cite{sato_ipknot:_2011} formulation to fold RNAs with pseudoknots (but without "levels") and RNAMoIP's one \cite{reinharz_towards_2012} to include modules. The bi-objective algorithm is inspired from BiokoP \cite{legendre_bi-objective_2018}.
This formulation is an improved merge between the IPknot \cite{sato_ipknot:_2011} formulation to fold RNAs with pseudoknots (but without "levels") and RNAMoIP's one \cite{reinharz_towards_2012} to include modules.
The bi-objective algorithm is inspired from BiokoP \cite{legendre_bi-objective_2018} and detailed later in section \ref{methods}.
% This problem has been inspired by IPknot's model \cite{sato_ipknot:_2011}, but without the use of what they call \textit{levels}, which are pseudoknot-free
% structures that should be superposed to give the true pseudoknotted secondary structure graph.\\
% Levels are mutually exclusive (a base cannot be paired in two different levels). The more levels you use, the more decision variables you have,
......@@ -160,7 +161,7 @@ then we need to add, in average, $\nu \times \mu \times \pi$ decision variables
Then, we expect having around $\frac{1}{2}n^2+\nu \mu \pi$ decision variables.
\newpage
\subsection{Objectives}
\subsection{Objectives \label{objectives}}
We have two objectives : Find a structure with correct expected accuracy, and find a structure which includes (large) known modules.
Let $X$ be the vector of all our decision variables, we define the following objective functions to maximize:
\[ f_{1A}(X) = \sum_{x \in M} \left[ (\|x\|)^2 \times \sum_{p \in P_{x,1}} C^{x,1}_p \right]\]
......@@ -177,7 +178,7 @@ $p_{uv}$ are the base pairing probabilities.
Note that \(f_{1A}\) is taken from RNA MoIP \cite{reinharz_towards_2012}, to compare performance.
\subsection{10 Constraints to bind them all}
\subsection{9 Constraints to bind them all}
\paragraph{Constraint to ensure there only is 0 or 1 canonical pairing by nucleotide} ~
\begin{equation} \label{constraint:1}
\sum_{v<u} y^v_u + \sum_{v>u} y^u_v \leq 1 \qquad\qquad \forall u \in \llbracket 1,n \rrbracket
......@@ -203,45 +204,74 @@ and equation \ref{constraint:3} if $s[v]$ is paired with $s[u<v]$ (a nucleotide
\paragraph{Constraint to forbid component to overlap} ~
\begin{equation} \label{constraint:5}
\sum_{x \in M} \sum_{i=1}^{\|x\|}\sum_{u=P_{x,i}}^{P_{x,i}+k_{x,i}-1} C^x_i \leq 1 \qquad \qquad \forall u \in \llbracket 1,n \rrbracket
\sum_{x \in M} \sum_{i=1}^{\|x\|} C^x_i \times I( u \in [ P_{x,i} ; P_{x,i}+k_{x,i}-1]) \leq 1 \qquad \qquad \forall u \in \llbracket 1,n \rrbracket
\end{equation}
Then, whatever the nucleotide $u$, it can be part of a module component only once.
$u$ belongs to a component $x,i$ if and only if $u-k_{x,i}+1 \leq p_{x,i} \leq u$.\\
\begin{center}\includegraphics[height=3cm]{component.png}\end{center}
$I( u \in [ P_{x,i} ; P_{x,i}+k_{x,i}-1])$ is a booleean value depending on the condition's truth. Then, whatever the nucleotide $u$, it can be part of a module component only once.
% \begin{center}\includegraphics[height=3cm]{component.png}\end{center}
\paragraph{Constraints to respect the structure of large motives ($\forall x \in \{ x\in M | \|x\| \geq 2\}$)} ~
\paragraph{Constraints to respect the structure of large motives ($\{ x\in M \; | \; \|x\| \geq 2\}$)} ~
The first two constraints ensure that a component is inserted if and only if the next and the previous one are also inserted somewhere.
They also force a minimal distance of 3 nucleotides between any two consecutive components of the same module.
This constraint ensures that none or all the components of a motif are inserted.
\begin{equation}\label{constraint:6}
C^x_i \leq \sum_{\substack{p' \in P_{x,i+1}\\ p'>p+k_{x,i}+2}} C^{x,i+1}_{p'}
\qquad \qquad \forall p \in P_{x,i} \qquad \forall x \in \{ x\in M | \|x\| \geq 2\}, i \in \llbracket 1,\|x\| \llbracket
\sum_{i=2}^{\|x\|} C^x_i = (\|x\| - 1) \times C^{x}_{1} \qquad \qquad \forall x \in \{ x\in M \; | \; \|x\| \geq 2\}
\end{equation}
And then, we force base pairs between the end of a component and the beginning of the next one:
\begin{equation}\label{constraint:7}
C^x_i \leq \sum_{\substack{p' \in P_{x,i-1}\\ p'<p-k_{x,i-1}-2}} C^{x,i-1}_{p'}
\qquad \qquad \forall p \in P_{x,i} \qquad \forall x \in \{ x\in M | \|x\| \geq 2\}, i \in \rrbracket 1,\|x\| \rrbracket
C^x_1 \leq y^{P_{x,1}}_{P_{x,\|x\|}+k_{x,\|x\|}-1} \qquad \qquad \forall x \in \{ x\in M \; | \; \|x\| \geq 2\}
\end{equation}
We add another one to ensure that all components of any module are inserted the same number of times in $s$:
\begin{equation}\label{constraint:8}
\sum_{p\in P_{x,1}} C^{x,1}_p - \frac{1}{\|x\|} \sum_{i=1}^{\|x\|} \sum_{p\in P_{x,i}} C^x_i = 0 \qquad \qquad x\in M
C^x_j \leq y^{P_{x,j}+k_{x,j}-1}_{P_{x,j+1}} \qquad \qquad \forall x \in \{ x\in M \; | \; \|x\| \geq 2\}, \forall j \in \llbracket 1,\|x\| \llbracket
\end{equation}
And finally, we force base pairs between the end of a component and the beginning of the next one:
Constraint \ref{constraint:7} binds the first nucleotide of first component to the last one of the last component.
Constraint \ref{constraint:8} binds the last nucleotide of component $j$ to the first of component $j+1$.
\paragraph{Constraint to forbid a previously found solution} ~
As several solutions may result in the same values of the two objectives, we can't forbid the algorithm to search twice the same region of the objective landscape.
We have to explicitly forbid to find again every found solution.\\
We do it by adding iteratively, for every structure $s^*$ found, the following condition :
\begin{equation}\label{constraint:9}
C^x_i \leq \sum_{p' \in P_{x,i-1}} y^{p'+k_{x,i}-1}_p \qquad \qquad \forall x \in \{ x\in M | \|x\| \geq 2\}, i \in \rrbracket 1,\|x\| \rrbracket
\end{equation}
\begin{equation}\label{constraint:10}
C^x_i \leq \sum_{p' \in P_{x,i+1}} y^{p+k_{x,i}-1}_{p'} \qquad \qquad \forall x \in \{ x\in M | \|x\| \geq 2\}, i \in \llbracket 1,\|x\| \llbracket
\sum_{y^u_v \in \{ y^u_v | y^u_v = 1 \text{ in } s^* \}} (1 - y^u_v) + \sum_{y^u_v \in \{ y^u_v | y^u_v = 0 \text{ in } s^* \}} y^u_v +
\sum_{C^x_i \in \{ C^x_i | C^x_i = 1 \text{ in } s^* \}} (1 - C^x_i) + \sum_{C^x_i \in \{ C^x_i |C^x_i = 0 \text{ in } s^* \}} C^x_i \geq 1
\end{equation}
\section{Remaning questions}
\begin{itemize}
\item I am not sure how to compute the base-pair probabilities for the MEA model (objective $f_2$).
If you already have a set of secondary structures $S(s)$, why do you need this program ?
\item In objective $f_1$, weighting the module by the number of components (squared) shows no reason to be the best idea.
Some exploration is needed. An additional weight given by the pattern-matching step might be an idea, reflecting the probability to see the module here in $s$.
\end{itemize}
It ensures that at least one of the decision variables differs from $s^*$.
\section{Methods \label{methods}}
\subsection{Bi-objective algorithm}
This is an adaptation of BiokoP's algorithm \cite{legendre_bi-objective_2018} to gather all the points of the pareto set, removing the $k$-pareto set part.
We start by solving each objective independantly to have a lower and higher bound on each objective.
The two solutions found are considered optimal (higher bound) on one objective, and the worse point of the Pareto set concerning the other objective.
Then, we will iteratively solve the mono-objective problem, but adding as a constraint that the second one has to be included between the bounds.
Suppose we decide to iteratively solve objective 1.
The found solutions are getting worse and worse concerning objective 1, but better and better concerning objective 2.
Every time a solution is found with a better objective 2 value, we update our lower bound to search for solutions with objective 2 above this new value.
Note that we use weak inequality constraints, as several solutions may have the same values concerning the two objectives and that we want to include them in the Pareto set.
When no more solutions are discovered, the Pareto has been entirely found.
\subsection{Solving the IP problem}
We use ILOG CPLEX's \cite{cplex} concert technology to solve the integer linear programming problem. All our decision variables are booleans, and all our constraints are linear.
\subsection{Benchmarking of the module inclusion objective functions}
To assess the performance of the objective functions proposed in section \ref{objectives}, we need to chose some performance metrics.
We will focus on:
\begin{itemize}
\item wether the native secondary structure of the proposed RNA sequence exists in the returned solutions (the pareto set),
\item the number of solutions returned (size of the Pareto set).
\end{itemize}
The performance is assessed on the structures taken from the RNA STRAND database \cite{andronescu2008rna}, after a simple preprocessing to remove pseudobases.
\section{Results \label{results}}
\section{Discussion \label{discussion}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \newpage
\bibliographystyle{unsrt}
......