Updated documentation

Louis BECQUEY
Commit e2aa1dcc47d068c01508a9a621cf0fc38b067042 e2aa1dcc 1 parent 5f6cb56f
Showing 10 changed files with 207 additions and 35 deletions
cppsrc/MOIP.cpp
doc/.vscode/settings.json
doc/RNA.bib
doc/motif_forcing_constraints.aux
doc/motif_forcing_constraints.blg
doc/motif_forcing_constraints.fdb_latexmk
doc/motif_forcing_constraints.fls
doc/motif_forcing_constraints.log
doc/motif_forcing_constraints.pdf
doc/motif_forcing_constraints.tex
--- a/cppsrc/MOIP.cpp
View file @e2aa1dc
+++ b/cppsrc/MOIP.cpp
View file @e2aa1dc
@@ -308,13 +308,13 @@ void MOIP::define_problem_constraints(void)
         model_.add(C(i, 0) <= c6p);
         if (x.comp.size() == 1)    // This constraint is for multi-component motives.
             continue;
-         for (size_t j = 1; j < x.comp.size(); j++) {
+         for (size_t j = 0; j < x.comp.size()-1; j++) {
             IloExpr c6 = IloExpr(env_);
-             if (allowed_basepair(x.comp[j - 1].pos.second, x.comp[j].pos.first))
-                 c6 += y(x.comp[j - 1].pos.second, x.comp[j].pos.first);
+             if (allowed_basepair(x.comp[j].pos.second, x.comp[j+1].pos.first))
+                 c6 += y(x.comp[j].pos.second, x.comp[j+1].pos.first);
             model_.add(C(i, j) <= c6);
-             cout << "\t\t" << (C(i, j) <= c6) << "\t(" << x.comp[j - 1].pos.second << ',' << x.comp[j].pos.first
-                  << (allowed_basepair(x.comp[j - 1].pos.second, x.comp[j].pos.first) > 0 ? ") is allowed" : ") is not allowed")
+             cout << "\t\t" << (C(i, j) <= c6) << "\t(" << x.comp[j].pos.second << ',' << x.comp[j+1].pos.first
+                  << (allowed_basepair(x.comp[j].pos.second, x.comp[j+1].pos.first) > 0 ? ") is allowed" : ") is not allowed")
                  << endl;
         }
     }
--- a/doc/.vscode/settings.json 0 → 100644
View file @e2aa1dc
+++ b/doc/.vscode/settings.json 0 → 100644
View file @e2aa1dc
+ {
+     "files.associations": {
+         "array": "cpp",
+         "initializer_list": "cpp",
+         "string_view": "cpp",
+         "utility": "cpp"
+     }
+ }
\ No newline at end of file
--- a/doc/RNA.bib
View file @e2aa1dc
+++ b/doc/RNA.bib
View file @e2aa1dc
@@ -565,3 +565,23 @@
 	pages = {7504--7520},
 	file = {PubMed Central Full Text PDF:/nhome/siniac/lbecquey/Zotero/storage/C68JKL5J/Zirbel et al. - 2015 - Identifying novel sequence variants of RNA 3D moti.pdf:application/pdf}
 }
+ 
+ 
+ @software{cplex,
+ 	author =   {IBM ILOG},
+ 	title =    {{CPLEX}: {CPLEX} {Optimizer} (Academic license)},
+ 	version = {12.8},
+ 	howpublished = {\url{https://www.ibm.com/analytics/optimization-modeling-interfaces}},
+ 	year = {2018}
+ }
+ 
+ @article{andronescu2008rna,
+   title={RNA STRAND: the RNA secondary structure and statistical analysis database},
+   author={Andronescu, Mirela and Bereg, Vera and Hoos, Holger H and Condon, Anne},
+   journal={BMC bioinformatics},
+   volume={9},
+   number={1},
+   pages={340},
+   year={2008},
+   publisher={BioMed Central}
+ }
\ No newline at end of file
--- a/doc/motif_forcing_constraints.aux 0 → 100644
View file @e2aa1dc
+++ b/doc/motif_forcing_constraints.aux 0 → 100644
View file @e2aa1dc
+ \relax 
+ \citation{djelloul_automated_2008}
+ \citation{petrov_automated_2013}
+ \citation{reinharz2018mining}
+ \citation{dirksAlgorithmComputingNucleic2004}
+ \citation{zirbel_identifying_2015}
+ \citation{cruz2011sequence}
+ \citation{petrov_automated_2013}
+ \@writefile{toc}{\contentsline {section}{\numberline {1}Motivation}{2}}
+ \@writefile{toc}{\contentsline {section}{\numberline {2}Computation of basepair probabilities}{2}}
+ \@writefile{toc}{\contentsline {paragraph}{Discussion of that choice}{2}}
+ \@writefile{toc}{\contentsline {section}{\numberline {3}Detection of potential modules in the sequence input}{2}}
+ \@writefile{toc}{\contentsline {paragraph}{\textbf  {Choice of a module model, choice of a sequence probabilistic model}}{2}}
+ \citation{lorenz_viennarna_2011}
+ \citation{sato_ipknot:_2011}
+ \citation{reinharz_towards_2012}
+ \citation{legendre_bi-objective_2018}
+ \@writefile{toc}{\contentsline {paragraph}{\textbf  {Detection of the loops in an RNA sequence}}{3}}
+ \@writefile{toc}{\contentsline {section}{\numberline {4}Formulation of the IP problem}{3}}
+ \@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Variables}{3}}
+ \citation{reinharz_towards_2012}
+ \newlabel{objectives}{{4.2}{4}}
+ \@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Objectives }{4}}
+ \@writefile{toc}{\contentsline {subsection}{\numberline {4.3}9 Constraints to bind them all}{4}}
+ \@writefile{toc}{\contentsline {paragraph}{Constraint to ensure there only is 0 or 1 canonical pairing by nucleotide}{4}}
+ \newlabel{constraint:1}{{1}{4}}
+ \@writefile{toc}{\contentsline {paragraph}{Constraints to forbid lonely base pairs}{4}}
+ \newlabel{constraint:2}{{2}{4}}
+ \newlabel{constraint:3}{{3}{4}}
+ \@writefile{toc}{\contentsline {paragraph}{Constraint to forbid pairings inside a module component}{4}}
+ \newlabel{constraint:4}{{4}{4}}
+ \@writefile{toc}{\contentsline {paragraph}{Constraint to forbid component to overlap}{4}}
+ \newlabel{constraint:5}{{5}{4}}
+ \citation{legendre_bi-objective_2018}
+ \citation{cplex}
+ \@writefile{toc}{\contentsline {paragraph}{Constraints to respect the structure of large motives ($\{ x\in M \tmspace  +\thickmuskip {.2777em} | \tmspace  +\thickmuskip {.2777em} \delimiter "026B30D x\delimiter "026B30D  \geq 2\}$)}{5}}
+ \newlabel{constraint:6}{{6}{5}}
+ \newlabel{constraint:7}{{7}{5}}
+ \newlabel{constraint:8}{{8}{5}}
+ \@writefile{toc}{\contentsline {paragraph}{Constraint to forbid a previously found solution}{5}}
+ \newlabel{constraint:9}{{9}{5}}
+ \newlabel{methods}{{5}{5}}
+ \@writefile{toc}{\contentsline {section}{\numberline {5}Methods }{5}}
+ \@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Bi-objective algorithm}{5}}
+ \@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Solving the IP problem}{5}}
+ \citation{andronescu2008rna}
+ \bibstyle{unsrt}
+ \bibdata{RNA}
+ \bibcite{djelloul_automated_2008}{1}
+ \bibcite{petrov_automated_2013}{2}
+ \bibcite{reinharz2018mining}{3}
+ \bibcite{dirksAlgorithmComputingNucleic2004}{4}
+ \bibcite{zirbel_identifying_2015}{5}
+ \bibcite{cruz2011sequence}{6}
+ \bibcite{lorenz_viennarna_2011}{7}
+ \bibcite{sato_ipknot:_2011}{8}
+ \bibcite{reinharz_towards_2012}{9}
+ \bibcite{legendre_bi-objective_2018}{10}
+ \bibcite{cplex}{11}
+ \bibcite{andronescu2008rna}{12}
+ \@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Benchmarking of the module inclusion objective functions}{6}}
+ \newlabel{results}{{6}{6}}
+ \@writefile{toc}{\contentsline {section}{\numberline {6}Results }{6}}
+ \newlabel{discussion}{{7}{6}}
+ \@writefile{toc}{\contentsline {section}{\numberline {7}Discussion }{6}}
--- a/doc/motif_forcing_constraints.blg 0 → 100644
View file @e2aa1dc
+++ b/doc/motif_forcing_constraints.blg 0 → 100644
View file @e2aa1dc
+ This is BibTeX, Version 0.99d (TeX Live 2017/Debian)
+ Capacity: max_strings=100000, hash_size=100000, hash_prime=85009
+ The top-level auxiliary file: motif_forcing_constraints.aux
+ The style file: unsrt.bst
+ Database file #1: RNA.bib
+ Warning--entry type for "cplex" isn't style-file defined
+ --line 570 of file RNA.bib
+ You've used 12 entries,
+             1791 wiz_defined-function locations,
+             524 strings with 6041 characters,
+ and the built_in function-call counts, 2939 in all, are:
+ = -- 250
+ > -- 152
+ < -- 0
+ + -- 55
+ - -- 43
+ * -- 257
+ := -- 487
+ add.period$ -- 36
+ call.type$ -- 12
+ change.case$ -- 12
+ chr.to.int$ -- 0
+ cite$ -- 12
+ duplicate$ -- 104
+ empty$ -- 275
+ format.name$ -- 43
+ if$ -- 622
+ int.to.chr$ -- 0
+ int.to.str$ -- 12
+ missing$ -- 11
+ newline$ -- 63
+ num.names$ -- 12
+ pop$ -- 17
+ preamble$ -- 1
+ purify$ -- 0
+ quote$ -- 0
+ skip$ -- 38
+ stack$ -- 0
+ substring$ -- 237
+ swap$ -- 12
+ text.length$ -- 0
+ text.prefix$ -- 0
+ top$ -- 0
+ type$ -- 0
+ warning$ -- 0
+ while$ -- 29
+ width$ -- 14
+ write$ -- 133
+ (There was 1 warning)
--- a/doc/motif_forcing_constraints.fdb_latexmk 0 → 100644
View file @e2aa1dc
+++ b/doc/motif_forcing_constraints.fdb_latexmk 0 → 100644
View file @e2aa1dc
--- a/doc/motif_forcing_constraints.fls 0 → 100644
View file @e2aa1dc
+++ b/doc/motif_forcing_constraints.fls 0 → 100644
View file @e2aa1dc
--- a/doc/motif_forcing_constraints.log 0 → 100644
View file @e2aa1dc
+++ b/doc/motif_forcing_constraints.log 0 → 100644
View file @e2aa1dc
--- a/doc/motif_forcing_constraints.pdf
View file @e2aa1dc
+++ b/doc/motif_forcing_constraints.pdf
View file @e2aa1dc
--- a/doc/motif_forcing_constraints.tex
View file @e2aa1dc
+++ b/doc/motif_forcing_constraints.tex
View file @e2aa1dc
@@ -131,7 +131,8 @@ Unfortunately, to avoid computing it twice, we should reimplement ourselves the 
 
 \section{Formulation of the IP problem}
 	
- This formulation is an improved merge between the IPknot \cite{sato_ipknot:_2011} formulation to fold RNAs with pseudoknots (but without "levels") and RNAMoIP's one \cite{reinharz_towards_2012} to include modules. The bi-objective algorithm is inspired from BiokoP \cite{legendre_bi-objective_2018}.
+ This formulation is an improved merge between the IPknot \cite{sato_ipknot:_2011} formulation to fold RNAs with pseudoknots (but without "levels") and RNAMoIP's one \cite{reinharz_towards_2012} to include modules.
+ The bi-objective algorithm is inspired from BiokoP \cite{legendre_bi-objective_2018} and detailed later in section \ref{methods}.
 % This problem has been inspired by IPknot's model \cite{sato_ipknot:_2011}, but without the use of what they call \textit{levels}, which are pseudoknot-free
 % structures that should be superposed to give the true pseudoknotted secondary structure graph.\\
 % Levels are mutually exclusive (a base cannot be paired in two different levels). The more levels you use, the more decision variables you have, 
@@ -160,7 +161,7 @@ then we need to add, in average, $\nu \times \mu \times \pi$ decision variables 
 Then, we expect having around $\frac{1}{2}n^2+\nu \mu \pi$ decision variables.
 
 \newpage
- \subsection{Objectives}
+ \subsection{Objectives \label{objectives}}
 We have two objectives : Find a structure with correct expected accuracy, and find a structure which includes (large) known modules.
 Let $X$ be the vector of all our decision variables, we define the following objective functions to maximize:
 \[ f_{1A}(X) = \sum_{x \in M} \left[ (\|x\|)^2 \times \sum_{p \in P_{x,1}} C^{x,1}_p \right]\]
@@ -177,7 +178,7 @@ $p_{uv}$ are the base pairing probabilities.
 
 Note that \(f_{1A}\) is taken from RNA MoIP \cite{reinharz_towards_2012}, to compare performance.
 
- \subsection{10 Constraints to bind them all}
+ \subsection{9 Constraints to bind them all}
 \paragraph{Constraint to ensure there only is 0 or 1 canonical pairing by nucleotide} ~ 
 \begin{equation} \label{constraint:1}
 	\sum_{v<u} y^v_u + \sum_{v>u} y^u_v \leq 1 \qquad\qquad \forall u \in \llbracket 1,n \rrbracket
@@ -203,45 +204,74 @@ and equation \ref{constraint:3} if $s[v]$ is paired with $s[u<v]$ (a nucleotide 
 	
 \paragraph{Constraint to forbid component to overlap} ~
 \begin{equation} \label{constraint:5}
- 	\sum_{x \in M} \sum_{i=1}^{\|x\|}\sum_{u=P_{x,i}}^{P_{x,i}+k_{x,i}-1} C^x_i \leq 1 \qquad \qquad \forall u \in \llbracket 1,n \rrbracket
+ 	\sum_{x \in M} \sum_{i=1}^{\|x\|} C^x_i \times I( u \in [ P_{x,i} ; P_{x,i}+k_{x,i}-1]) \leq 1 \qquad \qquad \forall u \in \llbracket 1,n \rrbracket
 \end{equation}
- Then, whatever the nucleotide $u$, it can be part of a module component only once. 
- $u$ belongs to a component $x,i$ if and only if $u-k_{x,i}+1 \leq p_{x,i} \leq u$.\\
- \begin{center}\includegraphics[height=3cm]{component.png}\end{center}
+ $I( u \in [ P_{x,i} ; P_{x,i}+k_{x,i}-1])$ is a booleean value depending on the condition's truth. Then, whatever the nucleotide $u$, it can be part of a module component only once.
+ % \begin{center}\includegraphics[height=3cm]{component.png}\end{center}
 	
- \paragraph{Constraints to respect the structure of large motives ($\forall x \in \{ x\in M | \|x\| \geq 2\}$)} ~ 
+ \paragraph{Constraints to respect the structure of large motives ($\{ x\in M \; | \; \|x\| \geq 2\}$)} ~ 
 
- The first two constraints ensure that a component is inserted if and only if the next and the previous one are also inserted somewhere.
- They also force a minimal distance of 3 nucleotides between any two consecutive components of the same module.
+ This constraint ensures that none or all the components of a motif are inserted.
 \begin{equation}\label{constraint:6}
- 	C^x_i \leq \sum_{\substack{p' \in P_{x,i+1}\\ p'>p+k_{x,i}+2}} C^{x,i+1}_{p'}	
- 	\qquad \qquad \forall p \in P_{x,i} \qquad \forall x \in \{ x\in M | \|x\| \geq 2\}, i \in \llbracket 1,\|x\| \llbracket
+ 	\sum_{i=2}^{\|x\|} C^x_i = (\|x\| - 1) \times C^{x}_{1}	 \qquad \qquad \forall x \in \{ x\in M \; | \; \|x\| \geq 2\}
 \end{equation}
+ 
+ And then, we force base pairs between the end of a component and the beginning of the next one:
 \begin{equation}\label{constraint:7}
- 	C^x_i \leq \sum_{\substack{p' \in P_{x,i-1}\\ p'<p-k_{x,i-1}-2}} C^{x,i-1}_{p'}	
- 	\qquad \qquad \forall p \in P_{x,i} \qquad \forall x \in \{ x\in M | \|x\| \geq 2\}, i \in \rrbracket 1,\|x\| \rrbracket
+ 	C^x_1 \leq y^{P_{x,1}}_{P_{x,\|x\|}+k_{x,\|x\|}-1} \qquad \qquad \forall x \in \{ x\in M \; | \; \|x\| \geq 2\}
 \end{equation}
- We add another one to ensure that all components of any module are inserted the same number of times in $s$:
 \begin{equation}\label{constraint:8}
- 	\sum_{p\in P_{x,1}} C^{x,1}_p - \frac{1}{\|x\|} \sum_{i=1}^{\|x\|} \sum_{p\in P_{x,i}} C^x_i = 0  \qquad \qquad x\in M
+ 	C^x_j \leq y^{P_{x,j}+k_{x,j}-1}_{P_{x,j+1}} \qquad \qquad \forall x \in \{ x\in M \; | \; \|x\| \geq 2\}, \forall j \in \llbracket 1,\|x\| \llbracket
 \end{equation}
- And finally, we force base pairs between the end of a component and the beginning of the next one:
+ 
+ Constraint \ref{constraint:7} binds the first nucleotide of first component to the last one of the last component. 
+ Constraint \ref{constraint:8} binds the last nucleotide of component $j$ to the first of component $j+1$.
+ 
+ \paragraph{Constraint to forbid a previously found solution} ~ 
+ 
+ As several solutions may result in the same values of the two objectives, we can't forbid the algorithm to search twice the same region of the objective landscape.
+ We have to explicitly forbid to find again every found solution.\\
+ We do it by adding iteratively, for every structure $s^*$ found, the following condition :
 \begin{equation}\label{constraint:9}
- 	C^x_i \leq \sum_{p' \in P_{x,i-1}} y^{p'+k_{x,i}-1}_p \qquad \qquad \forall x \in \{ x\in M | \|x\| \geq 2\}, i \in \rrbracket 1,\|x\| \rrbracket
- \end{equation}
- \begin{equation}\label{constraint:10}
- 	C^x_i \leq \sum_{p' \in P_{x,i+1}} y^{p+k_{x,i}-1}_{p'} \qquad \qquad \forall x \in \{ x\in M | \|x\| \geq 2\}, i \in \llbracket 1,\|x\| \llbracket
+ 	\sum_{y^u_v \in \{ y^u_v  | y^u_v = 1 \text{ in } s^* \}} (1 - y^u_v) + \sum_{y^u_v \in \{ y^u_v  | y^u_v = 0 \text{ in } s^* \}} y^u_v +
+ 	\sum_{C^x_i \in \{ C^x_i  | C^x_i = 1 \text{ in } s^* \}} (1 - C^x_i) + \sum_{C^x_i \in \{ C^x_i  |C^x_i = 0 \text{ in } s^* \}} C^x_i \geq 1
 \end{equation}
 
- \section{Remaning questions}
- 	\begin{itemize}
- 		\item I am not sure how to compute the base-pair probabilities for the MEA model (objective $f_2$). 
- 		If you already have a set of secondary structures $S(s)$, why do you need this program ?
- 		\item In objective $f_1$, weighting the module by the number of components (squared) shows no reason to be the best idea. 
- 		Some exploration is needed. An additional weight given by the pattern-matching step might be an idea, reflecting the probability to see the module here in $s$.
- 		
- 	\end{itemize}
- 	
+ It ensures that at least one of the decision variables differs from $s^*$.
+ 
+ \section{Methods \label{methods}}
+ \subsection{Bi-objective algorithm}
+ This is an adaptation of BiokoP's algorithm \cite{legendre_bi-objective_2018} to gather all the points of the pareto set, removing the $k$-pareto set part.
+ 
+ We start by solving each objective independantly to have a lower and higher bound on each objective. 
+ The two solutions found are considered optimal (higher bound) on one objective, and the worse point of the Pareto set concerning the other objective.
+ 
+ Then, we will iteratively solve the mono-objective problem, but adding as a constraint that the second one has to be included between the bounds.
+ Suppose we decide to iteratively solve objective 1. 
+ The found solutions are getting worse and worse concerning objective 1, but better and better concerning objective 2. 
+ Every time a solution is found with a better objective 2 value, we update our lower bound to search for solutions with objective 2 above this new value.
+ Note that we use weak inequality constraints, as several solutions may have the same values concerning the two objectives and that we want to include them in the Pareto set.
+ 
+ When no more solutions are discovered, the Pareto has been entirely found.
+ 
+ \subsection{Solving the IP problem}
+ We use ILOG CPLEX's \cite{cplex} concert technology to solve the integer linear programming problem. All our decision variables are booleans, and all our constraints are linear.
+ 
+ \subsection{Benchmarking of the module inclusion objective functions}
+ To assess the performance of the objective functions proposed in section \ref{objectives}, we need to chose some performance metrics.
+ We will focus on:
+ \begin{itemize}
+ 	\item wether the native secondary structure of the proposed RNA sequence exists in the returned solutions (the pareto set),
+ 	\item the number of solutions returned (size of the Pareto set).
+ \end{itemize}
+ 
+ The performance is assessed on the structures taken from the RNA STRAND database \cite{andronescu2008rna}, after a simple preprocessing to remove pseudobases.
+ 
+ \section{Results \label{results}}
+ 
+ \section{Discussion \label{discussion}}
+ 
+ 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % \newpage
 \bibliographystyle{unsrt}