Builtin option to allow/forbid pseudoknots

Louis BECQUEY
Commit 06209dec59df13cc9e5a0a32a27a45cb0b80c8bd 06209dec 1 parent 0d8a505e
Showing 12 changed files with 56 additions and 179 deletions
.gitignore
cppsrc/MOIP.cpp
cppsrc/MOIP.h
cppsrc/biominserter.cpp
doc/.vscode/settings.json
doc/motif_forcing_constraints.aux
doc/motif_forcing_constraints.blg
doc/motif_forcing_constraints.fdb_latexmk
doc/motif_forcing_constraints.fls
doc/motif_forcing_constraints.log
doc/motif_forcing_constraints.pdf
doc/motif_forcing_constraints.tex
--- a/.gitignore
View file @06209de
+++ b/.gitignore
View file @06209de
 EditMe
 rna1999.dG
 .vscode/*
+ .vscode
 
 # LaTeX temporary files
 doc/*.toc
--- a/cppsrc/MOIP.cpp
View file @06209de
+++ b/cppsrc/MOIP.cpp
View file @06209de
@@ -20,6 +20,7 @@ using std::vector;
 uint   MOIP::obj_function_nbr_ = 1;
 uint   MOIP::obj_to_solve_     = 1;
 double MOIP::precision_        = 1e-5;
+ bool   MOIP::allow_pk_         = true;
 
 unsigned getNumConstraints(IloModel& m)
 {
@@ -390,19 +391,21 @@ void MOIP::define_problem_constraints(void)
         if (verbose_) cout << "\t\t>motif " << i << " : " << (c5 == jm1 * C(i, 0)) << endl;
     }
     // Forbid pseudoknots
-     if (verbose_) cout << "\t>forbidding pseudoknots..." << endl;
-     for (size_t u = 0; u < n - 6; u++)
-         for (size_t v = u + 4; v < n - 1; v++)
-             if (allowed_basepair(u, v))
-                 for (size_t k = u + 1; k < v; ++k)
-                     for (size_t l = v + 1; l < n; ++l)
-                         if (allowed_basepair(k, l)) {
-                             IloExpr c(env_);
-                             c += y(u, v);
-                             c += y(k, l);
-                             model_.add(c <= 1);
-                             if (verbose_) cout << "\t\t" << (c <= 1) << endl;
-                         }
+     if (!this->allow_pk_) {
+         if (verbose_) cout << "\t>forbidding pseudoknots..." << endl;
+         for (size_t u = 0; u < n - 6; u++)
+             for (size_t v = u + 4; v < n - 1; v++)
+                 if (allowed_basepair(u, v))
+                     for (size_t k = u + 1; k < v; ++k)
+                         for (size_t l = v + 1; l < n; ++l)
+                             if (allowed_basepair(k, l)) {
+                                 IloExpr c(env_);
+                                 c += y(u, v);
+                                 c += y(k, l);
+                                 model_.add(c <= 1);
+                                 if (verbose_) cout << "\t\t" << (c <= 1) << endl;
+                             }
+     }
 }
 
 void MOIP::search_between(double lambdaMin, double lambdaMax)
--- a/cppsrc/MOIP.h
View file @06209de
+++ b/cppsrc/MOIP.h
View file @06209de
@@ -27,8 +27,9 @@ class MOIP
     void                      forbid_solutions_between(double min, double max);
     IloEnv&                   get_env(void);
     static uint               obj_function_nbr_;    // On what criteria do you want to insert motifs ?
-     static uint obj_to_solve_;    // What objective do you prefer to solve in mono-objective portions of the algorithm ?
-     static double precision_;    // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =(
+     static uint obj_to_solve_;  // What objective do you prefer to solve in mono-objective portions of the algorithm ?
+     static double precision_;   // decimals to keep in objective values, to avoid numerical issues. otherwise, solution with objective 5.0000000009 dominates solution with 5.0 =(
+     static bool allow_pk_;      // Wether we forbid pseudoknots (false) or allow them (true)
 
     private:
     bool   is_undominated_yet(const SecondaryStructure& s);
--- a/cppsrc/biominserter.cpp
View file @06209de
+++ b/cppsrc/biominserter.cpp
View file @06209de
 /***
         Biominserter, Louis Becquey, nov 2018
+         louis.becquey@univ-evry.fr
 ***/
 
 #include <algorithm>
@@ -67,27 +68,21 @@ int main(int argc, char* argv[])
     /*  ARGUMENT CHECKING  */
 
     po::options_description desc("Options");
-     desc.add_options()("help,h", "Print the help message")("version", "Print the program version")(
-     "seq,s", po::value<string>(&inputName)->required(), "Fasta file containing the RNA sequence")(
-     "descfolder,d",
-     po::value<string>(&motifs_path_name),
-     "A folder containing modules in .desc format, as produced by Djelloul & Denise's catalog program")(
-     "jar3dcsv",
-     po::value<string>(&motifs_path_name),
-     "A file containing the output of JAR3D's search for motifs in the sequence, as produced by test_on_RNAstrand.py")(
-     "bayespaircsv",
-     po::value<string>(&motifs_path_name),
-     "A file containing the output of BayesPairing's search for motifs in the sequence, as produced by "
-     "test_on_RNAstrand.py")("first-objective,c", po::value<unsigned int>(&MOIP::obj_to_solve_)->default_value(1), "Objective to solve in the mono-objective portions of the algorithm")(
-     "output,o", po::value<string>(&outputName), "A file to summarize the computation results")(
-     "theta,t",
-     po::value<float>(&theta_p_threshold)->default_value(0.001),
-     "Pairing probability threshold to consider or not the possibility of pairing")(
-     "type",
-     po::value<int>(&obj_function_nbr),
-     "If -f is provided, what objective function to use to include motifs: square of motif size in nucleotides like "
+     desc.add_options()
+     ("help,h", "Print the help message")
+     ("version", "Print the program version")
+     ("seq,s", po::value<string>(&inputName)->required(), "Fasta file containing the RNA sequence")
+     ("descfolder,d", po::value<string>(&motifs_path_name), "A folder containing modules in .desc format, as produced by Djelloul & Denise's catalog program")
+     ("jar3dcsv", po::value<string>(&motifs_path_name), "A file containing the output of JAR3D's search for motifs in the sequence, as produced by test_on_RNAstrand.py")
+     ("bayespaircsv", po::value<string>(&motifs_path_name), "A file containing the output of BayesPairing's search for motifs in the sequence, as produced by test_on_RNAstrand.py")
+     ("first-objective,c", po::value<unsigned int>(&MOIP::obj_to_solve_)->default_value(1), "Objective to solve in the mono-objective portions of the algorithm")
+     ("output,o", po::value<string>(&outputName), "A file to summarize the computation results")
+     ("theta,t", po::value<float>(&theta_p_threshold)->default_value(0.001), "Pairing probability threshold to consider or not the possibility of pairing")
+     ("type,f", po::value<int>(&obj_function_nbr), "What objective function to use to include motifs: square of motif size in nucleotides like "
     "RNA-MoIP (1), jar3d score (2), motif size + jar3d score + number of components (3), motif size + number of "
-     "components (4)")("verbose,v", "Print what is happening to stdout");
+     "components (4)")
+     ("disable-pseudoknots,n", "Add constraints forbidding the formation of pseudoknots")
+     ("verbose,v", "Print what is happening to stdout");
     po::variables_map vm;
     po::store(po::parse_command_line(argc, argv, desc), vm);
     basename = remove_ext(inputName.c_str(), '.', '/');
@@ -100,7 +95,6 @@ int main(int argc, char* argv[])
                     "structures with included known RNA modules."
                  << endl
                  << "developped by Louis Becquey (louis.becquey@univ-evry.fr), 2019" << endl
-                  << "NOTE THIS VERSION IS SUPPOSED TO MIMIC RNA MoIP (input data, objective functions, constraints)" << endl
                  << endl
                  << desc << endl;
             return EXIT_SUCCESS;
@@ -110,6 +104,7 @@ int main(int argc, char* argv[])
             return EXIT_SUCCESS;
         }
         if (vm.count("verbose")) verbose = true;
+         if (vm.count("disable-pseudoknots")) MOIP::allow_pk_ = false;
         if (!vm.count("jar3dcsv") and !vm.count("bayespaircsv") and !vm.count("descfolder")) {
             cerr << "\033[31mYou must provide at least --jar3dcsv, --bayespaircsv or --descfolder.\033[0m See --help "
                     "for more "
--- a/doc/.vscode/settings.json deleted 100644 → 0
View file @0d8a505
+++ b/doc/.vscode/settings.json deleted 100644 → 0
View file @0d8a505
- {
-     "files.associations": {
-         "array": "cpp",
-         "initializer_list": "cpp",
-         "string_view": "cpp",
-         "utility": "cpp"
-     }
- }
\ No newline at end of file
--- a/doc/motif_forcing_constraints.aux deleted 100644 → 0
View file @0d8a505
+++ b/doc/motif_forcing_constraints.aux deleted 100644 → 0
View file @0d8a505
- \relax 
- \citation{djelloul_automated_2008}
- \citation{petrov_automated_2013}
- \citation{reinharz2018mining}
- \citation{dirksAlgorithmComputingNucleic2004}
- \citation{zirbel_identifying_2015}
- \citation{cruz2011sequence}
- \citation{petrov_automated_2013}
- \@writefile{toc}{\contentsline {section}{\numberline {1}Motivation}{2}}
- \@writefile{toc}{\contentsline {section}{\numberline {2}Computation of basepair probabilities}{2}}
- \@writefile{toc}{\contentsline {paragraph}{Discussion of that choice}{2}}
- \@writefile{toc}{\contentsline {section}{\numberline {3}Detection of potential modules in the sequence input}{2}}
- \@writefile{toc}{\contentsline {paragraph}{\textbf  {Choice of a module model, choice of a sequence probabilistic model}}{2}}
- \citation{lorenz_viennarna_2011}
- \citation{sato_ipknot:_2011}
- \citation{reinharz_towards_2012}
- \citation{legendre_bi-objective_2018}
- \@writefile{toc}{\contentsline {paragraph}{\textbf  {Detection of the loops in an RNA sequence}}{3}}
- \@writefile{toc}{\contentsline {section}{\numberline {4}Formulation of the IP problem}{3}}
- \@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Variables}{3}}
- \citation{reinharz_towards_2012}
- \newlabel{objectives}{{4.2}{4}}
- \@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Objectives }{4}}
- \@writefile{toc}{\contentsline {subsection}{\numberline {4.3}9 Constraints to bind them all}{4}}
- \@writefile{toc}{\contentsline {paragraph}{Constraint to ensure there only is 0 or 1 canonical pairing by nucleotide}{4}}
- \newlabel{constraint:1}{{1}{4}}
- \@writefile{toc}{\contentsline {paragraph}{Constraints to forbid lonely base pairs}{4}}
- \newlabel{constraint:2}{{2}{4}}
- \newlabel{constraint:3}{{3}{4}}
- \@writefile{toc}{\contentsline {paragraph}{Constraint to forbid pairings inside a module component}{4}}
- \newlabel{constraint:4}{{4}{4}}
- \@writefile{toc}{\contentsline {paragraph}{Constraint to forbid component to overlap}{4}}
- \newlabel{constraint:5}{{5}{4}}
- \citation{legendre_bi-objective_2018}
- \citation{cplex}
- \@writefile{toc}{\contentsline {paragraph}{Constraints to respect the structure of large motives ($\{ x\in M \tmspace  +\thickmuskip {.2777em} | \tmspace  +\thickmuskip {.2777em} \delimiter "026B30D x\delimiter "026B30D  \geq 2\}$)}{5}}
- \newlabel{constraint:6}{{6}{5}}
- \newlabel{constraint:7}{{7}{5}}
- \newlabel{constraint:8}{{8}{5}}
- \@writefile{toc}{\contentsline {paragraph}{Constraint to forbid a previously found solution}{5}}
- \newlabel{constraint:9}{{9}{5}}
- \newlabel{methods}{{5}{5}}
- \@writefile{toc}{\contentsline {section}{\numberline {5}Methods }{5}}
- \@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Bi-objective algorithm}{5}}
- \@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Solving the IP problem}{5}}
- \citation{andronescu2008rna}
- \bibstyle{unsrt}
- \bibdata{RNA}
- \bibcite{djelloul_automated_2008}{1}
- \bibcite{petrov_automated_2013}{2}
- \bibcite{reinharz2018mining}{3}
- \bibcite{dirksAlgorithmComputingNucleic2004}{4}
- \bibcite{zirbel_identifying_2015}{5}
- \bibcite{cruz2011sequence}{6}
- \bibcite{lorenz_viennarna_2011}{7}
- \bibcite{sato_ipknot:_2011}{8}
- \bibcite{reinharz_towards_2012}{9}
- \bibcite{legendre_bi-objective_2018}{10}
- \@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Benchmarking of the module inclusion objective functions}{6}}
- \newlabel{results}{{6}{6}}
- \@writefile{toc}{\contentsline {section}{\numberline {6}Results }{6}}
- \@writefile{toc}{\contentsline {subsection}{\numberline {6.1}Comparison of the 3 objective functions for motif insertion}{6}}
- \newlabel{discussion}{{7}{6}}
- \@writefile{toc}{\contentsline {section}{\numberline {7}Discussion }{6}}
- \bibcite{cplex}{11}
- \bibcite{andronescu2008rna}{12}
--- a/doc/motif_forcing_constraints.blg deleted 100644 → 0
View file @0d8a505
+++ b/doc/motif_forcing_constraints.blg deleted 100644 → 0
View file @0d8a505
- This is BibTeX, Version 0.99d (TeX Live 2017/Debian)
- Capacity: max_strings=100000, hash_size=100000, hash_prime=85009
- The top-level auxiliary file: motif_forcing_constraints.aux
- The style file: unsrt.bst
- Database file #1: RNA.bib
- Warning--entry type for "cplex" isn't style-file defined
- --line 570 of file RNA.bib
- You've used 12 entries,
-             1791 wiz_defined-function locations,
-             524 strings with 6041 characters,
- and the built_in function-call counts, 2939 in all, are:
- = -- 250
- > -- 152
- < -- 0
- + -- 55
- - -- 43
- * -- 257
- := -- 487
- add.period$ -- 36
- call.type$ -- 12
- change.case$ -- 12
- chr.to.int$ -- 0
- cite$ -- 12
- duplicate$ -- 104
- empty$ -- 275
- format.name$ -- 43
- if$ -- 622
- int.to.chr$ -- 0
- int.to.str$ -- 12
- missing$ -- 11
- newline$ -- 63
- num.names$ -- 12
- pop$ -- 17
- preamble$ -- 1
- purify$ -- 0
- quote$ -- 0
- skip$ -- 38
- stack$ -- 0
- substring$ -- 237
- swap$ -- 12
- text.length$ -- 0
- text.prefix$ -- 0
- top$ -- 0
- type$ -- 0
- warning$ -- 0
- while$ -- 29
- width$ -- 14
- write$ -- 133
- (There was 1 warning)
--- a/doc/motif_forcing_constraints.fdb_latexmk deleted 100644 → 0
View file @0d8a505
+++ b/doc/motif_forcing_constraints.fdb_latexmk deleted 100644 → 0
View file @0d8a505
--- a/doc/motif_forcing_constraints.fls deleted 100644 → 0
View file @0d8a505
+++ b/doc/motif_forcing_constraints.fls deleted 100644 → 0
View file @0d8a505
--- a/doc/motif_forcing_constraints.log deleted 100644 → 0
View file @0d8a505
+++ b/doc/motif_forcing_constraints.log deleted 100644 → 0
View file @0d8a505
--- a/doc/motif_forcing_constraints.pdf
View file @06209de
+++ b/doc/motif_forcing_constraints.pdf
View file @06209de
--- a/doc/motif_forcing_constraints.tex
View file @06209de
+++ b/doc/motif_forcing_constraints.tex
View file @06209de
@@ -57,14 +57,14 @@
 	\vspace{5cm}
 	
 	\hrulefill
- 	\flushright{		
+ 	\flushright{
 		\textbf{under the supervision of:\\}
- 			Fariza Tahi, HDR \\
- 			Eric Angel, HDR \\
- 			\textit{AROBAS, IBISC, Paris-Saclay University}\\
- 			~ \\
- 			\includegraphics[height=1cm]{logoIBISC.png} ~ ~ 
- 			\includegraphics[height=1cm]{minisaclay.png}
+ 		Fariza Tahi, HDR \\
+ 		Eric Angel, HDR \\
+ 		\textit{AROBAS, IBISC, Paris-Saclay University}\\
+ 		~ \\
+ 		\includegraphics[height=1cm]{logoIBISC.png} ~ ~ 
+ 		\includegraphics[height=1cm]{minisaclay.png}
 	}
 	\flushleft
 	\hrulefill
@@ -83,13 +83,13 @@
 We now have quite consistent RNA module databases (RNA3Dmodule \cite{djelloul_automated_2008}, the RNA Motif Atlas \cite{petrov_automated_2013}, or CaRNAval \cite{reinharz2018mining}). 
 We call a RNA module the combined description of these points:
 \begin{itemize}
- 	\item A particular base-pairing pattern of canonical \& wobble pairing (bi-dimensional information),
+ \item A particular base-pairing pattern of canonical \& wobble pairing (bi-dimensional information),
 	\item A particular organisation of non-canonical contacts in space (tri-dimensional information).
 	\item A sequence or consensus sequence that we know to adopt that particular base-pairing organisation (sequence information), or at least a probabilistic model to predict if a given sequence will fold according to the module.
 \end{itemize}
 Then, a RNA module is direct data about how a sequence can fold in space. Here, we are interested in using them to predict the bi or tri dimensional structure of an RNA sequence from scratch. 
 Then, the algorithm we plan to use is the following:
- 	
+ 
 \begin{itemize}
 	\item Find all possible occurences of known RNA modules in the query sequence by using the sequence probabilistic models of the modules against the query bases,
 	\item Define constraints on the secondary structure imposed by motives if they would be included
@@ -130,7 +130,7 @@ This computation should be avoided in the future : It requires the computation o
 Unfortunately, to avoid computing it twice, we should reimplement ourselves the algorithms instead of using the actual implementations. We have programs that output the basepair matrix but no structures, and programs that output structures but not the basepair matrix, and we don't have a program that can do both.
 
 \section{Formulation of the IP problem}
- 	
+ 
 This formulation is an improved merge between the IPknot \cite{sato_ipknot:_2011} formulation to fold RNAs with pseudoknots (but without "levels") and RNAMoIP's one \cite{reinharz_towards_2012} to include modules.
 The bi-objective algorithm is inspired from BiokoP \cite{legendre_bi-objective_2018} and detailed later in section \ref{methods}.
 % This problem has been inspired by IPknot's model \cite{sato_ipknot:_2011}, but without the use of what they call \textit{levels}, which are pseudoknot-free
@@ -138,10 +138,10 @@ The bi-objective algorithm is inspired from BiokoP \cite{legendre_bi-objective_2
 % Levels are mutually exclusive (a base cannot be paired in two different levels). The more levels you use, the more decision variables you have, 
 % but the more complex pseudoknots you are able to predict. IPknot and BiokoP use 2 levels, to predict simple knots.\\
 % \begin{figure}[h]\centering \includegraphics[height=5cm]{levels.png}\end{figure}
- 	
- 	
- \subsection{Variables} 
- 	
+ 
+ 
+ \subsection{Variables}
+ 
 % Let $m$ be the number of levels that our program uses.\\
 Let $n$ be the number of nucleotides in the query RNA sequence $s$.\\
 Let $M$ be the set of modules that could be inserted in $s$ (a loop from $s$ has a good JAR3D score against one of the modules).\\
@@ -151,8 +151,8 @@ As the same module model can be inserted several times in $s$, several different
 Let $k_{x,i}$ be the size in nucleotides of that $i$th component of $x$.\\
 Let $y^u_v$ be the \textbf{decision boolean variable} indicating that $s[u]$ and $s[v]$ form a canonical base pairing. According to the standard loop model, we always have $v > u + 3$.\\
 Let $C^x_i$ be the \textbf{decision boolean variable} indicating that we do insert the $i$th component of module $x$ at position $P_{x,i}$.
- 	
- 	
+ 
+ 
 Note that a base pair $y^u_v$ is possible if and only if $v>u+3$, and that we do not need to use two variables $y^u_v$ and $y_{vu}$ for the same pair. 
 Then, we have $\sum_{i=4}^n (n-i)$ decision variables ($\approx \frac{1}{2}n^2$ decision variables) of the form $y^u_v$.
 Regarding the $C^x_i$, if we have an average insertion of $\nu$ motives by RNA sequence, the motives having in average $\mu$ components, components that can be inserted in average at $\pi$ different positions in $s$,
@@ -168,7 +168,7 @@ Let $X$ be the vector of all our decision variables, we define the following obj
 \[ f_{1B}(X) = \sum_{x \in M} p(x) \times C^x_1 \]
 \[ f_{1C}(X) = \sum_{x \in M} \left[ \frac{\|x\|}{\log_2(\sum_{i=1}^{\|x\|}k_{x,i})} \times p(x) \times C^x_1 \right]\]
 $$ f_2(X) = \sum_{u<v} p_{uv}\times y^u_v \times I[p_{uv}>\theta], \qquad \qquad
- p_{uv} = \sum_{\sigma \in S(s)} y^u_v.p(\sigma | s)$$
+ 	p_{uv} = \sum_{\sigma \in S(s)} y^u_v.p(\sigma | s)$$
 The different $f_1$ objectives are supposed to maximize the number of inserted motives in $s$, 
 weighted by their number of components (squared for 1A), or the JAR3D score (B and C). 
 In $1C$, a penalty is added on the number of nucleotides involved in the looped zone (sum of $k_{x,i}$) to avoid long unpaired zones. 
@@ -183,7 +183,7 @@ Note that \(f_{1A}\) is taken from RNA MoIP \cite{reinharz_towards_2012}, to com
 \begin{equation} \label{constraint:1}
 	\sum_{v<u} y^v_u + \sum_{v>u} y^u_v \leq 1 \qquad\qquad \forall u \in \llbracket 1,n \rrbracket
 \end{equation}
- 	
+ 
 \paragraph{Constraints to forbid lonely base pairs} ~
 % \begin{equation} \label{constraint:2}
 % 	\sum_{v=u}^n y^{u-1}_v - \sum_{v=u+1}^n y^u_v + \sum_{v=u+2}^n y^{u+1}_v \geq 0 \qquad \qquad \forall u \in \llbracket 1,n\rrbracket
@@ -208,14 +208,14 @@ Then, this condition sets to zero "lonely decision variables" who have no neighb
 	(k_{x,i}-2) \; C^x_i + \sum_{u=P_{x,i}+1}^{P_{x,i}+k_{x,i}-2}\left[ \sum_{v>u} y^u_v + \sum_{v<u} y^v_u \right] \leq (k_{x,i} - 2)
 	\qquad \qquad \forall x \in M, i \in \llbracket 1,\|x\| \rrbracket
 \end{equation}
- 	
+ 
 \paragraph{Constraint to forbid component to overlap} ~
 \begin{equation} \label{constraint:5}
 	\sum_{x \in M} \sum_{i=1}^{\|x\|} C^x_i \times I( u \in [ P_{x,i} ; P_{x,i}+k_{x,i}-1]) \leq 1 \qquad \qquad \forall u \in \llbracket 1,n \rrbracket
 \end{equation}
 $I( u \in [ P_{x,i} ; P_{x,i}+k_{x,i}-1])$ is a booleean value depending on the condition's truth. Then, whatever the nucleotide $u$, it can be part of a module component only once.
 % \begin{center}\includegraphics[height=3cm]{component.png}\end{center}
- 	
+ 
 \paragraph{Constraints to respect the structure of large motives ($\{ x\in M \; | \; \|x\| \geq 2\}$)} ~ 
 
 This constraint ensures that none or all the components of a motif are inserted.