main_bioinformatics.tex 51.9 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462
\documentclass{bioinfo}
\copyrightyear{2019} \pubyear{2019}

\access{Advance Access Publication Date: Day Month Year}
\appnotes{Manuscript Category}

\usepackage[ruled,vlined]{algorithm2e}
\usepackage{xcolor}

\begin{document}
\firstpage{1}

\subtitle{Subject Section}

\title[BiORSEO]{BiORSEO: A bi-objective method to predict RNA secondary structures with pseudoknots using RNA 3D modules}
\author[Becquey \textit{et~al}.]{Louis Becquey\,$^{\text{\sfb 1,}*}$, Eric Angel\,$^{\text{\sfb 1}}$ and Fariza Tahi\,$^{\text{\sfb 1}*}$}
\address{$^{\text{\sf 1}}$IBISC, Univ Evry, Universite Paris-Saclay, 91025, Evry, France}

\corresp{$^\ast$To whom correspondence should be addressed.}

\history{Received on XXXXX; revised on XXXXX; accepted on XXXXX}

\editor{Associate Editor: XXXXXXX}

\abstract{\textbf{Motivation:} RNA loops have been modelled and clustered from solved 3D structures into ordered collections of recurrent non-canonical interactions called "RNA modules", available in databases. This work explores what information from such modules can be used to improve secondary structure prediction. 
We propose a bi-objective method for predicting RNA secondary structures by minimizing both an energy-based and a knowledge-based potential. The tool, called \textsc{BiORSEO}, outputs secondary structures corresponding to the optimal solutions from the Pareto set.\\
\textbf{Results:}  We compare several approaches to predict secondary structures using inserted RNA modules information: two module data sources, Rna3Dmotif and The RNA 3D Motif Atlas, and different ways to score the module insertions: module size, module complexity, or module probability according to models like JAR3D and BayesPairing. We benchmark them against a large set of known secondary structures, \textcolor{red}{including some state-of-the-art tools, and comment on the usefulness of the half physics-based, half data-based approach}.\\ % Some of the tested methods present a good performance, especially on structures containing pseudoknots. They are compared to state of the art tools for secondary structure prediction.\\
\textbf{Availability:} The software \textcolor{red}{is} available for download on the \href{https://evryrna.ibisc.univ-evry.fr/evryrna/BiORSEO/}{EvryRNA website}, as well as the datasets.\\
\textbf{Contact:} \href{louis.becquey@univ-evry.fr}{louis.becquey@univ-evry.fr}, \href{fariza.tahi@univ-evry.fr}{fariza.tahi@univ-evry.fr}\\
\textbf{Supplementary information:} Supplementary sections are available at \textit{Bioinformatics}
online.}

\maketitle

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}
Ribonucleic acid (RNA) is a macromolecule which is often single-stranded. Therefore, the strand has the ability to fold in space in more complex ways than DNA, that we mostly know to form double-stranded stems. A stem is a succession of basepairs called Watson-Crick basepairs, or "canonical", stacked on top of each other. As this can still happen with RNA, we also observe several other ways for a nucleotide to interact with another. For example, Leontis and Westhof proposed a classification of 12 non-canonical basepairs~(\citealp{leontis2001geometric}). Some of the nucleotides can also interact with the 2'OH of a ribose, or with a phosphate, or even not interact at all and bulge out the RNA structure.
   
\paragraph{Modelling RNA structures as graphs} ~ 
For modelling purposes, researchers working on computational problems involving RNA represent them with graphs.  A recent article~(\citealp{schlick2018adventures}) details the different graph models of RNAs and their respective advantages. 
We are particularly interested in the secondary structure graph of the RNA, i.e. a graph where the nucleotides are nodes, and backbone bonds and canonical basepairs are edges. In this kind of graph, the non-canonical interactions do not appear. 
As the problem of predicting the 3D structure of an RNA from sequence has been too computationally expensive for years, and is still difficult, a common first step has been to predict this secondary structure (2D) graph, by computing what regions will form stems and what regions will \textcolor{red}{not be Watson-Crick basepaired}, forming so-called loops. 
In many cases, the solution to the 2D folding problem is not unique, and RNAs have the ability to switch between several metastable conformations. Most approaches use dynamic programming schemes to compute the RNA partition function and/or canonical pairing probabilities, i.e. the probability for each nucleotide to form a canonical base-pair with every other nucleotide, or to remain unpaired~(\citealp{mccaskill1990equilibrium}). Once the  partition function is computed, several models exist to rebuild one or several best structure(s): we can choose the Minimum Free Energy (MFE) structure, the one that maximizes expected accuracy (MEA), or the centroid of the ensemble. The most used implementations are some low complexity implementations such as \texttt{RNAFold} and \texttt{Fold}~(\citealp{lorenz2011viennarna}, \citealp{mathews2004using}). An important limitation of these algorithms is their inability to model so-called \textit{pseudoknotted} structures, i.e. structures with basepairs $(i,j)$ and $(k,l)$ when $i<k<j<l$. 
Later, some variants taking pseudoknots into account were developped, e.g. in the NUPACK package~(\citealp{dirksAlgorithmComputingNucleic2004}) or \texttt{ProbKnot} from the RNAstructure package~(\citealp{bellaousov2010probknot}).
We can also cite Biokop (\citealp{legendre_bi-objective_2018}), a recent tool that uses both MFE and MEA criterions in a bi-objective framework, and returns optimal and suboptimal structures including pseudoknots.

RNAs can be seen as an assembly of stems and loops. To move from the planar 2D graph to 3D, one usually predicts the 3D structure of stems and loops separately. Stems are relatively easy to tackle because of the isostericity of the Watson-Crick basepairs; their structure has been widely observed and features low variability. On the other hand, to accurately model loops in 3D, one needs to take the non-canonical interactions into account. In many cases, two hairpin loops can even form new canonical basepairs to form \textit{kissing hairpins}, a particular pseudoknot type (called HHH) which is hard to predict because it involves a distant basepair between two unpaired loop regions. More nomenclature details about loops and pseudoknots are provided in Supplementary Section~A.

\paragraph{\textcolor{red}{Modelling loops as modules}} ~  Several works have gathered 3D crystal structures involving RNA chains, extracted the loops from those RNA chains and annotated the base contacts using MC-Annotate (\citealp{gendron2001quantitative}), FR3D (\citealp{sarver_fr3d:_2008}) or DSSR (\citealp{lu_dssr:_2015}). 
They model the loops with more detailed graphs describing non-canonical contacts on their edges. 
The graphs can then be clustered with respect to a similarity or isomorphism measure, and the sequence variations over the nucleotides of the loop can be modeled. 
Those models are called RNA \textit{modules}, i.e. an ordered collection of non-canonical basepairs or stacking interactions, leading to a conserved 3D shape in different RNA molecules. 
We can cite the work from (\citealp{djelloul_automated_2008}) with Rna3Dmotif, a pipeline that extracts terminal hairpin loops (HL), internal loops (IL), and multiple loops (ML) from structures annotated by FR3D, and can cluster them using a graph similarity metric. 
Another one is the RNA 3D Motif Atlas~(\citealp{petrov_automated_2013}), which does not support multiple loops, but clusters the loops using all sequence information, nucleotide contacts and shape information, which leads to loop module models with tolerance in sequence and length variations. 
\textcolor{red}{More recent ones are RNAMotifClusters (\citealp{ge2018novo}), and also} CaRNAval (\citealp{reinharz2018mining}), an approach that enables to model a wide variety of structural features such as multipairs, multi-stranded loops, and pseudoknots. 
To be exhaustive, we also can cite RNA Bricks 2 (\citealp{chojnowski2014rna}), which has the particularity to also study contacts with protein chains. 

\textcolor{red}{Further work has been done to help researchers detecting if some RNA sequence folds following some known module model, user-provided, or searched in a database.}
For example, JAR3D (\citealp{zirbel_identifying_2015}) can score the modules from the RNA 3D Motif Atlas against a query \textcolor{red}{loop} sequence. 
\textcolor{red}{With RMDetect~(\citealp{cruz2011sequence}), the authors proposed to build bayesian networks from any RNA module's graph to summarize all the observed sequence variants of the module. 
The metaRNAmodules pipeline (\citealp{theis2013automated}) can then be used to build bayesian networks for Djelloul \& Denise's modules and detect them in aligned RNA sequences.  
The recent BayesPairing tool (\citealp{sarrazin2019automated}) can do the same for modules from several databases, and allows to detect them in single RNA sequences.}

\paragraph{\textcolor{red}{Secondary structure prediction using RNA modules}} ~ In this paper, \textcolor{red}{we want to test if the knowledge that a subsequence matches well some module model helps identifying a loop in the RNA, when predicting the secondary structure.} 
In other words, we do not use the 3D data to \textcolor{red}{propose 3D conformations for the loops}, but we use it to score 2D conformations. 
A first attempt \textcolor{red}{to use modules to guide RNA 2D structure predictions} is RNA-MoIP~(\citealp{reinharz_towards_2012}). 
The tool \textcolor{red}{inserts modules from Rna3Dmotif in a set of candidate solutions} (often given by a simple tool like RNAsubopt~(\citealp{lorenz2011viennarna})) to \textcolor{red}{return a single} 2D structure with non-canonical base-pairs included, \textcolor{red}{which is then a better input} to give to the 3D reconstruction tool MC-Sym~(\citealp{parisien2008mc}), resulting in better prediction of 3D structures. 
\textcolor{red}{Another example is the use of JAR3D and metaRNAmodules by (\citealp{theis2015rna}) to predict secondary structures at genome-wide scale, using an alignment of 13 genomes. 
This last study shows the use of modules lowers the false discovery rate of secondary structure elements. 
This is encouraging, but as we focus more on short, unaligned single RNA sequences, we compare ourselves to RNA-MoIP in this study. 
Going further than metaRNAmodules, BayesPairing allows to detect modules from more databases and without sequence alignments, so we are rather considering it in this work.}

\textcolor{red}{When inserting modules into secondary structures, RNA-MoIP sometimes has to break basepairs. 
It tries to find a compromise between staying close to the initial structure, and insert more modules. Unfortunately, }it cannot distinguish important base-pairs from less important ones, and might break some of the ones stabilizing a whole stem while inserting a module, resulting in less probable structures as output. \textcolor{red}{Evidence is provided in Supplementary Section B.}
To \textcolor{red}{overcome this problem}, we design a method which builds a 2D structure by simultaneously placing base-pairs and modules in a single step, taking into account two objectives: the expected accuracy of the structure in the equilibrium ensemble fold, and a custom function that reflects the number and quality of inserted modules (several models are studied). 
This method leads to our new tool BiORSEO (Bi-Objective RNA Structure Efficient Optimizer). 
Our approach avoids using a weighted linear combination of the objectives as done in RNA-MoIP (which can miss interesting \textcolor{red}{so-called \textit{non-supported} solutions}). In this paper, we use a bi-objective Pareto-based approach, i.e. we identify all the non-dominated structures (the structures for which no other structure scores better on the two objectives).\\

The paper is organized as follows. In the next section, we present the module models sources, the insertion models and some objective functions, and the procedure to compare them. Then we present a benchmark of all those variants against reference tools in Section \ref{sec:results}, using two reference datasets. \textcolor{red}{We also discuss on the usefulness of the module criteria.}

\begin{figure*}[t]
   \includegraphics[width=\textwidth]{graph_abstract.jpg} 
   \caption{The information flow in our method. The inputs are the RNA sequence and module models from a database. First, we compute the pairing probabilities based on~(\citealp{dirksAlgorithmComputingNucleic2004}) and the probable insertion sites using 3 different methods listed in Section \ref{sec:models}. Then, we can define two objectives and linear constraints to compute the Pareto set of secondary structures for the input sequence, by solving a bi-objective integer linear program (ILP).}
   \label{fig:pipeline}
\end{figure*}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{methods}
\section{Methods}\label{sec:methods}
We compare two databases of modules: (1) the module data-set used by RNA-MoIP ~(\citealp{reinharz_towards_2012}), in the DESC file format of Rna3Dmotif~(\citealp{djelloul_automated_2008}); (2) modules from the RNA 3D Motif Atlas v3.2, as provided on the BGSU website, see~(\citealp{petrov_automated_2013}) for more information. Our main procedure is the following:
\begin{itemize}
\item \textbf{Pattern-matching step:} Find all possible occurrences of known RNA modules in the query sequence, by finding subsequences of the query that score well with the probabilistic models of the modules (several models are compared).
\item \textbf{Constraints definition step:} Define constraints on the secondary structure imposed by modules if they would be included (in this case, \textcolor{red}{the closing basepairs of the module are mandatory}).
 \item \textbf{Optimization step:} Find a secondary structure that satisfies as much as possible both the expected accuracy of the structure and a criterion taking into account module inclusions, by solving a bi-objective integer linear programming (ILP) problem, using the constraints defined in the previous step.
\end{itemize}
 
The ILP framework used to define the constraints and solve the resulting optimization problem is similar to previous works like IPknot (\citealp{sato_ipknot:_2011}), RNA-MoIP~(\citealp{reinharz_towards_2012}) or Biokop (\citealp{legendre_bi-objective_2018}).
We chose the ILP approach because it has proven its outperformance with IPknot and Biokop when it comes to pseudoknot prediction, see (\citealp{legendre_bi-objective_2018}). In particular, any type of pseudoknot can be predicted. 

Figure \ref{fig:pipeline} summarizes the \textcolor{red}{whole} procedure on a graphical pipeline.

\subsection{Pattern matching step}\label{sec:models}
Several methods have been proposed to determine if a sequence (or a part of it) is likely to fold following a given module. \textcolor{red}{We benchmarked the following ones:}

\paragraph{Direct pattern matching} ~ The simplest approach when no statistical model is available is to use a regular expression and direct pattern matching against the input sequence. This is the approach used by RNA-MoIP. We used it with the Rna3Dmotif data as presented in RNA-MoIP's article ~(\citealp{reinharz_towards_2012}), dealing with special cases in the same way (very short components, wildcards).

\paragraph{JAR3D} ~ For each motif group in the RNA 3D Motif Atlas, \textcolor{red}{~(\citealp{zirbel_identifying_2015}) built} a probabilistic model for sequence variability. 
Their implementation, called JAR3D, takes user-provided loop sub-sequences in input and outputs a score for every motif of the Atlas on every provided loop. This method has the advantage to allow variations in sequence length compared to the module model. Unfortunately, it can only be used for hairpin and internal loops. Another major drawback is that it requires the computation of the pairing probabilities to first locate \textit{where} the most probable loops are, to give them as input to JAR3D. We therefore use RNAsubopt first to get the positions of the probable loops. JAR3D has been developed for modules from the RNA 3D Motif Atlas and can only be tested with them.

\paragraph{\textcolor{red}{BayesPairing's sequence probability distributions}} ~ When we have data for several instances of a module, we can estimate a probabilistic distribution of the nucleotides over the module nodes. A first intuitive approach is to use the base frequencies. But as paired nucleotides are not independent at all, it is more rigorous to model those dependencies. An approach proposed in ~(\citealp{cruz2011sequence}) is to transform the module's graph into a Bayesian network, which models the dependencies between nucleotide probabilities at every node of the graph. 
\textcolor{red}{BayesPairing~(\citealp{sarrazin2019automated}) automates the building of bayesian networks for every module.}
A large number of sequences are sampled using the Bayesian network, and they are pattern-matched against the query to find occurrences. 
An additional step compares the free energy of the structure with and without the constraint of each matched module, and selects only the candidate sites that do not deteriorate too much the energy. 
This last step is \textcolor{red}{not required here}, first because it \textcolor{red}{repeats the calculation} of the partition function, and then because we would try to insert modules \textcolor{red}{with BiORSEO} that were pre-selected \textcolor{red}{by BayesPairing} to be appropriate. 
Therefore we chose to ignore this last step and let our optimizer select the pertinent modules in the candidates. BayesPairing can be used for both data sources.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Constraints definition step and IP model} \label{sec:ip}
The full list of variables we used to model the problem in an integer linear program and the linear formulation of each constraint are detailed in Supplementary Section C. Here we propose different objective functions to score the candidate module insertions. %, whose performances are compared in section \ref{sec:results}.

\paragraph{Notations} ~ We call \textit{component} a piece of strand which forms an unpaired portion of a module. A HL has one, an IL or bulge has two, and a ML has more (see Supplementary Section A). Components of a module are linked together by canonical base-pairs at their extremities to form a loop. Let $x$ be a module which could be inserted at some defined position in the sequence. Let $\|x\|$ be the number of components of this module, and $k_{x,i}$ the nucleotide count of the $i$th component of $x$. When a scoring model is used (JAR3D or BayesPairing), we denote $p(x)$ the score value of $x$ inserted at the defined position. Let $p_{uv}$ be the probability for nucleotides $u$ and $v$ (with $v>u+3$) to form a canonical base-pair. We use NUPACK's dynamic programming scheme~(\citealp{dirksAlgorithmComputingNucleic2004}), which supports pseudoknots, to compute such probabilities. We denote $y^u_v$ the binary decision variable indicating that these nucleotides do form a canonical base pair, and $C^x_1$ the decision binary variable indicating whether the module $x$ will be inserted or not. The resolution of the ILP outputs solutions by fixing definitive values for the different $y^u_v$ and $C^x_1$. 

\paragraph{Objective functions} ~ The more modules that are included, the more information about set and unset base-pairs, and eventually about tertiary folds of the loops in space. 
So maximizing the number of modules could be a valid criteria. But, a disadvantage of such a criteria is that it penalizes MLs with large $\|x\|$, because the insertion of a ML forbids at the same time the insertion of several ILs or bulges in place. 
RNA-MoIP uses the sum of the squared nucleotide count over the components to try to encourage large modules~(\citealp{reinharz_towards_2012}). This is our first benchmarked criteria $f_{1A}$. 
Conversely, we could also try to maximize the number of components $\|x\|$ in the module. 
Then, we \textcolor{red}{also could} penalize a module insertion by the logarithm of the number of nucleotides involved in the looped zone (sum of the $k_{x,i}$) to avoid very long unpaired zones. 
We introduce such a penalty in criteria $f_{1B}$. We also define two more criteria, $f_{1C}$ which uses only the score returned by JAR3D or BayesPairing, and $f_{1D}$ which includes all the presented terms. 
Let $X$ be the set of all our decision variables, then the different objective functions to maximize are:
\begin{equation}f_{1A}(X) = \sum_{x} \sum_{i=1}^{\|x\|} k_{x,i}^2 \times C^x_1\label{eq:A}\end{equation}
\begin{equation}f_{1B}(X) = \sum_{x} \left[ \frac{\|x\|}{\log_2(\sum_{i=1}^{\|x\|}k_{x,i})} \times C^x_1 \right] \label{eq:B}\end{equation}
\begin{equation}f_{1C}(X) = \sum_{x} p(x) \times C^x_1 \label{eq:C}\end{equation}
\begin{equation}f_{1D}(X) = \sum_{x} \left[ \frac{\|x\|}{\log_2(\sum_{i=1}^{\|x\|}k_{x,i})} \times p(x) \times C^x_1 \right]\label{eq:D}\end{equation} \\
Regarding the second objective, aimed at maximizing the expected accuracy of the structures, we use $f_2(X) = \sum y^u_v \times p_{uv} \times I[p_{uv}>\theta]$. As first proposed by~(\citealp{sato_ipknot:_2011}), $f_2$ uses a parameter $\theta = 0.001$ to ignore very unlikely base-pairs. This prevents the explosion of the number of variables and allows a fast resolution of the IP problem.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Optimization step}
We use a simple dichotomic search algorithm (presented in Figure \ref{fig:findP}) to find the Pareto set of the bi-objective problem. On the first pass of the dichotomy, it solves iteratively a mono-objective problem with a constraint on the second objective, requiring it to be in an interval $[\lambda_{min}, \lambda_{max}]$. For example, if we decide to maximize objective 1; every-time a new non-dominated solution is found, $\lambda_{min}$ is set just above the new solution's objective 2 value. We then search and find another solution with a worse objective 1 value but a higher objective 2 value than previously found solutions. The second pass of the dichotomy searches below newly found solutions. In fact, it is required to search for superposed solutions to Pareto optimal ones. This is important when the criteria used to rank inserted modules is not able to separate them very well; many solutions therefore get the same $f_1$ score. This algorithm is implemented  in C++ using the CPLEX solver concert technology (\href{https://www.ibm.com/analytics/optimization-modeling-interfaces}{ILOG CPLEX Optimizer 12.8}).
\begin{figure}[!tbp]
\begin{algorithm}[H]
F:= $\emptyset$\;
\tcp{find the extrema of the Pareto front:}
L1:= maximize($f_1$, $-\infty$, $+\infty$, F)\;
L2:= maximize($f_2$, $-\infty$, $+\infty$, F)\;
\tcp{Add L1 to the results:}
\textit{R} := $\{$L1$\}$\;
\tcp{search on top of L1:}
search\_between($f_2(\text{L1}) + \epsilon$, $f_2(\text{L2})$)\;
\tcp{search if solutions superposed to L1 exist:}
search\_between($-\infty$, $f_2(\text{L1})$)\;
\Return{R}\;
\caption{FindParetoSet()}
\end{algorithm}

\begin{algorithm}[H]
$s$:= maximize($f_1$, $\lambda_{min}$, $\lambda_{max}$, F)\;
\If{$s \neq \emptyset$}{
   F:= F $\cup \{s\}$\;
   \If{$\nexists x \in R$ \textcolor{red}{such that} $x>s$}{
         \tcp{solution is undominated, add it to \textit{R}}
       \textit{R} := \textit{R} $\cup \{s\}$\;
       \While{$\exists x \in R$ \textcolor{red}{such that} $s>x$}{
             \tcp{remove dominated solutions}
           \textit{R} := \textit{R}$\setminus \{x\}$\; 
       }
         \tcp{search on top of $s$}
       search\_between($f_2(s) + \epsilon$, $\lambda_{max}$)\;
       \If{$\lambda_{max} - \lambda_{min} > \epsilon$}{
             \tcp{search if another solution superposed to $s$ exists}
           search\_between($\lambda_{min}$, $f_2(s)$)\;
       }
   }
}
\caption{search\_between($\lambda_{min}$, $\lambda_{max}$)}
\end{algorithm}

\caption{The dichotomic search algorithm to find the Pareto set. F is the ensemble of already-found structures which grows over time, and that we forbid the solver to find again. R is the set of Pareto-optimal solutions. L1 and L2 are the best solutions to the mono-objective problems regarding $f1$ and $f2$. $\mathbf{maximize}$($f$, $\lambda_{min}$, $\lambda_{max}$, F) is a procedure that \textcolor{red}{maximizes} the function $f$ (mono-objective IP problem) under the constraint that the other one has to be in interval $[\lambda_{min}$, $\lambda_{max}]$, and with the solutions in F forbidden. The inequality sign $a>b$ between two solutions denotes that solution $a$ dominates solution $b$.}\label{fig:findP}
\end{figure}
\end{methods}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Results}\label{sec:results}
\begin{figure*}[!tbp]
   \includegraphics[width=\textwidth]{Benchmark_unconstrained.jpg}
   \caption{\textcolor{red}{First, second and third quartiles of the max MCC for each RNA, for all tools and BiORSEO variants.} 
   (A); Results of the methods that cannot find pseudoknots on the RNA-Strand dataset: RNAsubopt, RNA-MoIP, and the 14 variants of BiORSEO's bi-objective methods with a constraint that explicitly forbids pseudoknots. 
   (B); Results of the methods which allow pseudoknot predictions on the RNA-Strand dataset: Biokop, and the 14 BiORSEO variants without the no-pseudoknot constraint. 
   (C); Results of every method on the Pseudobase dataset. 
   \textcolor{red}{Due to the combinatorial issues commented in section \ref{sec:mat}, we counted only RNAs that could be predicted with every method: 287/344 in (A), 281/344 in (B), and 236/264 in (C).}}
   \label{fig:benchmark}
\end{figure*}
All the method \textcolor{red}{variants} introduced return an ensemble of possible secondary structures \textcolor{red}{(the Pareto set)} for a given input sequence. 
We compare them in a benchmark. \textcolor{red}{A small case study on three well known RNAs can be found in Supplementary Section D.}
\subsection{Benchmark protocol} \label{sec:bench}

\paragraph{Benchmark data sources} \label{sec:data}
A first dataset of RNA secondary structures was extracted from the RNA-Strand database ~(\citealp{andronescu2008rna}). We selected the RNAs for which experimental proof of the structure exists, with size varying between 10 and 100 nucleotides. Sequences containing modified nucleotides were discarded. The resulting set contains 344 secondary structures of various RNA families, 74 of them containing pseudoknots. We repeated the experiments twice: first, by forbidding explicitly the formation of pseudoknots with additional constraints (for fair comparison with RNA-MoIP). 
Then, a second experiment without such limitation, to reach maximum performance. In addition, to explicitly assess the performance on pseudoknotted RNAs, methods were tested on a second collection of 264 pseudoknotted-only RNAs from the Pseudobase database~(\citealp{van2000pseudobase}), covering all pseudoknot families, and of the same length range.

\paragraph{Reference comparison methods}
To study the usefulness of the module data sources, objective functions, and module placement methods, we added state-of-the art tools to the comparison. 
The same RNA sequences were submitted to RNAsubopt + RNA-MoIP for direct performance comparison. 
\textcolor{red}{In 'one by one' mode, RNA-MoIP inserts modules in every solution from RNAsubopt one by one, and we select the best solution ourselves like we do with BiORSEO (see the following paragraph \textit{Metrics}). In 'chunk' mode, RNA-MoIP takes all the RNAsubopt predictions at once and returns only one solution, supposed to be the best in the input set according to its own objective function.} 
We used RNAsubopt as a reference method without pseudoknot support, because it is fast, widely used, easy to understand and returns several solutions \textcolor{red}{that are in an energy range from the MFE (default parameter used)}. 
We used Biokop, \textcolor{red}{another} bi-objective ILP framework, as a reference method for prediction of secondary structures with pseudoknots. 
Both tools \textcolor{red}{RNAsubopt and Biokop out-perform} other state-of-the-art tools in their respective categories \textcolor{red}{(see  (\citealp{lorenz2011viennarna}) and  (\citealp{legendre_bi-objective_2018})} for more benchmarks against other tools\textcolor{red}{)}.

\paragraph{Metrics} ~ We compute the Matthews correlation coefficient (MCC) between the real secondary structure and every proposed structure. 
The coefficient is defined as:
{\small
\begin{equation}
   MCC = \frac{TP. TN - FP. FN}{\sqrt{(TP+FP)(TP+FN)(TN+FP)(TN+FN)}}. \label{eq:MCC}
\end{equation}
}\noindent
where TP, TN, FP and FN are the number of true/false positive and negative predicted base pairs.
\textcolor{red}{The choice of MCC over accuracy or F1 score is justified by the large difference between the size of the classes: in any secondary structure, there exist much more pairs of nucleotides that do not interact than pairs that do.
Here, we wonder if the "true" structure (which is only a possible state that has been observed and reported in a database) can be found in the Pareto set.
Therefore, we chose to keep the maximum MCC value found over the set of proposed structures as a metric of the method's performance, showing that a solution is close or not to this "true" structure. This is, to our knowledge, the best way to relate if the reference structure is part of our ensemble of solutions.} For comprehensiveness, results with average MCC are also provided in Supplementary Section E. 


\subsection{Benchmark results}
Performance results under the form of \textcolor{red}{maximum} MCC are summarized in Figure \ref{fig:benchmark}. No data source, nor objective function taken alone performs significantly better than the other ones.
Without pseudoknots, RNAsubopt out-performs the other tools (see Figure \ref{fig:benchmark}(A)). The second most performing model is \textcolor{red}{BiORSEO, using Rna3Dmotifs modules, placed by regular expression searches only, and scored with module size ($f_{1A}$). } With pseudoknots support, most of the RNAs are predicted with small pseudoknots as the method allows it. As Figure \ref{fig:benchmark}(B) shows, the methods \textcolor{red}{are still all very close in terms of performance, including Biokop} which performs \textcolor{red}{as well} without module information. \textcolor{red}{The best BiORSEO variant is the same, and it out-performs Biokop for 60 RNAs, which is only 17\% of the cases. }
The results on the dataset of pseudoknotted-only structures are presented on Figure~\ref{fig:benchmark}(C).
\textcolor{red}{Again}, the 14 variants are very close and no module source, pattern-matching method nor objective function distinguishes itself. \textcolor{red}{But this time, Biokop performs better in average.}

\paragraph{\textcolor{red}{Number of solutions}} ~ \textcolor{red}{As shown in Figure~\ref{fig:nsol}}, \textcolor{red}{all the BiORSEO models} return very small sets of unique secondary structures, some of them being one optimal solution sometimes for example when using JAR3D. Meanwhile, RNAsubopt returns from one to \textcolor{red}{tens} of solutions (with \textcolor{red}{default energy gap} settings)\textcolor{red}{, and Biokop even more. The RNA-MoIP goal is conserved here with our approach: identify a secondary structure among the MEA suboptimals (or a few, but rarely more than 5) based on its compatibility with known modules. Note that the selection of the max MCC solution in the results set naturally favors RNAsubopt and Biokop in our benchmark since they return way more solutions.}
\begin{figure}[!tbp]
  \includegraphics[width=\linewidth]{Nsol.jpg}
  \caption{\textcolor{red}{Number of unique secondary structures in the Pareto set or returned ensemble. Data is from the RNAStrand dataset. Pseudoknots are allowed. Note that this is not the size of the Pareto set, one unique secondary structure can often be found several times in the Pareto set, with different module combinations inserted in. The default RNA-MoIP usage, labelled "chunk" here, always returns only one solution selected by itself.}}
  \label{fig:nsol}
\end{figure}
\begin{figure}[!tbp]
  \includegraphics[width=\linewidth]{Nmotifs.jpg}
  \caption{\textcolor{red}{(A) Maximum number of modules inserted in a solution of the set. (B) Ratio of the number of modules inserted in the solution which is closest to the true structure (i.e. the max MCC solution) and the maximum number of inserted modules in a solution. Only RNAs for which the Pareto set contains more than one solution are counted (the count is given under the distribution). Data is from the RNAStrand dataset. Pseudoknots are allowed.}}
  \label{fig:info}
\end{figure}
\begin{figure*}[!tbp]
  \includegraphics[width=\linewidth]{kernels_A.jpg}
  \caption{\textcolor{red}{Position of the best solution in the Pareto set for 4 variants using objective $f_{1A}$, after normalization on both axis. Computed on the RNA-Strand dataset with pseudoknots allowed. RNAs for which the Pareto set is a single optimal solution, or a set of solutions with the same secondary structure but different modules inserted into it, are ignored. The final number of considered RNAs is given on the figure.}}
  \label{fig:pareto}
\end{figure*}
\textcolor{red}{
\subsection{Usefulness of the two objectives}
To further study the RNA modules criteria usefulness, we looked at the maximum number of inserted modules over the Pareto solutions proposed for each RNA (Figure~\ref{fig:info}(A)), and the ratio between the number of inserted modules in the best prediction in the Pareto set (the max MCC solution) and this maximum number (Figure~\ref{fig:info}(B)).
When this ratio is 1, the best solution is the one with the most modules detected in (or one of the solutions with the most modules). The results show that in large majority, this ratio is 1. 
In particular, the variants which use direct pattern matching can insert more modules in average (up to 12) and this ratio is always at one. This is encouraging, showing that the best solutions do contain a lot of modules inserted.\\
Then, we display a solution space plot where we report the position of the best solution in the Pareto set after a normalization step, to directly observe where the best solutions are located in the MEA and RNA modules bi-objective plane. Variants which use $f_{1A}$ are presented on Figure \ref{fig:pareto}, plots for the other criteria are available in Supplementary Section F, they are close to what happens with $f_{1A}$. We only consider Pareto sets for which there exist more than one secondary structure solution in. 
First, we observe that the more Pareto sets we eliminate for this reason, the more the criteria are correlated, resulting in a single optimal solution. For example, we can say that $f_{1A}$ when used with JAR3D is correlated to MEA because in 312 cases on 344 an optimal secondary structure is found (most right plot on Figure~\ref{fig:pareto}). The eight variants which use BayesPairing are the less correlated with MEA, resulting in well spread positions of the best solution across the solution space. 
But the most interesting variant is Rna3Dmotifs + direct pattern-matching + $f_{1A}$, which performs the best. When several secondary structures are proposed, the best one is more often on the module criteria side than the MEA (most left plot on Figure~\ref{fig:pareto}; most of the best solutions have an abscissa of 1.0 or close). Therefore in this variant's case, when the criteria are not perfectly correlated, $f_{1A}$ is able to select the good solutions which are not the best on MEA.
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Discussion}
\paragraph{Comparison to RNA-MoIP} ~  An interesting point is the improvement between RNA-MoIP and our bi-objective variant which uses direct-pattern matching to spot insertion sites and $f_{1A}$ to score the insertions. This variant only differs from RNA-MoIP because it is bi-objective.  
Then, the Pareto approach really improves the structure prediction by itself. This result supports our hypothesis about RNA-MoIP breaking important basepairs.

\paragraph{\textcolor{red}{About pseudoknots predictions}} ~ 
The support of pseudoknots \textcolor{red}{and a lower number of solutions returned are the real interesting features} about BiORSEO. \textcolor{red}{It returns several} solutions, some with pseudoknots, and some without. As we are looking at the max MCC here, the appropriate solution is selected for each RNA. \textcolor{red}{Fortunately, the number of solutions returned is always smaller than the state-of-the-art tools. This makes the use of the maximum MCC acceptable as a metric.}
However, pseudoknot prediction quality is still difficult to assess with a metric like MCC, because a pseudoknot could be involving only a few base-pairs. Finding them or not does not alter much the MCC even if the structure is much more right or wrong from a biological point of view. Unfortunately, no automated verification method exists yet to our knowledge. \textcolor{red}{We illustrate this issue with the G riboswitch pseudoknot example in Supplementary Sections D2 and D3. The pseudoknot is sometimes found, but not with the exact same list of basepairs, which is penalized by the MCC.} 
\paragraph{\textcolor{red}{About RNA modules}}
\textcolor{red}{
On one side, the state-of-the-art~(\citealp{reinharz_towards_2012, theis2015rna}), the number of inserted modules in the best solutions (Figure \ref{fig:info}), and the position of the best solutions in the Pareto sets (Figure \ref{fig:pareto}) argue that criteria related to known modules are relevant. They should bring information from data to assist the theoretical model. But on the other side, Biokop is  \textit{in fine} equal or above BiORSEO in terms of performance, while Biokop does not use modules. The simplest explanation is that the MFE criterion it uses is important, and we lost information when we replaced it by a module criterion in BiORSEO.
}

\paragraph{On the objective functions} ~ 
Regarding objective functions to include modules, the different criteria proposed seem to give comparable results at first sight regarding the average performance and the dispersion. However, an important difference between $f_{1A}$, $f_{1B}$ on one side, and $f_{1C}$, $f_{1D}$ on the other side, is about the computation time. As $f_{1A}$, $f_{1B}$ do not use a score to rank potential module insertion sites, every module of the same size can be equally inserted. When the RNA presents several loops, the combinatorial possibilities grow fast with the number of modules in the dataset. Therefore, the number of undominated solutions can reach several hundreds or thousands even for short sequences. Such large Pareto sets are not informative for our application, because they consist \textcolor{red}{of} very redundant secondary structures with different module references, which are counted only for one secondary structure solution at the end. On the other hand, $f_{1C}$ and $f_{1D}$ require the run of an additional tool (JAR3D or BayesPairing) to score the insertion sites. Given an RNA, a compromise must be found according to its length and amount of loops.

\paragraph{The bias with JAR3D} ~ One should keep in mind that JAR3D takes as input the sequences of RNA loops to score modules against them. We detect the loops in the RNA sequence with RNAsubopt. This use of JAR3D is biased, since we score modules on sequence portions that we already know likely to form loops and  unlikely to form stems, so the information brought by a module insertion is low. \textcolor{red}{This is why we find it correlated to MEA.}

\paragraph{About BayesPairing} ~ Please remind that we skipped BayesPairing's last step which checks if the insertion of a module would deteriorate too much the energy. We did so because our bi-objective framework was supposed to be able to do it itself. The low performance of BayesPairing in the benchmark is not an argument against it when used in its intended purpose.

\paragraph{Computation times and material limits} \label{sec:mat}~ 
As evocated in Figure \ref{fig:benchmark}, the methods were not all able to fold all the sequences. The missing ones often are from a combination of direct pattern-matching or JAR3D with $f_{1A}$ or $f_{1B}$. \textcolor{red}{On the other hand, direct-pattern matching with $f_{1B}$ and in particular $f_{1A}$ also are the fastest methods. Computation time examples are provided in the case study in Supplementary Section D.}
The RAM, not the time, typically limits the size of the RNAs the methods can process. RNAs up to 230 bases are fine \textcolor{red}{on our workstation with 16GB of RAM}. A prediction typically takes a few seconds, sometimes minutes. The time required grows with both the nucleotide count, the number of similar loops, and the number of similar modules insertible on those similar loops. The objective functions $f_{1A}$ and $f_{1B}$, which are not sequence-dependant, were sometimes not discriminative enough and equally ranked a large number of combinatorial ordering variations of the same modules on the same loops. For that reason, we had to arbitrarily stop the jobs exceeding 500 structures in the Pareto set, because they would require several hours to complete, leading to incomplete dataset predictions.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\vspace{-0.5cm}
\section{Conclusion}
We developed a general bi-objective method to benchmark different sources of RNA modules models (the RNA 3D Motif Atlas and Rna3Dmotif), different methods to place them in sequences (direct pattern matching, BayesPairing, and JAR3D), and different scoring functions. The bi-objective method uses \textcolor{red}{a theoretical criterion,} the expected accuracy of the structure, and \textcolor{red}{a data-based criterion, one of} the previous scoring functions, to select relevant secondary structures.

Our models \textcolor{red}{out-perform} RNA-MoIP, a previous attempt to predict better secondary structures using module information and a linear combination of two objectives into a scoring function. Our simplest best-performing new method \textcolor{red}{uses Rna3Dmotifs placed in sequences by regular expression searches and scored by module size. It} could be interpreted as an upgraded RNA-MoIP with a real bi-objective framework, which predicts the base pairs and the module insertions in a row, preventing the insertion to break important base-pairs. \textcolor{red}{However, no module model distinguishes itself from the other. Regarding module detection methods, JAR3D seems biased and results in correlated criteria. The use of direct pattern matching is simple and efficient but leads to combinatorial issues on some RNAs, because we cannot score the insertion sites so that one beats the other candidates. Even if no variant of the method has a better predictive performance than Biokop in average (which performs the same or better using MFE but no module information), the module criterion improves the prediction in  17\% of the cases.} \textcolor{red}{In a future work, we should then develop a tri-objective optimization program including MFE as another criterion.}

Improvement perspectives relies on the hope newer databases like CaRNAval~(\citealp{reinharz2018mining}) that contain more recent and more diverse module models \textcolor{red}{including real pseudoknotted interaction networks},
bring more information to assist the energy criteria. 

\begin{thebibliography}{25}
   \bibitem[Andronescu {\em et~al.}(2008)Andronescu, Bereg, Hoos, and
     Condon]{andronescu2008rna}
   Andronescu, M., Bereg, V., Hoos, H.~H., and Condon, A. (2008).
   \newblock Rna strand: the {RNA} secondary structure and statistical analysis
     database.
   \newblock {\em BMC bioinformatics\/}, {\bf 9}(1), 340.
   
   \bibitem[Bellaousov and Mathews(2010)Bellaousov and
     Mathews]{bellaousov2010probknot}
   Bellaousov, S. and Mathews, D.~H. (2010).
   \newblock Probknot: fast prediction of {RNA} secondary structure including
     pseudoknots.
   \newblock {\em RNA\/}, {\bf 16}(10), 1870--1880.
   
   \bibitem[Chojnowski {\em et~al.}(2014)Chojnowski, Wale{\'n}, and
     Bujnicki]{chojnowski2014rna}
   Chojnowski, G., Wale{\'n}, T., and Bujnicki, J.~M. (2014).
   \newblock Rna bricks-a database of {RNA} 3d motifs and their interactions.
   \newblock {\em Nucleic acids research\/}, {\bf 42}(D1), D123--D131.
   
   \bibitem[Cruz and Westhof(2011)Cruz and Westhof]{cruz2011sequence}
   Cruz, J.~A. and Westhof, E. (2011).
   \newblock Sequence-based identification of 3d structural modules in {RNA} with
     rmdetect.
   \newblock {\em Nature methods\/}, {\bf 8}(6), 513.
   
   \bibitem[Dirks and Pierce(2004)Dirks and
     Pierce]{dirksAlgorithmComputingNucleic2004}
   Dirks, R.~M. and Pierce, N.~A. (2004).
   \newblock An algorithm for computing nucleic acid base-pairing probabilities
     including pseudoknots.
   \newblock {\em Journal of Computational Chemistry\/}, {\bf 25}(10), 1295--1304.
   
   \bibitem[Djelloul and Denise(2008)Djelloul and Denise]{djelloul_automated_2008}
   Djelloul, M. and Denise, A. (2008).
   \newblock Automated motif extraction and classification in {RNA} tertiary
     structures.
   \newblock {\em RNA\/}, {\bf 14}(12), 2489--2497.

   \bibitem[Ge {\em et~al.}(2018)Ge, Islam, Zhong and Zhang]{ge2018novo}
   Ge, P., Islam, S., Zhong, C., and Zhang, S. (2018). 
   \newblock De novo discovery of structural motifs in RNA 3D structures through clustering. 
   \newblock {\em Nucleic acids research\/}, {\bf 46}(9), 4783--4793.
   
   \bibitem[Gendron {\em et~al.}(2001)Gendron, Lemieux, and
     Major]{gendron2001quantitative}
   Gendron, P., Lemieux, S., and Major, F. (2001).
   \newblock Quantitative analysis of nucleic acid three-dimensional structures.
   \newblock {\em Journal of molecular biology\/}, {\bf 308}(5), 919--936.
      
   \bibitem[Legendre {\em et~al.}(2018)Legendre, Angel, and
     Tahi]{legendre_bi-objective_2018}
   Legendre, A., Angel, E., and Tahi, F. (2018).
   \newblock Bi-objective integer programming for {RNA} secondary structure
     prediction with pseudoknots.
   \newblock {\em BMC Bioinformatics\/}, {\bf 19}(1), 13.
   
   \bibitem[Leontis and Westhof(2001)Leontis and Westhof]{leontis2001geometric}
   Leontis, N.~B. and Westhof, E. (2001).
   \newblock Geometric nomenclature and classification of {RNA} base pairs.
   \newblock {\em RNA\/}, {\bf 7}(4), 499--512.
      
   \bibitem[Lorenz {\em et~al.}(2011b)Lorenz, Bernhart, Zu~Siederdissen, Tafer,
     Flamm, Stadler, and Hofacker]{lorenz2011viennarna}
   Lorenz, R., Bernhart, S.~H., Zu~Siederdissen, C.~H., Tafer, H., Flamm, C.,
     Stadler, P.~F., and Hofacker, I.~L. (2011b).
   \newblock ViennaRNA Package 2.0.
   \newblock {\em Algorithms for Molecular Biology\/}, {\bf 6}(1), 26.
   
   \bibitem[Lu {\em et~al.}(2015)Lu, Bussemaker, and Olson]{lu_dssr:_2015}
   Lu, X.-J., Bussemaker, H.~J., and Olson, W.~K. (2015).
   \newblock {DSSR}: an integrated software tool for dissecting the spatial
     structure of {RNA}.
   \newblock {\em Nucleic Acids Research\/}, {\bf 43}(21), e142--e142.
   
   \bibitem[Mathews(2004)Mathews]{mathews2004using}
   Mathews, D.~H. (2004).
   \newblock Using an {RNA} secondary structure partition function to determine
     confidence in base pairs predicted by free energy minimization.
   \newblock {\em RNA\/}, {\bf 10}(8), 1178--1190.
   
   \bibitem[McCaskill(1990)McCaskill]{mccaskill1990equilibrium}
   McCaskill, J.~S. (1990).
   \newblock The equilibrium partition function and base pair binding
     probabilities for {RNA} secondary structure.
   \newblock {\em Biopolymers: Original Research on Biomolecules\/}, {\bf
     29}(6-7), 1105--1119.
   
   \bibitem[Parisien and Major(2008)Parisien and Major]{parisien2008mc}
   Parisien, M. and Major, F. (2008).
   \newblock The mc-fold and mc-sym pipeline infers {RNA} structure from sequence
     data.
   \newblock {\em Nature\/}, {\bf 452}(7183), 51.
   
   \bibitem[Petrov {\em et~al.}(2013)Petrov, Zirbel, and
     Leontis]{petrov_automated_2013}
   Petrov, A.~I., Zirbel, C.~L., and Leontis, N.~B. (2013).
   \newblock Automated classification of {RNA} 3d motifs and the {RNA} 3d {Motif}
     {Atlas}.
   \newblock {\em RNA\/}, {\bf 19}(10), 1327--1340.
   
   \bibitem[Reinharz {\em et~al.}(2012)Reinharz, Major, and
     Waldisp{\"u}hl]{reinharz_towards_2012}
   Reinharz, V., Major, F., and Waldisp{\"u}hl, J. (2012).
   \newblock Towards 3d structure prediction of large {RNA} molecules: an integer
     programming framework to insert local 3d motifs in {RNA} secondary structure.
   \newblock {\em Bioinformatics\/}, {\bf 28}(12), i207--i214.
   
   \bibitem[Reinharz {\em et~al.}(2018)Reinharz, Soul{\'e}, Westhof,
     Waldisp{\"u}hl, and Denise]{reinharz2018mining}
   Reinharz, V., Soul{\'e}, A., Westhof, E., Waldisp{\"u}hl, J., and Denise, A.
     (2018).
   \newblock Mining for recurrent long-range interactions in {RNA} structures
     reveals embedded hierarchies in network families.
   \newblock {\em Nucleic Acids Research\/}, {\bf 46}(8), 3841--3851.
   
   \bibitem[Sarrazin-Gendron {\em et~al.}(2019)Sarrazin-Gendron, Reinharz, Oliver,
     Moitessier, and Waldisp{\"u}hl]{sarrazin2019automated}
   Sarrazin-Gendron, R., Reinharz, V., Oliver, C.~G., Moitessier, N., and
     Waldisp{\"u}hl, J. (2019).
   \newblock Automated, customizable and efficient identification of 3d base pair
     modules with bayespairing.
   \newblock {\em Nucleic acids research\/}.
   
   \bibitem[Sarver {\em et~al.}(2008)Sarver, Zirbel, Stombaugh, Mokdad, and
     Leontis]{sarver_fr3d:_2008}
   Sarver, M., Zirbel, C.~L., Stombaugh, J., Mokdad, A., and Leontis, N.~B.
     (2008).
   \newblock {FR}3d: finding local and composite recurrent structural motifs in
     {RNA} 3d structures.
   \newblock {\em Journal of Mathematical Biology\/}, {\bf 56}(1), 215--252.
   
   \bibitem[Sato {\em et~al.}(2011)Sato, Kato, Hamada, Akutsu, and
     Asai]{sato_ipknot:_2011}
   Sato, K., Kato, Y., Hamada, M., Akutsu, T., and Asai, K. (2011).
   \newblock {IPknot}: fast and accurate prediction of {RNA} secondary structures
     with pseudoknots using integer programming.
   \newblock {\em Bioinformatics\/}, {\bf 27}(13), i85--i93.
   
   \bibitem[Schlick(2018)Schlick]{schlick2018adventures}
   Schlick, T. (2018).
   \newblock Adventures with {RNA} graphs.
   \newblock {\em Methods\/}, {\bf 143}, 16--33.
   
   \bibitem[Theis {\em et~al.}(2013)Theis, Zu Siederdissen, Hofacker and Gorodkin]{theis2013automated}
   Theis, C., Zu Siederdissen, C., Hofacker, I. L., and Gorodkin, J. (2013). 
   \newblock Automated identification of RNA 3D modules with discriminative power in RNA structural alignments. 
   \newblock {\em Nucleic acids research\/}, {\bf 41}(22), 9999--10009.

   \bibitem[Theis {\em et~al.}(2015)Theis, Zirbel, Zu Siederdisse, Anthon, Hofacker, Nielsen and Gorodkin]{theis2015rna}
   Theis, C., Zirbel, C. L., Zu Siederdissen, C. H., Anthon, C., Hofacker, I. L., Nielsen, H., and Gorodkin, J. (2015). 
   \newblock RNA 3D modules in genome-wide predictions of RNA 2D structure. 
   \newblock {\em PloS One}, {\bf 10}(10), e0139900.

   \bibitem[Van~Batenburg {\em et~al.}(2000)Van~Batenburg, Gultyaev, Pleij, Ng,
     and Oliehoek]{van2000pseudobase}
   Van~Batenburg, F., Gultyaev, A.~P., Pleij, C., Ng, J., and Oliehoek, J. (2000).
   \newblock Pseudobase: a database with rna pseudoknots.
   \newblock {\em Nucleic Acids Research\/}, {\bf 28}(1), 201--204.
   
   \bibitem[Zirbel {\em et~al.}(2015)Zirbel, Roll, Sweeney, Petrov, Pirrung, and
     Leontis]{zirbel_identifying_2015}
   Zirbel, C.~L., Roll, J., Sweeney, B.~A., Petrov, A.~I., Pirrung, M., and
     Leontis, N.~B. (2015).
   \newblock Identifying novel sequence variants of {RNA} 3d motifs.
   \newblock {\em Nucleic Acids Research\/}, {\bf 43}(15), 7504--7520.
   
   \end{thebibliography}
   

\end{document}