Revision 1 for Bioinformatics completed

Louis BECQUEY
Commit 7196427d1340f4cf45c6878c26e440a111e7e55d 7196427d 1 parent d266c643
Showing 7 changed files with 2255 additions and 527 deletions
.gitignore
README.md
RNAnet.py
known_issues.txt
known_issues_reasons.txt
regression.py
statistics.py
--- a/.gitignore
View file @7196427
+++ b/.gitignore
View file @7196427
@@ -13,3 +13,4 @@ esl*
 # environment stuff
 .vscode/
 *.pyc
+__pycache__/
\ No newline at end of file
--- a/README.md
View file @7196427
+++ b/README.md
View file @7196427
@@ -94,6 +94,8 @@ The detailed list of options is below:
 -h [ --help ]                   Print this help message
 --version                       Print the program version
+-f [ --full-inference ]         Infer new 3D->family mappings even if Rfam already provides some. Yields more copies of chains
+                                mapped to different families.
 -r 4.0 [ --resolution=4.0 ]     Maximum 3D structure resolution to consider an RNA chain.
 -s                              Run statistics computations after completion
 --extract                       Extract the portions of 3D RNA chains to individual mmCIF files.
@@ -105,7 +107,7 @@ The detailed list of options is below:
                                         RNAcifs/                Full structures containing RNA, in mmCIF format
                                         rna_mapped_to_Rfam/     Extracted 'pure' RNA chains
                                         datapoints/             Final results in CSV file format.
---seq-folder=…                  Path to a folder to store the sequence and alignment files.
+--seq-folder=…                  Path to a folder to store the sequence and alignment files. Subfolders will be:
                                         rfam_sequences/fasta/   Compressed hits to Rfam families
                                         realigned/              Sequences, covariance models, and alignments by family
 --no-homology                   Do not try to compute PSSMs and do not align sequences.
@@ -117,11 +119,12 @@ The detailed list of options is below:
 --update-homologous             Re-download Rfam and SILVA databases, realign all families, and recompute all CSV files
 --from-scratch                  Delete database, local 3D and sequence files, and known issues, and recompute.
 --archive                       Create a tar.gz archive of the datapoints text files, and update the link to the latest archive
+--no-logs                       Do not save per-chain logs of the numbering modifications
 ```
 Typical usage:
 ```
-nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &
+nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &
 ```
 ## Post-computation task: estimate quality
--- a/RNAnet.py
View file @7196427
+++ b/RNAnet.py
View file @7196427
 #!/usr/bin/python3.8
+import Bio
+import concurrent.futures
+import getopt
+import gzip
+import io
+import json
 import numpy as np
+import os
 import pandas as pd
-import concurrent.futures, getopt, gzip, io, json, os, pickle, psutil, re, requests, signal, sqlalchemy, sqlite3, subprocess, sys, time, traceback, warnings
+import pickle
-from Bio import AlignIO, SeqIO
+import psutil
-from Bio.PDB import MMCIFParser
+import re
-from Bio.PDB.mmcifio import MMCIFIO
+import requests
-from Bio.PDB.MMCIF2Dict import MMCIF2Dict 
+import signal
-from Bio.PDB.PDBExceptions import PDBConstructionWarning, BiopythonWarning
+import sqlalchemy
-from Bio.PDB.Dice import ChainSelector
+import sqlite3
-from Bio.Alphabet import generic_rna
+import subprocess
-from Bio.Seq import Seq
+import sys
-from Bio.SeqRecord import SeqRecord
+import time
-from Bio.Align import MultipleSeqAlignment, AlignInfo
+import traceback
-from collections import OrderedDict, defaultdict
+import warnings
 from functools import partial, wraps
-from os import path, makedirs
+from multiprocessing import Pool, Manager
-from multiprocessing import Pool, Manager, set_start_method
 from time import sleep
 from tqdm import tqdm
 from setproctitle import setproctitle
+
 def trace_unhandled_exceptions(func):
     @wraps(func)
     def wrapped_func(*args, **kwargs):
@@ -36,10 +43,11 @@ def trace_unhandled_exceptions(func):
             print(s)
     return wrapped_func
+
 pd.set_option('display.max_rows', None)
 sqlite3.enable_callback_tracebacks(True)
 sqlite3.register_adapter(np.int64, lambda val: int(val))        # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int
-sqlite3.register_adapter(np.float64, lambda val: float(val))    # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int
+sqlite3.register_adapter(np.float64, lambda val: float(val))    # Tell Sqlite what to do with <class numpy.float64> objects ---> convert to float
 m = Manager()
 running_stats = m.list()
@@ -52,11 +60,14 @@ validsymb = '\U00002705'
 warnsymb = '\U000026A0'
 errsymb = '\U0000274C'
-LSU_set = {"RF00002", "RF02540", "RF02541", "RF02543", "RF02546"}   # From Rfam CLAN 00112
+LSU_set = {"RF00002", "RF02540", "RF02541",
-SSU_set = {"RF00177", "RF02542",  "RF02545", "RF01959", "RF01960"}  # From Rfam CLAN 00111
+           "RF02543", "RF02546"}   # From Rfam CLAN 00112
+SSU_set = {"RF00177", "RF02542",  "RF02545",
+           "RF01959", "RF01960"}  # From Rfam CLAN 00111
 no_nts_set = set()
 weird_mappings = set()
+
 class SelectivePortionSelector(object):
     """Class passed to MMCIFIO to select some chain portions in an MMCIF file.
@@ -101,7 +112,7 @@ class SelectivePortionSelector(object):
         return 1
-class BufferingSummaryInfo(AlignInfo.SummaryInfo):
+class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo):
     def get_pssm(self, family, index):
         """Create a position specific score matrix object for the alignment. 
@@ -128,7 +139,7 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo):
                         score_dict[this_residue] = 1.0
             pssm_info.append(('*', score_dict))
-        return AlignInfo.PSSM(pssm_info)
+        return Bio.Align.AlignInfo.PSSM(pssm_info)
 class Chain:
@@ -187,11 +198,11 @@ class Chain:
         with warnings.catch_warnings():
             # Ignore the PDB problems. This mostly warns that some chain is discontinuous.
-            warnings.simplefilter('ignore', PDBConstructionWarning)  
+            warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
-            warnings.simplefilter('ignore', BiopythonWarning)  
+            warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
             # Load the whole mmCIF into a Biopython structure object:
-            mmcif_parser = MMCIFParser()
+            mmcif_parser = Bio.PDB.MMCIFParser()
             try:
                 s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif")
             except ValueError as e:
@@ -212,7 +223,7 @@ class Chain:
             sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm)
             # Save that selection on the mmCIF object s to file
-            ioobj = MMCIFIO()
+            ioobj = Bio.PDB.mmcifio.MMCIFIO()
             ioobj.set_structure(s)
             ioobj.save(self.file, sel)
@@ -253,7 +264,7 @@ class Chain:
             # Create the Pandas DataFrame for the nucleotides of the right chain
             nts = json_object["nts"]                        # sub-json-object
             df = pd.DataFrame(nts)                          # conversion to dataframe
-            df = df[ df.chain_name == self.pdb_chain_id ]    # keeping only this chain's nucleotides
+            df = df[df.chain_name == self.pdb_chain_id]     # keeping only this chain's nucleotides
             # Assert nucleotides of the chain are found
             if df.empty:
@@ -266,12 +277,12 @@ class Chain:
             # Remove low pertinence or undocumented descriptors, convert angles values
             cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta",
                             "epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base",
-                "v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering" ]
+                            "v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering"]
             df = df[cols_we_keep]
-            df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians
+            df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4',  # Conversion to radians
-                        'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] *= np.pi/180.0
+                       'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] *= np.pi/180.0
-            df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4', # mapping [-pi, pi] into [0, 2pi]
+            df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4',  # mapping [-pi, pi] into [0, 2pi]
-                        'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] %= (2.0*np.pi)
+                       'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] %= (2.0*np.pi)
         except KeyError as e:
             warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True)
@@ -295,14 +306,14 @@ class Chain:
         # Duplicate residue numbers : shift numbering
         while True in df.duplicated(['nt_resnum']).values:
             i = df.duplicated(['nt_resnum']).values.tolist().index(True)
-            duplicates = df[df.nt_resnum == df.iloc[i,1]]
+            duplicates = df[df.nt_resnum == df.iloc[i, 1]]
             n_dup = len(duplicates.nt_resnum)
             index_last_dup = duplicates.index_chain.iloc[-1] - 1
             if self.mapping is not None:
                 self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}")
             try:
-                if i > 0 and index_last_dup +1 < len(df.index) and df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
+                if i > 0 and index_last_dup + 1 < len(df.index) and df.iloc[i, 1] == df.iloc[i-1, 1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
                     # The redundant nts are consecutive in the chain (at the beginning at least), and there is a gap at the end
                     if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
@@ -314,15 +325,15 @@ class Chain:
                     else:
                         # We solve the problem continuous component by continuous component
                         for j in range(1, n_dup+1):
-                            if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
+                            if duplicates.iloc[j, 0] == 1 + duplicates.iloc[j-1, 0]:  # continuous
-                                df.iloc[i+j-1,1] += 1
+                                df.iloc[i+j-1, 1] += 1
                             else:
                                 break
-                elif df.iloc[i,1] == df.iloc[i-1,1]:
+                elif df.iloc[i, 1] == df.iloc[i-1, 1]:
                     # Common 4v9q-DV case (and similar ones) : e.g. the chain contains 17 and 17A, which are both read as 17 by DSSR.
                     # Solution : we shift the numbering of 17A (to 18) and the following residues.
                     df.iloc[i:, 1] += 1
-                elif duplicates.iloc[0,0] == 1 and df.iloc[i,0] == 3:
+                elif duplicates.iloc[0, 0] == 1 and df.iloc[i, 0] == 3:
                     # 4wzo_1_1J case, there is a residue numbered -1 and read as 1 before the number 0.
                     df.iloc[1:, 1] += 1
                     df.iloc[0, 1] = 0
@@ -340,12 +351,16 @@ class Chain:
         # Search for ligands at the end of the selection
         # Drop ligands detected as residues by DSSR, by detecting several markers
-        while ( len(df.index_chain) and df.iloc[-1,2] not in ["A", "C", "G", "U"] and (
+        while ( 
-                        (df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all()
+                len(df.index_chain) and df.iloc[-1, 2] not in ["A", "C", "G", "U"] 
-                        or (df.iloc[[-1]].puckering=='').any()
+                and (
+                    (df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon",
+                                    "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all()
+                    or (df.iloc[[-1]].puckering == '').any()
                 )
-                or  (   len(df.index_chain) >= 2 and df.iloc[-1,1] > 50 + df.iloc[-2,1]    ) # large nt_resnum gap between the two last residues
+                # large nt_resnum gap between the two last residues
-                or  (   len(df.index_chain) and df.iloc[-1,2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"]   )
+                or (len(df.index_chain) >= 2 and df.iloc[-1, 1] > 50 + df.iloc[-2, 1])
+                or (len(df.index_chain) and df.iloc[-1, 2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"])
         ):
             if self.mapping is not None:
                 self.mapping.log("Droping ligand:")
@@ -390,17 +405,19 @@ class Chain:
                         break
                 if found:
                     self.mapping.log(f"Residue {i+1+self.mapping.st}-{self.mapping.st} = {i+1} has been saved and renumbered {df.iloc[i,1]} instead of {found['nt_id'].replace(found['chain_name']+ '.' + found['nt_name'], '').replace('^','')}")
-                    df_row = pd.DataFrame([found], index=[i])[df.columns.values]
+                    df_row = pd.DataFrame([found], index=[i])[
-                    df_row.iloc[0,0] = i+1          # index_chain
+                        df.columns.values]
-                    df_row.iloc[0,1] = df.iloc[i,1] # nt_resnum
+                    df_row.iloc[0, 0] = i+1          # index_chain
-                    df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ])
+                    df_row.iloc[0, 1] = df.iloc[i, 1]  # nt_resnum
+                    df = pd.concat([df.iloc[:i], df_row, df.iloc[i:]])
                     df.iloc[i+1:, 1] += 1
                 else:
                     warn(f"Missing index_chain {i} in {self.chain_label} !")
         # Assert some nucleotides still exist
         try:
-            l = df.iloc[-1,1] - df.iloc[0,1] + 1    # update length of chain from nt_resnum point of view
+            # update length of chain from nt_resnum point of view
+            l = df.iloc[-1, 1] - df.iloc[0, 1] + 1
         except IndexError:
             warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.mapping.nt_start} and "
                  f"{self.mapping.nt_end} ({'not ' if not self.mapping.inferred else ''}inferred). Ignoring chain {self.chain_label}.")
@@ -426,14 +443,17 @@ class Chain:
         # index_chain            1 |-------------|77 83|------------|  154
         # expected data point    1 |--------------------------------|  154
         #
+        
         if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l
-            resnum_start = df.iloc[0,1]
+            resnum_start = df.iloc[0, 1]
-            diff = set(range(l)).difference(df['nt_resnum'] - resnum_start)     # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start)
+            # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start)
+            diff = set(range(l)).difference(df['nt_resnum'] - resnum_start)
             for i in sorted(diff):
                 # Add a row at position i
-                df = pd.concat([    df.iloc[:i], 
+                df = pd.concat([df.iloc[:i],
-                                    pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start, "nt_id":"not resolved", "nt_code":'-', "nt_name":'-'}, index=[i]), 
+                                pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start,
-                                    df.iloc[i:]       ])
+                                              "nt_id": "not resolved", "nt_code": '-', "nt_name": '-'}, index=[i]),
+                                df.iloc[i:]])
                 # Increase the index_chain of all following lines
                 df.iloc[i+1:, 0] += 1
             df = df.reset_index(drop=True)
@@ -444,27 +464,27 @@ class Chain:
         #######################################
         # Add a sequence column just for the alignments
-        df['nt_align_code'] = [ str(x).upper()
+        df['nt_align_code'] = [str(x).upper()
                                .replace('NAN', '-') # Unresolved nucleotides are gaps
                                .replace('?', '-')   # Unidentified residues, let's delete them
                                .replace('T', 'U')   # 5MU are modified to t, which gives T
                                .replace('P', 'U')   # Pseudo-uridines, but it is not really right to change them to U, see DSSR paper, Fig 2
-                                for x in df['nt_code'] ]
+                               for x in df['nt_code']]
         # One-hot encoding sequence
-        df["is_A"] = [ 1 if x=="A" else 0 for x in df["nt_code"] ]
+        df["is_A"] = [1 if x == "A" else 0 for x in df["nt_code"]]
-        df["is_C"] = [ 1 if x=="C" else 0 for x in df["nt_code"] ]
+        df["is_C"] = [1 if x == "C" else 0 for x in df["nt_code"]]
-        df["is_G"] = [ 1 if x=="G" else 0 for x in df["nt_code"] ]
+        df["is_G"] = [1 if x == "G" else 0 for x in df["nt_code"]]
-        df["is_U"] = [ 1 if x=="U" else 0 for x in df["nt_code"] ]
+        df["is_U"] = [1 if x == "U" else 0 for x in df["nt_code"]]
-        df["is_other"] = [ 0 if x in "ACGU" else 1 for x in df["nt_code"] ]
+        df["is_other"] = [0 if x in "ACGU" else 1 for x in df["nt_code"]]
         df["nt_position"] = [ float(i+1)/self.full_length for i in range(self.full_length) ]
         # Iterate over pairs to identify base-base interactions
         res_ids = list(df['nt_id'])  # things like "chainID.C4, chainID.U5"
-        paired = [ '' ] * self.full_length
+        paired = [''] * self.full_length
-        pair_type_LW = [ '' ] * self.full_length
+        pair_type_LW = [''] * self.full_length
-        pair_type_DSSR = [ '' ] * self.full_length
+        pair_type_DSSR = [''] * self.full_length
-        interacts = [ 0 ] * self.full_length
+        interacts = [0] * self.full_length
         if "pairs" in json_object.keys():
             pairs = json_object["pairs"]
             for p in pairs:
@@ -506,17 +526,19 @@ class Chain:
                         paired[nt2_idx] += ',' + str(nt1_idx + 1)
         # transform nt_id to shorter values
-        df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^','').replace('/','') for n, name in zip(df.nt_id, df.nt_name) ]
+        df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^', '').replace('/', '') for n, name in zip(df.nt_id, df.nt_name) ]
         df['paired'] = paired
         df['pair_type_LW'] = pair_type_LW
         df['pair_type_DSSR'] = pair_type_DSSR
         df['nb_interact'] = interacts
-        df = df.drop(['nt_id', 'nt_resnum'], axis=1) # remove now useless descriptors
+
+        # remove now useless descriptors
+        df = df.drop(['nt_id', 'nt_resnum'], axis=1)
         self.seq = "".join(df.nt_code)
         self.seq_to_align = "".join(df.nt_align_code)
-        self.length = len([ x for x in self.seq_to_align if x != "-" ])
+        self.length = len([x for x in self.seq_to_align if x != "-"])
         # Remove too short chains
         if self.length < 5:
@@ -559,7 +581,8 @@ class Chain:
                                                                 WHERE structure_id='{self.pdb_id}' 
                                                                 AND chain_name='{self.pdb_chain_id}' 
                                                                 AND rfam_acc='{self.mapping.rfam_acc}'
-                                                    AND eq_class='{self.eq_class}';""")[0][0]
+                                                                AND eq_class='{self.eq_class}';"""
+                                                    )[0][0]
             else:
                 sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, 'unmappd', ?, ?) 
                                    ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""",
@@ -568,19 +591,18 @@ class Chain:
                                                                 WHERE structure_id='{self.pdb_id}' 
                                                                 AND chain_name='{self.pdb_chain_id}' 
                                                                 AND eq_class='{self.eq_class}'
-                                                    AND rfam_acc = 'unmappd';""")[0][0]
+                                                                AND rfam_acc = 'unmappd';"""
+                                                    )[0][0]
             # Add the nucleotides if the chain is not an issue
             if df is not None and not self.delete_me:   # double condition is theoretically redundant here, but you never know
-                sql_execute(conn, f"""
+                sql_execute(conn, f"""INSERT OR IGNORE INTO nucleotide 
-                INSERT OR IGNORE INTO nucleotide 
                                         (chain_id, index_chain, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta,
                                         epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
                                         v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position, 
                                         old_nt_resnum, paired, pair_type_LW, pair_type_DSSR, nb_interact)
                                         VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
-                    ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
+                                            ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
-                    ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""", 
                             many=True, data=list(df.to_records(index=False)), warn_every=10)
     def remap(self, columns_to_save, s_seq):
@@ -598,40 +620,39 @@ class Chain:
         # Save columns in the appropriate positions
         i = 0
         j = 0
-        while i<self.full_length and j<alilen:
+        while i < self.full_length and j < alilen:
             # Here we try to map self.seq_to_align (the sequence of the 3D chain, including gaps when residues are missing),
             # with s_seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and .
             if self.seq_to_align[i] == s_seq[j].upper():    # alignment and sequence correspond (incl. gaps)
-                re_mappings.append( (self.db_chain_id, i+1, j+1) ) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1.
+                re_mappings.append((self.db_chain_id, i+1, j+1))    # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1.
                 columns_to_save.add(j+1)    # it's a set, duplicates are automatically ignored
                 i += 1
                 j += 1
             elif self.seq_to_align[i] == '-':   # gap in the chain, but not in the aligned sequence
-
                 # search for a gap to the consensus nearby
                 k = 0  # Search must start at zero to assert the difference comes from '-' in front of '.'
-                while j+k<alilen and s_seq[j+k] == '.':
+                while j+k < alilen and s_seq[j+k] == '.':
                     k += 1
                 # if found, set j to that position
-                if j+k<alilen and s_seq[j+k] == '-':
+                if j+k < alilen and s_seq[j+k] == '-':
-                    re_mappings.append( (self.db_chain_id, i+1, j+k+1) )
+                    re_mappings.append((self.db_chain_id, i+1, j+k+1))
                     columns_to_save.add(j+k+1)
                     i += 1
                     j += k+1
                     continue
                 # if not, take the insertion gap if this is one
-                if j<alilen and s_seq[j] == '.':
+                if j < alilen and s_seq[j] == '.':
-                    re_mappings.append( (self.db_chain_id, i+1, j+1) )
+                    re_mappings.append((self.db_chain_id, i+1, j+1))
                     columns_to_save.add(j+1)
                     i += 1
                     j += 1
                     continue
                 # else, just mark the gap as unknown (there is an alignment mismatch)
-                re_mappings.append( (self.db_chain_id, i+1, 0) )
+                re_mappings.append((self.db_chain_id, i+1, 0))
                 i += 1
             elif s_seq[j] in ['.', '-']:  # gap in the alignment, but not in the real chain
                 j += 1  # ignore the column
@@ -672,7 +693,7 @@ class Chain:
                 l = letters[freq.index(max(freq))]
                 c_seq_to_align[i] = l
                 c_seq[i] = l
-                gaps.append((l, l=='A', l=='C', l=='G', l=='U', l=='N', self.db_chain_id, i+1 ))
+                gaps.append((l, l == 'A', l == 'C', l == 'G', l == 'U', l == 'N', self.db_chain_id, i+1))
         self.seq_to_align = ''.join(c_seq_to_align)
         self.seq = ''.join(c_seq)
         return gaps
@@ -684,6 +705,7 @@ class Job:
     This could be a system command or the execution of a Python function.
     Time and memory usage of a job can be monitored.
     """
+
     def __init__(self, results="", command=[], function=None, args=[], how_many_in_parallel=0, priority=1, timeout=None, checkFunc=None, checkArgs=[], label=""):
         self.cmd_ = command             # A system command to run
         self.func_ = function           # A python function to run
@@ -709,7 +731,8 @@ class Job:
         if self.func_ is None:
             s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}" + " ".join(self.cmd_)
         else:
-            s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}{self.func_.__name__}(" + " ".join([str(a) for a in self.args_]) + ")"
+            s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}{self.func_.__name__}(" \
+                + " ".join([ str(a) for a in self.args_ ]) + ")"
         return s
@@ -767,13 +790,14 @@ class Downloader:
         print("> Fetching latest PDB mappings from Rfam..." + " " * 29, end='', flush=True)
         try:
             db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam')
-            mappings = pd.read_sql('SELECT rfam_acc, pdb_id, chain, pdb_start, pdb_end, bit_score, evalue_score, cm_start, cm_end, hex_colour FROM pdb_full_region WHERE is_significant=1;', con=db_connection)
+            mappings = pd.read_sql('SELECT rfam_acc, pdb_id, chain, pdb_start, pdb_end, bit_score, evalue_score, cm_start, cm_end, hex_colour FROM pdb_full_region WHERE is_significant=1;',
+                con=db_connection)
             mappings.to_csv(runDir + "/data/Rfam-PDB-mappings.csv")
             print(f"\t{validsymb}")
         except sqlalchemy.exc.OperationalError:  # Cannot connect :'(
             print(f"\t{errsymb}")
             # Check if a previous run succeeded (if file exists, use it)
-            if path.isfile(runDir + "/data/Rfam-PDB-mappings.csv"):
+            if os.path.isfile(runDir + "/data/Rfam-PDB-mappings.csv"):
                 print("\t> Using previous version.")
                 mappings = pd.read_csv(runDir + "/data/Rfam-PDB-mappings.csv")
             else:  # otherwise, abort.
@@ -791,7 +815,7 @@ class Downloader:
         setproctitle(f"RNANet.py download_Rfam_cm()")
         print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True)
-        if not path.isfile(path_to_seq_data + "Rfam.cm"):
+        if not os.path.isfile(path_to_seq_data + "Rfam.cm"):
             try:
                 subprocess.run(["wget", "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz", "-O", path_to_seq_data + "Rfam.cm.gz"])
                 print(f"\t{validsymb}", flush=True)
@@ -815,7 +839,6 @@ class Downloader:
         try:
             db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam')
-
             # Prepare the SQL query. It computes the length of the chains and gets the maximum length by family.
             q = """SELECT stats.rfam_acc, k.description, stats.maxlength FROM
                     (SELECT fr.rfam_acc, MAX(    
@@ -838,15 +861,17 @@ class Downloader:
             d = pd.read_sql(q, con=db_connection)
             # filter the results to families we are interested in
-            d = d[ d["rfam_acc"].isin(list_of_families) ]
+            d = d[d["rfam_acc"].isin(list_of_families)]
             print(d)
             with sqlite3.connect(runDir + "/results/RNANet.db", timeout=20.0) as conn:
-                sql_execute(conn, """
+                # We use the REPLACE keyword to get the latest information
-                    INSERT OR REPLACE INTO family (rfam_acc, description, max_len)
+                sql_execute(conn, """INSERT OR REPLACE INTO family (rfam_acc, description, max_len)
-                    VALUES (?, ?, ?);""", many=True, data=list(d.to_records(index=False))
+                                     VALUES (?, ?, ?);""", 
-                ) # We use the replace keyword to get the latest information
+                            many=True, 
+                            data=list(d.to_records(index=False))
+                            )  
         except sqlalchemy.exc.OperationalError:
             warn("Something's wrong with the SQL database. Check mysql-rfam-public.ebi.ac.uk status and try again later. Not printing statistics.")
@@ -858,10 +883,11 @@ class Downloader:
         setproctitle(f"RNANet.py download_Rfam_sequences({rfam_acc})")
-        if not path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"):
+        if not os.path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"):
             for _ in range(10):  # retry up to 10 times if it fails
                 try:
-                    subprocess.run(["wget", f'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/{rfam_acc}.fa.gz', "-O", path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"], stdout=subprocess.DEVNULL)
+                    subprocess.run(["wget", f'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/{rfam_acc}.fa.gz', "-O",
+                                    path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                     notify(f"Downloaded {rfam_acc}.fa.gz from Rfam")
                     return          # if it worked, no need to retry
                 except Exception as e:
@@ -881,8 +907,9 @@ class Downloader:
         setproctitle(f"RNANet.py download_BGSU_NR_list({res})")
-        nr_code = min([ i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res ]) 
+        nr_code = min([i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res])
         print(f"> Fetching latest list of RNA files at {nr_code} A resolution from BGSU website...", end='', flush=True)
+
         # Download latest BGSU non-redundant list
         try:
             s = requests.get(f"http://rna.bgsu.edu/rna3dhub/nrlist/download/current/{nr_code}A/csv").content
@@ -894,13 +921,13 @@ class Downloader:
             warn("Error downloading NR list !\t", error=True)
             # Try to read previous file
-            if path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
+            if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
-                print("\t> Use of the previous version.\t", end = "", flush=True)
+                print("\t> Use of the previous version.\t", end="", flush=True)
             else:
                 return pd.DataFrame([], columns=["class", "class_members"])
         nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv")
-        full_structures_list = [ tuple(i[1]) for i in nrlist[['class','class_members']].iterrows() ]
+        full_structures_list = [ tuple(i[1]) for i in nrlist[['class', 'class_members']].iterrows() ]
         print(f"\t{validsymb}", flush=True)
         # The beginning of an adventure.
@@ -910,14 +937,15 @@ class Downloader:
         setproctitle(f"RNANet.py download_from_SILVA({unit})")
-
+        if not os.path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
-        if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
             try:
                 print(f"Downloading {unit} from SILVA...", end='', flush=True)
-                if unit=="LSU":
+                if unit == "LSU":
-                    subprocess.run(["wget", "http://www.arb-silva.de/fileadmin/arb_web_db/release_132/ARB_files/SILVA_132_LSURef_07_12_17_opt.arb.gz", "-O", path_to_seq_data + "realigned/LSU.arb.gz"])
+                    subprocess.run(["wget", "-nv", "http://www.arb-silva.de/fileadmin/arb_web_db/release_132/ARB_files/SILVA_132_LSURef_07_12_17_opt.arb.gz",
+                                    "-O", path_to_seq_data + "realigned/LSU.arb.gz"])
                 else:
-                    subprocess.run(["wget", "http://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_05_01_20_opt.arb.gz", "-O", path_to_seq_data + "realigned/SSU.arb.gz"])
+                    subprocess.run(["wget", "-nv", "http://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_05_01_20_opt.arb.gz",
+                                    "-O", path_to_seq_data + "realigned/SSU.arb.gz"])
             except:
                 warn(f"Error downloading the {unit} database from SILVA", error=True)
                 exit(1)
@@ -949,7 +977,8 @@ class Mapping:
     def filter_df(self, df):
-        newdf = df.drop(df[(df.nt_resnum < self.nt_start) | (df.nt_resnum > self.nt_end)].index)
+        newdf = df.drop(df[(df.nt_resnum < self.nt_start) |
+                           (df.nt_resnum > self.nt_end)].index)
         if len(newdf.index_chain) > 0:
             # everything's okay
@@ -961,19 +990,20 @@ class Mapping:
             # index_chain and not nt_resnum.
             warn(f"Assuming mapping to {self.rfam_acc} is an absolute position interval.")
             weird_mappings.add(self.chain_label + "." + self.rfam_acc)
-            df = df.drop(df[(df.index_chain < self.nt_start) | (df.index_chain > self.nt_end)].index)
+            df = df.drop(df[(df.index_chain < self.nt_start) |
+                            (df.index_chain > self.nt_end)].index)
         # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one
         self.st = 0
-        if len(df.index_chain) and df.iloc[0,0] != 1:
+        if len(df.index_chain) and df.iloc[0, 0] != 1:
-            self.st = df.iloc[0,0] -1
+            self.st = df.iloc[0, 0] - 1
             df.iloc[:, 0] -= self.st
             self.log(f"Shifting index_chain of {self.st}")
         # Check that some residues are not included by mistake:
         # e.g. 4v4t-AA.RF00382-20-55 contains 4 residues numbered 30 but actually far beyond the mapped part,
         # because the icode are not read by DSSR.
-        toremove = df[ df.index_chain > self.nt_end ]
+        toremove = df[df.index_chain > self.nt_end]
         if not toremove.empty:
             df = df.drop(toremove.index)
             self.log(f"Some nt_resnum values are likely to be wrong, not considering residues:")
@@ -991,9 +1021,9 @@ class Mapping:
         if self.logs == []:
             return  # Do not create a log file if there is nothing to log
-        if not path.exists("logs"):
+        if not os.path.exists(runDir+"/logs"):
-            os.makedirs("logs", exist_ok=True)
+            os.makedirs(runDir+"/logs", exist_ok=True)
-        with open("logs/"+filename, "w") as f:
+        with open(runDir+"/logs/"+filename, "w") as f:
             f.writelines(self.logs)
@@ -1019,20 +1049,23 @@ class Pipeline:
         self.SELECT_ONLY = None
         self.ARCHIVE = False
         self.SAVELOGS = True
+        self.FULLINFERENCE = False
     def process_options(self):
-        """Sets the paths and options of the pipeline"""
+        """Sets the paths and options of the pipeline
+        """
+        
         global path_to_3D_data
         global path_to_seq_data
         setproctitle("RNANet.py process_options()")
         try:
-            opts, _ = getopt.getopt( sys.argv[1:], "r:hs", 
+            opts, _ = getopt.getopt(sys.argv[1:], "r:fhs",
-                                    [   "help", "resolution=", "keep-hetatm=", "from-scratch",
+                                    ["help", "resolution=", "keep-hetatm=", "from-scratch", "full-inference",
                                         "fill-gaps=", "3d-folder=", "seq-folder=",
                                         "no-homology", "ignore-issues", "extract", "only=", "all", "no-logs",
-                                        "archive", "update-homologous" ])
+                                        "archive", "update-homologous"])
         except getopt.GetoptError as err:
             print(err)
             sys.exit(2)
@@ -1044,13 +1077,15 @@ class Pipeline:
                 exit()
             if opt == "-h" or opt == "--help":
-                print(  "RNANet, a script to build a multiscale RNA dataset from public data\n"
+                print("RNANet, a script to build a multiscale RNA dataset from public data\n"
                       "Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020")
                 print()
                 print("Options:")
                 print("-h [ --help ]\t\t\tPrint this help message")
                 print("--version\t\t\tPrint the program version")
                 print()
+                print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of chains"
+                      "\n\t\t\t\tmapped to different families.")
                 print("-r 4.0 [ --resolution=4.0 ]\tMaximum 3D structure resolution to consider a RNA chain.")
                 print("-s\t\t\t\tRun statistics computations after completion")
                 print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.")
@@ -1062,7 +1097,7 @@ class Pipeline:
                       "\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format"
                       "\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains"
                       "\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.")
-                print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files."
+                print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:"
                       "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families"
                       "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
                 print("--no-homology\t\t\tDo not try to compute PSSMs and do not align sequences."
@@ -1077,7 +1112,7 @@ class Pipeline:
                 print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications")
                 print()
                 print("Typical usage:")
-                print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &") 
+                print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &")
                 sys.exit()
             elif opt == '--version':
                 print("RNANet 1.1 beta")
@@ -1087,21 +1122,21 @@ class Pipeline:
                 self.CRYSTAL_RES = float(arg)
             elif opt == "-s":
                 self.RUN_STATS = True
-            elif opt=="--keep-hetatm":
+            elif opt == "--keep-hetatm":
-                assert arg in [ "True", "False" ]
+                assert arg in ["True", "False"]
                 self.KEEP_HETATM = (arg == "True")
-            elif opt=="--fill-gaps":
+            elif opt == "--fill-gaps":
-                assert arg in [ "True", "False" ]
+                assert arg in ["True", "False"]
                 self.FILL_GAPS = (arg == "True")
-            elif opt=="--no-homology":
+            elif opt == "--no-homology":
                 self.HOMOLOGY = False
-            elif opt=='--3d-folder':
+            elif opt == '--3d-folder':
-                path_to_3D_data = path.abspath(arg)
+                path_to_3D_data = os.path.abspath(arg)
                 if path_to_3D_data[-1] != '/':
                     path_to_3D_data += '/'
                 print("> Storing 3D data into", path_to_3D_data)
-            elif opt=='--seq-folder':
+            elif opt == '--seq-folder':
-                path_to_seq_data = path.abspath(arg)
+                path_to_seq_data = os.path.abspath(arg)
                 if path_to_seq_data[-1] != '/':
                     path_to_seq_data += '/'
                 print("> Storing sequences into", path_to_seq_data)
@@ -1138,6 +1173,8 @@ class Pipeline:
                 self.ARCHIVE = True
             elif opt == "--no-logs":
                 self.SAVELOGS = False
+            elif opt == "-f" or opt == "--full-inference":
+                self.FULLINFERENCE = True
         if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions":
             print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
@@ -1149,17 +1186,19 @@ class Pipeline:
         """List 3D chains with available Rfam mappings.
         Return a list of Chain() objects with the mappings set up.        
-        If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains."""
+        If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.
+        """
         setproctitle("RNANet.py list_available_mappings()")
         # List all 3D RNA chains below given resolution
-        full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members )
+        full_structures_list = self.dl.download_BGSU_NR_list(
+            self.CRYSTAL_RES)  # list of tuples ( class, class_members )
         # Check for a list of known problems:
-        if path.isfile(runDir + "/known_issues.txt"):
+        if os.path.isfile(runDir + "/known_issues.txt"):
             with open(runDir + "/known_issues.txt", 'r') as issues:
-                self.known_issues = [ x[:-1] for x in issues.readlines() ]
+                self.known_issues = [x[:-1] for x in issues.readlines()]
             if self.USE_KNOWN_ISSUES:
                 print("\t> Ignoring known issues:")
                 for x in self.known_issues:
@@ -1175,9 +1214,18 @@ class Pipeline:
             p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores)
             try:
-                pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, desc="Eq. classes", bar_format="{desc}:{percentage:3.0f}%|{bar}|")
+                pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1,
-                for _, newchains in enumerate(p.imap_unordered(partial(work_infer_mappings, not self.REUSE_ALL, allmappings), full_structures_list, chunksize=1)): 
+                            desc="Eq. classes", bar_format="{desc}:{percentage:3.0f}%|{bar}|")
+                for _, newchains in enumerate(p.imap_unordered(partial(
+                                                                    work_infer_mappings, 
+                                                                    not self.REUSE_ALL, 
+                                                                    allmappings, 
+                                                                    self.FULLINFERENCE
+                                                              ), 
+                                                              full_structures_list, 
+                                                              chunksize=1)):
                     self.update += newchains
+                    
                     pbar.update(1)  # Everytime the iteration finishes, update the global progress bar
                 pbar.close()
@@ -1192,7 +1240,7 @@ class Pipeline:
         else:
             conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0)
             for eq_class, codelist in tqdm(full_structures_list, desc="Eq. classes"):
-                codes = codelist.replace('+',',').split(',')
+                codes = codelist.replace('+', ',').split(',')
                 # Simply convert the list of codes to Chain() objects
                 for c in codes:
@@ -1201,40 +1249,48 @@ class Pipeline:
                     pdb_model = int(nr[1])
                     pdb_chain_id = nr[2].upper()
                     chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
-                    res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc = 'unmappd' AND issue=0""")
+                    res = sql_ask_database(conn, f"""SELECT chain_id from chain 
+                                                        WHERE structure_id='{pdb_id}' 
+                                                        AND chain_name='{pdb_chain_id}' 
+                                                        AND rfam_acc = 'unmappd' 
+                                                        AND issue=0""")
                     if not len(res) or self.REUSE_ALL:  # the chain is NOT yet in the database, or this is a known issue
                         self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class))
             conn.close()
         if self.SELECT_ONLY is not None:
-            self.update = [ c for c in self.update if c.chain_label == self.SELECT_ONLY ]
+            self.update = [
+                c for c in self.update if c.chain_label == self.SELECT_ONLY]
         self.n_chains = len(self.update)
         print(str(self.n_chains) + " RNA chains of interest.")
     @trace_unhandled_exceptions
-    def dl_and_annotate(self, retry=False, coeff_ncores = 0.75):
+    def dl_and_annotate(self, retry=False, coeff_ncores=0.75):
         """
         Gets mmCIF files from the PDB, and runs DSSR on them.
         Ignores a structure if the file already exists (not if we are retrying).
         REQUIRES the previous definition of self.update, so call list_available_mappings() before.
-        SETS table structure"""
+        SETS table structure
+        """
-        # setproctitle(f"RNANet.py dl_and_annotate(retry={retry})")
+        setproctitle(f"RNANet.py dl_and_annotate(retry={retry})")
         # Prepare the results folders
-        if not path.isdir(path_to_3D_data + "RNAcifs"):
+        if not os.path.isdir(path_to_3D_data + "RNAcifs"):
-            os.makedirs(path_to_3D_data + "RNAcifs")        # for the whole structures
+            # for the whole structures
-        if not path.isdir(path_to_3D_data + "annotations"):
+            os.makedirs(path_to_3D_data + "RNAcifs")
-            os.makedirs(path_to_3D_data + "annotations")    # for DSSR analysis of the whole structures
+        if not os.path.isdir(path_to_3D_data + "annotations"):
+            # for DSSR analysis of the whole structures
+            os.makedirs(path_to_3D_data + "annotations")
         # Download and annotate
         print("> Downloading and annotating structures (or checking previous results if they exist)...", flush=True)
         if retry:
-            mmcif_list = sorted(set([ c.pdb_id for c in self.retry ]))
+            mmcif_list = sorted(set([c.pdb_id for c in self.retry]))
         else:
-            mmcif_list = sorted(set([ c.pdb_id for c in self.update ]))
+            mmcif_list = sorted(set([c.pdb_id for c in self.update]))
         try:
             p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores))
             pbar = tqdm(mmcif_list, maxinterval=1.0, miniters=1, desc="mmCIF files")
@@ -1255,16 +1311,19 @@ class Pipeline:
         and extract their informations from the JSON files to the database.
         REQUIRES the previous definition of self.update, so call list_available_mappings() before.
-        SETS self.loaded_chains"""
+        SETS self.loaded_chains
+        """
         setproctitle(f"RNANet.py build_chains(retry={retry})")
         # Prepare folders
         if self.EXTRACT_CHAINS:
-            if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
+            if self.HOMOLOGY and not os.path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
-                os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") # for the portions mapped to Rfam
+                # for the portions mapped to Rfam
-            if (not self.HOMOLOGY) and not path.isdir(path_to_3D_data + "rna_only"):
+                os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam")
-                os.makedirs(path_to_3D_data + "rna_only") # extract chains of pure RNA
+            if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"):
+                # extract chains of pure RNA
+                os.makedirs(path_to_3D_data + "rna_only")
         # define and run jobs
         joblist = []
@@ -1296,44 +1355,48 @@ class Pipeline:
                     issues += 1
                     issues_names.append(c[1].chain_label)
                     ki.write(c[1].chain_label + '\n')
-                    kir.write(c[1].chain_label + '\n' + c[1].error_messages + '\n\n')
+                    kir.write(c[1].chain_label + '\n' +
+                              c[1].error_messages + '\n\n')
                     with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
                         sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,))
         ki.close()
         kir.close()
         if issues:
             warn(f"Added {issues} newly discovered issues to known issues:")
-            print("\033[33m"+ " ".join(issues_names) + "\033[0m", flush=True)
+            print("\033[33m" + " ".join(issues_names) + "\033[0m", flush=True)
         # Add successfully built chains to list
-        self.loaded_chains += [ c[1] for c in results if not c[1].delete_me ]
+        self.loaded_chains += [c[1] for c in results if not c[1].delete_me]
         # Identify errors due to empty JSON files (this happen when RAM is full, we believe).
         # Retrying often solves the issue... so retry once with half the cores to limit the RAM usage.
         self.to_retry = [ c[1] for c in results if "Could not load existing" in c[1].error_messages ]
     def checkpoint_save_chains(self):
-        """Saves self.loaded_chains to data/loaded_chains.picke"""
+        """Saves self.loaded_chains to data/loaded_chains.pickle
-        with open(runDir + "/data/loaded_chains.pickle","wb") as pick:
+        """
+        with open(runDir + "/data/loaded_chains.pickle", "wb") as pick:
             pickle.dump(self.loaded_chains, pick)
     def checkpoint_load_chains(self):
-        """Load self.loaded_chains from data/loaded_chains.pickle"""
+        """Load self.loaded_chains from data/loaded_chains.pickle
-        with open(runDir + "/data/loaded_chains.pickle","rb") as pick:
+        """
+        with open(runDir + "/data/loaded_chains.pickle", "rb") as pick:
             self.loaded_chains = pickle.load(pick)
     def prepare_sequences(self):
         """Downloads homologous sequences and covariance models required to compute MSAs.
         REQUIRES that self.loaded_chains is defined.
-        SETS family (partially, through call)"""
+        SETS family (partially, through call)
+        """
         setproctitle("RNANet.py prepare_sequences()")
         # Preparing a results folder
         if not os.access(path_to_seq_data + "realigned/", os.F_OK):
             os.makedirs(path_to_seq_data + "realigned/")
-        if not path.isdir(path_to_seq_data + "rfam_sequences/fasta/"):
+        if not os.path.isdir(path_to_seq_data + "rfam_sequences/fasta/"):
             os.makedirs(path_to_seq_data + "rfam_sequences/fasta/", exist_ok=True)
         # Update the family table (rfam_acc, description, max_len)
@@ -1344,7 +1407,8 @@ class Pipeline:
         joblist = []
         for f in self.fam_list:
-            joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[self.dl, f, rfam_acc_to_download[f]]))
+            joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[
+                           self.dl, f, rfam_acc_to_download[f]]))
         try:
             execute_joblist(joblist)
@@ -1360,14 +1424,16 @@ class Pipeline:
         """Perform multiple sequence alignments.
         REQUIRES self.fam_list to be defined
-        SETS family (partially)"""
+        SETS family (partially)
+        """
         setproctitle("RNANet.py realign()")
         # Prepare the job list
         joblist = []
         for f in self.fam_list:
-            joblist.append( Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f))  # the function already uses all CPUs so launch them one by one
+            # the function already uses all CPUs so launch them one by one (how_many_in_parallel=1)
+            joblist.append(Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f))
         # Execute the jobs
         try:
@@ -1379,8 +1445,8 @@ class Pipeline:
         # Update the database
         data = []
         for r in results:
-            align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
+            align = Bio.AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
-            nb_3d_chains = len([ 1 for r in align if '[' in r.id ])
+            nb_3d_chains = len([1 for r in align if '[' in r.id])
             if r[0] in SSU_set:  # SSU v138 is used
                 nb_homologs = 2225272       # source: https://www.arb-silva.de/documentation/release-138/
                 nb_total_homol = nb_homologs + nb_3d_chains
@@ -1390,7 +1456,7 @@ class Pipeline:
             else:
                 nb_total_homol = len(align)
                 nb_homologs = nb_total_homol - nb_3d_chains
-            data.append( (nb_homologs, nb_3d_chains, nb_total_homol, r[2], r[3], r[0]) )
+            data.append((nb_homologs, nb_3d_chains, nb_total_homol, r[2], r[3], r[0]))
         with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
             sql_execute(conn, """UPDATE family SET nb_homologs = ?, nb_3d_chains = ?, nb_total_homol = ?, comput_time = ?, comput_peak_mem = ? 
@@ -1399,13 +1465,14 @@ class Pipeline:
     def remap(self):
         """Compute nucleotide frequencies of some alignments and save them in the database
-        REQUIRES self.fam_list to be defined"""
+        REQUIRES self.fam_list to be defined
+        """
         setproctitle("RNANet.py remap()")
         print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)")
         print("Check your CPU and disk I/O activity before deciding if the job failed.")
-        nworkers =max(min(ncores, len(self.fam_list)), 1)
+        nworkers = max(min(ncores, len(self.fam_list)), 1)
         # Prepare the architecture of a shiny multi-progress-bars design
                                                 # Push the number of workers to a queue.
@@ -1419,8 +1486,10 @@ class Pipeline:
         try:
             fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True)
-            for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)): # Apply work_pssm to each RNA family
+            # Apply work_pssm to each RNA family
-                fam_pbar.update(1) # Everytime the iteration finishes on a family, update the global progress bar over the RNA families
+            for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)):
+                # Every time the iteration finishes on a family, update the global progress bar over the RNA families
+                fam_pbar.update(1)
             fam_pbar.close()
             p.close()
             p.join()
@@ -1434,23 +1503,24 @@ class Pipeline:
     def output_results(self):
         """Produces CSV files, archive them, and additional metadata files
-        REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)"""
+        REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)
+        """
         setproctitle("RNANet.py output_results()")
         time_str = time.strftime("%Y%m%d")
-        #Prepare folders:
+        # Prepare folders:
-        if not path.isdir(path_to_3D_data + "datapoints/"):
+        if not os.path.isdir(path_to_3D_data + "datapoints/"):
             os.makedirs(path_to_3D_data + "datapoints/")
-        if not path.isdir(runDir + "/results/archive/"):
+        if not os.path.isdir(runDir + "/results/archive/"):
             os.makedirs(runDir + "/results/archive/")
         # Save to by-chain CSV files
         p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3)
         try:
             pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True)
-            for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains, chunksize=2)):
+            for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains)):
                 pbar.update(1)
             pbar.close()
             p.close()
@@ -1465,36 +1535,44 @@ class Pipeline:
         # Run statistics
         if self.RUN_STATS:
             # Remove previous precomputed data
-            subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
+            subprocess.run(["rm", "-f", runDir + "/data/wadley_kernel_eta.npz", 
+                                        runDir + "/data/wadley_kernel_eta_prime.npz", 
+                                        runDir + "/data/pair_counts.csv"])
             for f in self.fam_list:
-                subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"])
+                subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy", 
+                                            runDir + f"/data/{f}_pairs.csv", 
+                                            runDir + f"/data/{f}_counts.csv"])
             # Run statistics files
-            os.chdir(runDir)
+            subprocess.run(["python3.8", fileDir+"/regression.py"])
-            subprocess.run(["python3.8", "regression.py"])
+            subprocess.run(["python3.8", fileDir+"/statistics.py", "--3d-folder",  path_to_3D_data, 
-            subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data])
+                            "--seq-folder", path_to_seq_data, "-r", str(self.CRYSTAL_RES)])
         # Save additional informations
         with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
-            pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", 
+            pd.read_sql_query("""SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem 
+                                 FROM family ORDER BY nb_3d_chains DESC;""",
                               conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
-            pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure 
+            pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue 
+                                 FROM structure 
                                  JOIN chain ON structure.pdb_id = chain.structure_id
-                                ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
+                                 ORDER BY structure_id, chain_name, rfam_acc ASC;""",
+                              conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
         # Archive the results
-        if self.SELECT_ONLY is None:
+        if self.ARCHIVE:
-            os.makedirs("results/archive", exist_ok=True)
+            os.makedirs(runDir + "/results/archive", exist_ok=True)
-            subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."])
+            subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf",
+                            runDir + f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", "."])
             # Update shortcuts to latest versions
             subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz",
                             runDir + "/results/summary_latest.csv",
                             runDir + "/results/families_latest.csv"
                             ])
-        subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
+            subprocess.run(['ln', "-s", runDir + f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
-        subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
+            subprocess.run(['ln', "-s", runDir + f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
-        subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
+            subprocess.run(['ln', "-s", runDir + f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
     def sanitize_database(self):
         """Searches for issues in the database and correct them"""
@@ -1518,7 +1596,9 @@ class Pipeline:
         if self.HOMOLOGY:
             # check if chains have been re_mapped:
             r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain 
-                                          WHERE issue = 0 AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping)
+                                          WHERE issue = 0 
+                                                AND rfam_acc != 'unmappd'
+                                                AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping)
                                           GROUP BY rfam_acc;""")
             try:
                 if len(r) and r[0][0] is not None:
@@ -1545,22 +1625,25 @@ class Pipeline:
 def read_cpu_number():
-    # As one shall not use os.cpu_count() on LXC containers,
+    """This function reads the number of CPU cores available from /proc/cpuinfo.
-    # because it reads info from /sys wich is not the VM resources but the host resources.
+    One shall not use os.cpu_count() on LXC containers,
-    # This function reads it from /proc/cpuinfo instead.
+    because it reads info from /sys which is not the VM resources but the host resources.
+    """
     p = subprocess.run(['grep', '-Ec', '(Intel|AMD)', '/proc/cpuinfo'], stdout=subprocess.PIPE)
     return int(int(p.stdout.decode('utf-8')[:-1])/2)
+
 def init_worker(tqdm_lock=None):
     signal.signal(signal.SIGINT, signal.SIG_IGN)
     if tqdm_lock is not None:
         tqdm.set_lock(tqdm_lock)
+
 def warn(message, error=False):
     """Pretty-print warnings and error messages.
     """
     # Cut if too long
-    if len(message)>66:
+    if len(message) > 66:
         x = message.find(' ', 50, 66)
         if x != -1:
             warn(message[:x], error=error)
@@ -1574,11 +1657,13 @@ def warn(message, error=False):
     else:
         print(f"\t> \033[33mWARN: {message:64s}\033[0m\t{warnsymb}", flush=True)
+
 def notify(message, post=''):
     if len(post):
         post = '(' + post + ')'
     print(f"\t> {message:70s}\t{validsymb}\t{post}", flush=True)
+
 def sql_define_tables(conn):
     conn.executescript(
         """ PRAGMA foreign_keys = on;
@@ -1684,8 +1769,9 @@ def sql_define_tables(conn):
          """)
     conn.commit()
+
 @trace_unhandled_exceptions
-def sql_ask_database(conn, sql, warn_every = 10):
+def sql_ask_database(conn, sql, warn_every=10):
     """
     Reads the SQLite database.
     Returns a list of tuples.
@@ -1698,11 +1784,13 @@ def sql_ask_database(conn, sql, warn_every = 10):
             return result         # if it worked, no need to retry
         except sqlite3.OperationalError as e:
             if warn_every and not (_+1) % warn_every:
-                warn(str(e) + ", retrying in 0.2s (worker " + str(os.getpid()) + f', try {_+1}/100)')
+                warn(str(e) + ", retrying in 0.2s (worker " +
+                     str(os.getpid()) + f', try {_+1}/100)')
             time.sleep(0.2)
     warn("Tried to reach database 100 times and failed. Aborting.", error=True)
     return []
+
 @trace_unhandled_exceptions
 def sql_execute(conn, sql, many=False, data=None, warn_every=10):
     conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
@@ -1721,10 +1809,12 @@ def sql_execute(conn, sql, many=False, data=None, warn_every=10):
             return          # if it worked, no need to retry
         except sqlite3.OperationalError as e:
             if warn_every and not (_+1) % warn_every:
-                warn(str(e) + ", retrying in 0.2s (worker " + str(os.getpid()) + f', try {_+1}/100)')
+                warn(str(e) + ", retrying in 0.2s (worker " +
+                     str(os.getpid()) + f', try {_+1}/100)')
             time.sleep(0.2)
     warn("Tried to reach database 100 times and failed. Aborting.", error=True)
+
 @trace_unhandled_exceptions
 def execute_job(j, jobcount):
     """Run a Job object.
@@ -1741,7 +1831,8 @@ def execute_job(j, jobcount):
         print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.label}")
         # Add the command to logfile
-        logfile = open(runDir + "/log_of_the_run.sh", 'a')
+        os.makedirs(runDir+"/logs", exist_ok=True)
+        logfile = open(runDir + "/logs/log_of_the_run.sh", 'a')
         logfile.write(" ".join(j.cmd_))
         logfile.write("\n")
         logfile.close()
@@ -1753,7 +1844,8 @@ def execute_job(j, jobcount):
             # run the command. subprocess.run will be a child of this process, and stays monitored.
             start_time = time.time()
-            r = subprocess.run(j.cmd_, timeout=j.timeout_, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            r = subprocess.run(j.cmd_, timeout=j.timeout_,
+                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             end_time = time.time()
             # Stop the Monitor, then get its result
@@ -1782,7 +1874,8 @@ def execute_job(j, jobcount):
     # return time and memory statistics, plus the job results
     t = end_time - start_time
-    return (t,m,r)
+    return (t, m, r)
+
 def execute_joblist(fulljoblist):
     """ Run a list of job objects.
@@ -1815,8 +1908,9 @@ def execute_joblist(fulljoblist):
     # Process the jobs from priority 1 to nprio
     results = []
-    for i in range(1,nprio+1):
+    for i in range(1, nprio+1):
-        if i not in jobs.keys(): continue # no job has the priority level i
+        if i not in jobs.keys():
+            continue  # no job has the priority level i
         print("processing jobs of priority", i)
         different_thread_numbers = sorted(jobs[i].keys())
@@ -1825,7 +1919,8 @@ def execute_joblist(fulljoblist):
         for n in different_thread_numbers:
             # get the bunch of jobs of same priority and thread number
             bunch = jobs[i][n]
-            if not len(bunch): continue # no jobs should be processed n by n
+            if not len(bunch):
+                continue  # no jobs should be processed n by n
             print("using", n, "processes:")
             # execute jobs of priority i that should be processed n by n:
@@ -1843,13 +1938,14 @@ def execute_joblist(fulljoblist):
             for j, r in zip(bunch, raw_results):
                 j.comp_time = round(r[0], 2)  # seconds
                 j.max_mem = int(r[1]/1000000)  # MB
-                results.append( (j.label, r[2], round(r[0], 2), int(r[1]/1000000)))
+                results.append((j.label, r[2], round(r[0], 2), int(r[1]/1000000)))
     # throw back the money
     return results
+
 @trace_unhandled_exceptions
-def work_infer_mappings(update_only, allmappings, codelist) -> list:
+def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list:
     """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, 
     build a list of Chain() objects mapped to Rfam families, by expanding available mappings 
     of any element of the list to all the list elements.
@@ -1862,13 +1958,13 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
     # Split the comma-separated list of chain codes into chain codes:
     eq_class = codelist[0]
-    codes = codelist[1].replace('+',',').split(',')
+    codes = codelist[1].replace('+', ',').split(',')
     # Search for mappings that apply to an element of this PDB chains list:
     for c in codes:
         # search for Rfam mappings with this chain c:
         m_row_indices = allmappings.pdb_id + "|1|" + allmappings.chain == c[:4].lower()+c[4:]
-        m = allmappings.loc[m_row_indices].drop(['bit_score','evalue_score','cm_start','cm_end','hex_colour'], axis=1)
+        m = allmappings.loc[m_row_indices].drop(['bit_score', 'evalue_score', 'cm_start', 'cm_end', 'hex_colour'], axis=1)
         if len(m):
             # remove the found mappings from the dataframe
             allmappings = allmappings.loc[m_row_indices == False]
@@ -1881,7 +1977,7 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
         families = set(known_mappings['rfam_acc'])
         # generalize
-        inferred_mappings = known_mappings.drop(['pdb_id','chain'], axis=1).drop_duplicates()
+        inferred_mappings = known_mappings.drop(['pdb_id', 'chain'], axis=1).drop_duplicates()
         # check for approximative redundancy:
         if len(inferred_mappings) != len(inferred_mappings.drop_duplicates(subset="rfam_acc")):
@@ -1890,11 +1986,11 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
             # ==> Summarize them in one mapping but with the largest window.
             for rfam in families:
                 sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end'])
-                thisfam_5_3 =  (inferred_mappings['rfam_acc'] == rfam ) & sel_5_to_3
+                thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam) & sel_5_to_3
-                thisfam_3_5 =  (inferred_mappings['rfam_acc'] == rfam ) & (sel_5_to_3 == False)
+                thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam) & (sel_5_to_3 == False)
                 if (
-                        len(inferred_mappings[thisfam_5_3]) !=  len(inferred_mappings[ inferred_mappings['rfam_acc'] == rfam ])
+                        len(inferred_mappings[thisfam_5_3]) != len(inferred_mappings[inferred_mappings['rfam_acc'] == rfam])
                     and len(inferred_mappings[thisfam_5_3]) > 0
                 ):
                     # there are mappings in both directions... wtf Rfam ?!
@@ -1908,8 +2004,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
                         # We keep only the 5->3 sense.
                         inferred_mappings = inferred_mappings.drop(index=inferred_mappings.index[thisfam_3_5])
                         sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end'])
-                        thisfam_5_3 =  (inferred_mappings['rfam_acc'] == rfam ) & sel_5_to_3
+                        thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam) & sel_5_to_3
-                        thisfam_3_5 =  (inferred_mappings['rfam_acc'] == rfam ) & (sel_5_to_3 == False)
+                        thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam) & (sel_5_to_3 == False)
                         print()
                         warn(f"Found mappings to {rfam} in both directions on the same interval, keeping only the 5'->3' one.")
                     else:
@@ -1919,35 +2015,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
                 # Compute consensus for chains in 5' -> 3' sense
                 if len(inferred_mappings[thisfam_5_3]):
-                    pdb_start_min = min(inferred_mappings[ thisfam_5_3]['pdb_start'])
+                    pdb_start_min = min(inferred_mappings[thisfam_5_3]['pdb_start'])
-                    pdb_end_max = max(inferred_mappings[ thisfam_5_3]['pdb_end']) 
+                    pdb_end_max = max(inferred_mappings[thisfam_5_3]['pdb_end'])
-                    pdb_start_max = max(inferred_mappings[ thisfam_5_3]['pdb_start'])
+                    pdb_start_max = max(inferred_mappings[thisfam_5_3]['pdb_start'])
-                    pdb_end_min = min(inferred_mappings[ thisfam_5_3]['pdb_end'])
+                    pdb_end_min = min(inferred_mappings[thisfam_5_3]['pdb_end'])
                     if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100):
                         # the variation is only a few nucleotides, we take the largest window.
-                        inferred_mappings.loc[ thisfam_5_3, 'pdb_start'] = pdb_start_min
+                        inferred_mappings.loc[thisfam_5_3, 'pdb_start'] = pdb_start_min
-                        inferred_mappings.loc[ thisfam_5_3, 'pdb_end'] = pdb_end_max
+                        inferred_mappings.loc[thisfam_5_3, 'pdb_end'] = pdb_end_max
                     else:
                         # there probably is an outlier. We chose the median value in the whole list of known_mappings.
-                        known_sel_5_to_3 = (known_mappings['rfam_acc'] == rfam ) & (known_mappings['pdb_start'] < known_mappings['pdb_end'])
+                        known_sel_5_to_3 = (known_mappings['rfam_acc'] == rfam) & (known_mappings['pdb_start'] < known_mappings['pdb_end'])
-                        inferred_mappings.loc[ thisfam_5_3, 'pdb_start'] = known_mappings.loc[known_sel_5_to_3, 'pdb_start'].median()
+                        inferred_mappings.loc[thisfam_5_3, 'pdb_start'] = known_mappings.loc[known_sel_5_to_3, 'pdb_start'].median()
-                        inferred_mappings.loc[ thisfam_5_3, 'pdb_end'] = known_mappings.loc[known_sel_5_to_3, 'pdb_end'].median()
+                        inferred_mappings.loc[thisfam_5_3, 'pdb_end'] = known_mappings.loc[known_sel_5_to_3, 'pdb_end'].median()
                 #  Compute consensus for chains in 3' -> 5' sense
                 if len(inferred_mappings[thisfam_3_5]):
-                    pdb_start_min = min(inferred_mappings[ thisfam_3_5]['pdb_start'])
+                    pdb_start_min = min(inferred_mappings[thisfam_3_5]['pdb_start'])
-                    pdb_end_max = max(inferred_mappings[ thisfam_3_5]['pdb_end']) 
+                    pdb_end_max = max(inferred_mappings[thisfam_3_5]['pdb_end'])
-                    pdb_start_max = max(inferred_mappings[ thisfam_3_5]['pdb_start'])
+                    pdb_start_max = max(inferred_mappings[thisfam_3_5]['pdb_start'])
-                    pdb_end_min = min(inferred_mappings[ thisfam_3_5]['pdb_end'])
+                    pdb_end_min = min(inferred_mappings[thisfam_3_5]['pdb_end'])
                     if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100):
                         # the variation is only a few nucleotides, we take the largest window.
-                        inferred_mappings.loc[ thisfam_3_5, 'pdb_start'] = pdb_start_max
+                        inferred_mappings.loc[thisfam_3_5, 'pdb_start'] = pdb_start_max
-                        inferred_mappings.loc[ thisfam_3_5, 'pdb_end'] = pdb_end_min
+                        inferred_mappings.loc[thisfam_3_5, 'pdb_end'] = pdb_end_min
                     else:
                         # there probably is an outlier. We chose the median value in the whole list of known_mappings.
-                        known_sel_3_to_5 = (known_mappings['rfam_acc'] == rfam ) & (known_mappings['pdb_start'] > known_mappings['pdb_end'])
+                        known_sel_3_to_5 = (known_mappings['rfam_acc'] == rfam) & (known_mappings['pdb_start'] > known_mappings['pdb_end'])
-                        inferred_mappings.loc[ thisfam_3_5, 'pdb_start'] = known_mappings.loc[known_sel_3_to_5, 'pdb_start'].median()
+                        inferred_mappings.loc[thisfam_3_5, 'pdb_start'] = known_mappings.loc[known_sel_3_to_5, 'pdb_start'].median()
-                        inferred_mappings.loc[ thisfam_3_5, 'pdb_end'] = known_mappings.loc[known_sel_3_to_5, 'pdb_end'].median()
+                        inferred_mappings.loc[thisfam_3_5, 'pdb_end'] = known_mappings.loc[known_sel_3_to_5, 'pdb_end'].median()
             inferred_mappings.drop_duplicates(inplace=True)
         # Now build Chain() objects for the mapped chains
@@ -1958,7 +2054,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
             pdb_chain_id = nr[2]
             for rfam in families:
                 # if a known mapping of this chain on this family exists, apply it
-                m = known_mappings.loc[ (known_mappings.pdb_id + "|1|" + known_mappings.chain == c[:4].lower()+c[4:]) & (known_mappings['rfam_acc'] == rfam ) ]
+                this_chain_idxs = (known_mappings.pdb_id + "|1|" + known_mappings.chain == c[:4].lower()+c[4:])
+                m = known_mappings.loc[this_chain_idxs & (known_mappings['rfam_acc'] == rfam)]
                 if len(m) and len(m) < 2:
                     pdb_start = int(m.pdb_start)
                     pdb_end = int(m.pdb_end)
@@ -1969,23 +2066,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
                     pdb_start = int(m.pdb_start.min())
                     pdb_end = int(m.pdb_end.max())
                     inferred = False
-                elif not(pdb_id in known_mappings.pdb_id and pdb_chain_id in known_mappings.chain): # if no known mapping on another family, use the inferred mapping
+                elif (fullinference or not(this_chain_idxs.any())): 
-                    pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start)
+                    # if no known mapping on another family, use the inferred mapping
-                    pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end)
+                    # the inferred mapping is also used when the user asked for it with --full-inference
+                    pdb_start = int(inferred_mappings.loc[(inferred_mappings['rfam_acc'] == rfam)].pdb_start)
+                    pdb_end = int(inferred_mappings.loc[(inferred_mappings['rfam_acc'] == rfam)].pdb_end)
                     inferred = True
+                else:
+                    # skip this family, we cannot map this chain to it.
+                    continue
                 chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}_{pdb_start}-{pdb_end}"
                 # Check if the chain exists in the database
                 if update_only:
                     with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
-                        res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc='{rfam}' AND issue=0""")
+                        res = sql_ask_database(conn, f"""SELECT chain_id from chain 
+                                                         WHERE structure_id='{pdb_id}' 
+                                                         AND chain_name='{pdb_chain_id}' 
+                                                         AND rfam_acc='{rfam}' 
+                                                         AND issue=0""")
                     if not len(res):  # the chain is NOT yet in the database, or this is a known issue
-                        newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
+                        newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class,
+                                               rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
                 else:
-                    newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
+                    newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class,
+                                           rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
     return newchains
+
 @trace_unhandled_exceptions
 def work_mmcif(pdb_id):
     """ Look for a CIF file (with all chains) from RCSB
@@ -1999,8 +2108,11 @@ def work_mmcif(pdb_id):
     # Attempt to download it if not present
     try:
-        if not path.isfile(final_filepath):
+        if not os.path.isfile(final_filepath):
-            subprocess.run(["wget", f'http://files.rcsb.org/download/{pdb_id}.cif', "-O", final_filepath], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            subprocess.run(
+                ["wget", f'http://files.rcsb.org/download/{pdb_id}.cif', "-O", final_filepath],
+                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
+            )
     except:
         warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True)
         return
@@ -2012,7 +2124,7 @@ def work_mmcif(pdb_id):
     # if not, read the CIF header and register the structure
     if not len(r):
         # Load the MMCIF file with Biopython
-        mmCif_info = MMCIF2Dict(final_filepath)
+        mmCif_info = Bio.PDB.MMCIF2Dict.MMCIF2Dict(final_filepath)
         # Get info about that structure
         try:
@@ -2036,9 +2148,9 @@ def work_mmcif(pdb_id):
         # Save into the database
         with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
             sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution)
-                                VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso))
+                                VALUES (?, ?, DATE(?), ?, ?);""", data=(pdb_id, 1, date, exp_meth, reso))
-    if not path.isfile(path_to_3D_data + "annotations/" + pdb_id + ".json"):
+    if not os.path.isfile(path_to_3D_data + "annotations/" + pdb_id + ".json"):
         # run DSSR (you need to have it in your $PATH, follow x3dna installation instructions)
         output = subprocess.run(["x3dna-dssr", f"-i={final_filepath}", "--json", "--auxfile=no"],
@@ -2052,22 +2164,23 @@ def work_mmcif(pdb_id):
             return 1
         # save the analysis to file only if we can load it :/
-        json_file = open(path_to_3D_data + "annotations/" + pdb_id + ".json", "w")
+        json_file = open(path_to_3D_data + "annotations/" +
+                         pdb_id + ".json", "w")
         json_file.write(stdout)
         json_file.close()
     return 0
+
 @trace_unhandled_exceptions
 def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
     """Reads information from JSON and save it to database.
     If asked, also extracts the 3D chains from their original structure files.
-
     """
     setproctitle(f"RNAnet.py work_build_chain({c.chain_label})")
-    if not path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"):
+    if not os.path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"):
         warn(f"Could not find annotations for {c.chain_label}, ignoring it.", error=True)
         c.delete_me = True
         c.error_messages += f"Could not download and/or find annotations for {c.chain_label}."
@@ -2094,25 +2207,28 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
     return c
+
 @trace_unhandled_exceptions
 def work_prepare_sequences(dl, rfam_acc, chains):
-    """Prepares FASTA files of homologous sequences to realign with cmalign or SINA."""
+    """Prepares FASTA files of homologous sequences to realign with cmalign or SINA.
+    """
     setproctitle("RNAnet.py work_prepare_sequences()")
     if rfam_acc in LSU_set | SSU_set:  # rRNA
-        if path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
+        if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
             # Detect doublons and remove them
-            existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
+            existing_afa = Bio.AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
-            existing_ids = [ r.id for r in existing_afa ]
+            existing_ids = [r.id for r in existing_afa]
             del existing_afa
-            new_ids = [ str(c) for c in chains ]
+            new_ids = [str(c) for c in chains]
-            doublons = [ i for i in existing_ids if i in new_ids ]
+            doublons = [i for i in existing_ids if i in new_ids]
             del existing_ids, new_ids
             if len(doublons):
-                fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
                 warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version")
-                seqfile = SeqIO.parse(fasta, "fasta")
+                fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
+                seqfile = Bio.SeqIO.parse(fasta, "fasta")
+                # remove it and rewrite it with its own content filtered
                 os.remove(fasta)
                 with open(fasta, 'w') as f:
                     for rec in seqfile:
@@ -2123,16 +2239,15 @@ def work_prepare_sequences(dl, rfam_acc, chains):
         with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f:
             for c in chains:
                 if len(c.seq_to_align):
-                    f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n') 
+                    f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n')
         status = f"{rfam_acc}: {len(chains)} new PDB sequences to align (with SINA)"
-
+    elif not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.stk"):
-    elif not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.stk"):
         # there was no previous aligned sequences, and we use cmalign.
         # So, we need to download homologous sequences from Rfam.
         # Extracting covariance model for this family
-        if not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}.cm"):
+        if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}.cm"):
             with open(path_to_seq_data + f"realigned/{rfam_acc}.cm", "w") as f:
                 subprocess.run(["cmfetch", path_to_seq_data + "Rfam.cm", rfam_acc], stdout=f)
             notify(f"Extracted {rfam_acc} covariance model (cmfetch)")
@@ -2141,7 +2256,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
         dl.download_Rfam_sequences(rfam_acc)
         # Prepare a FASTA file containing Rfamseq hits for that family
-        if path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"):    # test if download succeeded
+        if os.path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): # test if download succeeded
             # gunzip the file
             with gzip.open(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz", 'rb') as gz:
@@ -2153,14 +2268,14 @@ def work_prepare_sequences(dl, rfam_acc, chains):
             with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus:
                 ids = set()
                 # Remove doublons from the Rfam hits
-                for r in SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
+                for r in Bio.SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
                     if r.id not in ids:
                         ids.add(r.id)
                         plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n')
                 # Add the 3D chains sequences
                 for c in chains:
                     if len(c.seq_to_align):
-                        plusplus.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n') 
+                        plusplus.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n')
             del file_content
             # os.remove(path_to_seq_data + f"realigned/{rfam_acc}.fa")
@@ -2175,12 +2290,13 @@ def work_prepare_sequences(dl, rfam_acc, chains):
         with open(path_to_seq_data + f"realigned/{rfam_acc}_new.fa", "w") as f:
             for c in chains:
                 if len(c.seq_to_align):
-                    f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n') 
+                    f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n')
         status = f"{rfam_acc}: {len(chains)} new PDB sequences to realign (with existing cmalign alignment)"
     # print some stats
     notify(status)
+
 @trace_unhandled_exceptions
 def work_realign(rfam_acc):
     """ Runs multiple sequence alignements by RNA family.
@@ -2209,10 +2325,10 @@ def work_realign(rfam_acc):
     else:
         # Align using Infernal for most RNA families
-        if path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
+        if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
             # Alignment exists. We just want to add new sequences into it.
-            if not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
+            if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
                 # there are no new sequences to align...
                 return
@@ -2227,13 +2343,13 @@ def work_realign(rfam_acc):
             notify("Aligned new sequences together")
             # Detect doublons and remove them
-            existing_stk = AlignIO.read(existing_ali_path, "stockholm")
+            existing_stk = Bio.AlignIO.read(existing_ali_path, "stockholm")
-            existing_ids = [ r.id for r in existing_stk ]
+            existing_ids = [r.id for r in existing_stk]
             del existing_stk
-            new_stk = AlignIO.read(new_ali_path, "stockholm")
+            new_stk = Bio.AlignIO.read(new_ali_path, "stockholm")
-            new_ids = [ r.id for r in new_stk ]
+            new_ids = [r.id for r in new_stk]
             del new_stk
-            doublons = [ i for i in existing_ids if i in new_ids ]
+            doublons = [i for i in existing_ids if i in new_ids]
             del existing_ids, new_ids
             if len(doublons):
                 warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
@@ -2241,12 +2357,13 @@ def work_realign(rfam_acc):
                     toremove.write('\n'.join(doublons)+'\n')
                 p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
                                    stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
-                p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
+                p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
+                                   stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
                 os.remove(path_to_seq_data + "realigned/toremove.txt")
             # And we merge the two alignments
-            p2= subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", 
+            p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
-                                                "--rna", existing_ali_path, new_ali_path ], 
+                                 "--rna", existing_ali_path, new_ali_path],
                                 stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
             stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
             subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
@@ -2263,7 +2380,7 @@ def work_realign(rfam_acc):
             p = subprocess.run(["cmalign", "--small", "--cyk", "--noprob", "--nonbanded", "--notrunc",
                                 '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
                                 path_to_seq_data + f"realigned/{rfam_acc}.cm",
-                                path_to_seq_data + f"realigned/{rfam_acc}++.fa" ], 
+                                path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
                                stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
             stderr = p.stderr.decode("utf-8")
@@ -2277,7 +2394,9 @@ def work_realign(rfam_acc):
             print('\t'+validsymb, flush=True)
         # Convert Stockholm to aligned FASTA
-        subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", "--informat", "stockholm", "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
+        subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", 
+                        "--informat", "stockholm", 
+                        "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
         subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part.
     # Assert everything worked, or save an error
@@ -2288,6 +2407,7 @@ def work_realign(rfam_acc):
             with open(runDir + "/errors.txt", "a") as er:
                 er.write(f"Failed to realign {rfam_acc} (killed)")
+
 def summarize_position(counts):
     """ Computes the proportions of A, C, G, U and other residues among the ungapped residues of a "column" from an MSA.
     """
@@ -2303,15 +2423,15 @@ def summarize_position(counts):
             N += counts[char]  # number of ungapped residues
     if N:  # prevent division by zero if the column is only gaps
-        return ( counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N) # other residues, or consensus (N, K, Y...)
+        return (counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N)  # other residues, or consensus (N, K, Y...)
     else:
         return (0, 0, 0, 0, 0)
+
 @trace_unhandled_exceptions
 def work_pssm(f, fill_gaps):
     """ Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
-    Also saves every chain of the family to file.
     Uses only 1 core, so this function can be called in parallel.
     """
@@ -2323,18 +2443,17 @@ def work_pssm(f, fill_gaps):
     # get the chains of this family
     list_of_chains = rfam_acc_to_download[f]
-    chains_ids = [ str(c) for c in list_of_chains ]
+    chains_ids = [str(c) for c in list_of_chains]
     # Open the alignment
     try:
-        align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
+        align = Bio.AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
     except:
         warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True)
         with open(runDir + "/errors.txt", "a") as errf:
             errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n")
         return 1
-
     # Compute statistics per column
     pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx)
     frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ]
@@ -2378,10 +2497,13 @@ def work_pssm(f, fill_gaps):
     # Save the re_mappings
     conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
-    sql_execute(conn, "INSERT INTO re_mapping (chain_id, index_chain, index_ali) VALUES (?, ?, ?) ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;", many=True, data=re_mappings)
+    sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali) 
+                         VALUES (?, ?, ?) 
+                         ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
+                many=True, data=re_mappings)
     # Save the useful columns in the database
-    data = [ (f, j) + frequencies[j-1] for j in sorted(columns_to_save) ]
+    data = [(f, j) + frequencies[j-1] for j in sorted(columns_to_save)]
     sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other)
                          VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO 
                          UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, freq_other=excluded.freq_other;""", many=True, data=data)
@@ -2412,12 +2534,13 @@ def work_pssm(f, fill_gaps):
         pbar.close()
         sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?, 
                               is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ?
-                              WHERE chain_id = ? AND index_chain = ?;""", many=True, data = gaps)
+                              WHERE chain_id = ? AND index_chain = ?;""", many=True, data=gaps)
     conn.close()
     idxQueue.put(thr_idx)  # replace the thread index in the queue
     return 0
+
 @trace_unhandled_exceptions
 def work_save(c, homology=True):
@@ -2451,9 +2574,11 @@ def work_save(c, homology=True):
     df.to_csv(filename, float_format="%.2f", index=False)
+
 if __name__ == "__main__":
-    runDir = path.dirname(path.realpath(__file__))
+    runDir = os.getcwd()
+    fileDir = os.path.dirname(os.path.realpath(__file__))
     ncores = read_cpu_number()
     pp = Pipeline()
     pp.process_options()
@@ -2502,7 +2627,6 @@ if __name__ == "__main__":
         print("Completed.")
         exit(0)
-    
     # At this point, structure, chain and nucleotide tables of the database are up to date.
     # (Modulo some statistics computed by statistics.py)
@@ -2511,13 +2635,14 @@ if __name__ == "__main__":
     # ===========================================================================
     if pp.SELECT_ONLY is None:
-        pp.checkpoint_load_chains()  # If your job failed, you can comment all the "3D information" part and start from here.
+        # If your job failed, you can comment out the whole "3D information" part and restart from here.
+        pp.checkpoint_load_chains()
     # Get the list of Rfam families found
     rfam_acc_to_download = {}
     for c in pp.loaded_chains:
         if c.mapping.rfam_acc not in rfam_acc_to_download:
-            rfam_acc_to_download[c.mapping.rfam_acc] = [ c ]
+            rfam_acc_to_download[c.mapping.rfam_acc] = [c]
         else:
             rfam_acc_to_download[c.mapping.rfam_acc].append(c)
@@ -2546,5 +2671,5 @@ if __name__ == "__main__":
     print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove
-    # # so i can sleep for the end of the night
+    # so i can sleep for the end of the night
     # subprocess.run(["poweroff"])
--- a/known_issues.txt
View file @7196427
+++ b/known_issues.txt
View file @7196427
-1ml5_1_a_1-2914
+1eg0_1_O_1-73
-1ml5_1_a_151-2903
-1ml5_1_A_7-1515
-1ml5_1_A_2-1520
-1ml5_1_A_7-1518
-1ml5_1_b_5-121
 2rdo_1_A_3-118
 4v48_1_A9_3-118
 4v47_1_A9_3-118
-6zmi_1_L8_1267-4755
-6zm7_1_L8_1267-4755
-6y6x_1_L8_1267-4755
-6z6n_1_L8_1267-4755
-6qzp_1_L8_1267-4755
-6zme_1_L8_1267-4755
-6z6l_1_L8_1267-4755
-6ek0_1_L8_1267-4755
-6zmo_1_L8_1267-4755
-6z6m_1_L8_1267-4755
-6ole_1_D_1267-4755
-6om0_1_D_1267-4755
-6y2l_1_L8_1267-4755
-6lqm_1_8_1267-4755
-6y0g_1_L8_1267-4755
-6lu8_1_8_1267-4755
-6lsr_1_8_1267-4755
-6lss_1_8_1267-4755
-6oli_1_D_1267-4755
-6olg_1_A3_1267-4755
-6y57_1_L8_1267-4755
-5t2c_1_C_1267-4755
-6om7_1_D_1267-4755
-4ug0_1_L8_1267-4755
-6olf_1_D_1267-4755
-6ip5_1_1C_1267-4755
-6ip8_1_1C_1267-4755
-6olz_1_A3_1267-4755
-5aj0_1_A3_1267-4755
-5lks_1_L8_1267-4755
-6ip6_1_1C_1267-4755
-4v6x_1_A8_1267-4755
 1vy7_1_AY_1-73
 1vy7_1_CY_1-73
 4w2h_1_CY_1-73
-2z9q_1_A_1-72
+1jgq_1_A_2-1520
+4v42_1_AA_2-1520
+1jgo_1_A_2-1520
+1jgp_1_A_2-1520
+1ml5_1_A_2-1520
+4v42_1_BA_1-2914
+1ml5_1_a_1-2914
 4v42_1_BB_5-121
+1ml5_1_b_5-121
+2rdo_1_B_1-2904
+4v48_1_A0_1-2904
+4v47_1_A0_1-2904
+4v48_1_BA_1-1543
+4v47_1_BA_1-1542
 1ls2_1_B_1-73
 3ep2_1_Y_1-72
 3eq3_1_Y_1-72
 4v48_1_A6_1-73
-1eg0_1_O_1-73
+2z9q_1_A_1-72
 1gsg_1_T_1-72
 3jcr_1_H_1-115
-4v42_1_BA_1-2914
+1x1l_1_A_1-132
-4v42_1_BA_151-2903
+1zc8_1_Z_1-93
-4v48_1_BA_1-91
+2ob7_1_D_1-132
-4v48_1_BA_6-1541
-4v48_1_BA_1-1543
-4v48_1_BA_6-1538
-4v47_1_BA_1-91
-4v47_1_BA_6-1540
-4v47_1_BA_1-1542
-4v47_1_BA_6-1537
-2rdo_1_B_1-2903
-2rdo_1_B_6-1460
-2rdo_1_B_1-1528
-2rdo_1_B_6-1457
-2rdo_1_B_160-2893
-2rdo_1_B_1-2904
-2rdo_1_B_6-1522
-4v48_1_A0_1-2903
-4v48_1_A0_6-1460
-4v48_1_A0_1-1528
-4v48_1_A0_6-1457
-4v48_1_A0_160-2893
-4v48_1_A0_1-2904
-4v48_1_A0_6-1522
-4v47_1_A0_1-2903
-4v47_1_A0_6-1460
-4v47_1_A0_1-1528
-4v47_1_A0_6-1457
-4v47_1_A0_160-2893
-4v47_1_A0_1-2904
-4v47_1_A0_6-1522
 2ob7_1_A_10-319
-1x1l_1_A_1-130
-1zc8_1_Z_1-130
-1zc8_1_Z_1-91
-2ob7_1_D_1-130
-6rxu_1_C2_588-2386
-6rxu_1_C2_583-2388
-6rxu_1_C2_588-2383
-5oql_1_2_588-2386
-5oql_1_2_583-2388
-5oql_1_2_588-2383
-6rxv_1_C2_588-2386
-6rxv_1_C2_583-2388
-6rxv_1_C2_588-2383
-6rxz_1_C2_588-2386
-6rxz_1_C2_583-2388
-6rxz_1_C2_588-2383
-6rxy_1_C2_588-2386
-6rxy_1_C2_583-2388
-6rxy_1_C2_588-2383
-6rxt_1_C2_588-2386
-6rxt_1_C2_583-2388
-6rxt_1_C2_588-2383
 1r2x_1_C_1-58
 1r2w_1_C_1-58
-1eg0_1_L_1-57
 1eg0_1_L_1-56
-1jgq_1_A_7-1518
-1jgq_1_A_20-55
-1jgq_1_A_2-1520
-1jgq_1_A_7-1515
-4v42_1_AA_7-1518
-4v42_1_AA_20-55
-4v42_1_AA_2-1520
-4v42_1_AA_7-1515
-1jgo_1_A_7-1518
-1jgo_1_A_20-55
-1jgo_1_A_2-1520
-1jgo_1_A_7-1515
-1jgp_1_A_7-1518
-1jgp_1_A_20-55
-1jgp_1_A_2-1520
-1jgp_1_A_7-1515
 1zc8_1_A_1-59
-1mvr_1_D_1-59
+1mvr_1_D_1-61
-4c9d_1_D_29-1
+4adx_1_9_1-123
-4c9d_1_C_29-1
-4adx_1_9_1-121
 1zn1_1_B_1-59
 1emi_1_B_1-108
 3iy9_1_A_498-1027
@@ -143,25 +49,1558 @@
 3cw1_1_V_1-138
 3cw1_1_v_1-138
 2iy3_1_B_9-105
-3jcr_1_N_1-188
+3jcr_1_N_1-107
-3jcr_1_N_1-106
 2vaz_1_A_64-177
-2ftc_1_R_1-1568
-2ftc_1_R_792-1568
 2ftc_1_R_81-1466
 3jcr_1_M_1-141
-3jcr_1_M_1-188
-3jcr_1_M_1-107
-4v5z_1_B0_1-2899
 4v5z_1_B0_1-2902
-4v5z_1_B0_1-2840
 5g2x_1_A_595-692
 3iy8_1_A_1-540
 4v5z_1_BY_2-113
 4v5z_1_BZ_1-70
-1mvr_1_B_1-96
+4v5z_1_B1_2-125
-4adx_1_0_1-2923
+1mvr_1_B_3-96
-4adx_1_0_132-2915
+4adx_1_0_1-2925
 3eq4_1_Y_1-69
+6uz7_1_8_2140-2827
 4v5z_1_AA_1-1563
-4v5z_1_AA_1-1562
+6cfj_1_1X
+6cfj_1_2X
+5hcq_1_1X
+6cae_1_1X
+5hcq_1_2X
+5hcr_1_1X
+4z8c_1_1X
+5j4b_1_1X
+5j4b_1_2X
+4z8c_1_2X
+6cae_1_2X
+5j4c_1_1X
+5w4k_1_1X
+6of1_1_1X
+5hcr_1_2X
+5hd1_1_1X
+5hcp_1_1X
+6of1_1_2X
+5hau_1_1W
+5j4c_1_2X
+5wis_1_1X
+6xqd_1_1X
+6nd5_1_1X
+5w4k_1_2X
+5hau_1_2W
+6xqd_1_2X
+4y4p_1_1X
+6o97_1_1X
+5hcp_1_2X
+5doy_1_1X
+4zer_1_1X
+5wit_1_1X
+5hd1_1_2X
+6nd5_1_2X
+4z3s_1_1X
+7jql_1_1X
+7jqm_1_1X
+7jql_1_2X
+5wis_1_2X
+6nd6_1_1X
+6o97_1_2X
+4y4p_1_2X
+7jqm_1_2X
+4z3s_1_2X
+4zer_1_2X
+6uo1_1_2X
+6uo1_1_1X
+5doy_1_2X
+5wit_1_2X
+5f8k_1_1X
+6nd6_1_2X
+6xqe_1_1X
+6xqe_1_2X
+6n9e_1_1X
+6n9e_1_2X
+6n9f_1_1X
+5f8k_1_2X
+6n9f_1_2X
+6xz7_1_F
+6y69_1_W
+5afi_1_V
+5afi_1_W
+6h4n_1_W
+5wdt_1_V
+5wfs_1_V
+5wdt_1_W
+5wfs_1_W
+5we4_1_V
+5we4_1_W
+5uq8_1_Y
+6c4i_1_Y
+6c4i_1_X
+5zeb_1_V
+5zep_1_W
+5lzd_1_V
+5we6_1_V
+5wfk_1_V
+5wfk_1_W
+5we6_1_W
+5u4i_1_Y
+5uq7_1_Y
+5u4i_1_X
+5lza_1_V
+5wf0_1_V
+5wf0_1_W
+5zeu_1_V
+5l3p_1_X
+3jcj_1_V
+6gxm_1_X
+6gwt_1_X
+6gxn_1_X
+6gxo_1_X
+3j9y_1_V
+6o9k_1_Y
+6o7k_1_V
+5lzf_1_V
+3jcn_1_V
+5lzc_1_V
+5u4j_1_X
+5u4j_1_Z
+5lzb_1_V
+6h58_1_W
+6h58_1_WW
+1eg0_1_O
+5j8b_1_X
+4v7j_1_AV
+4v7j_1_BV
+4v7k_1_BV
+4v7k_1_AV
+4v7k_1_BW
+4v7k_1_AW
+4v7j_1_AW
+4v7j_1_BW
+4v4j_1_Z
+6i0v_1_B
+5k77_1_X
+5k77_1_V
+5k77_1_Y
+5k77_1_W
+5k77_1_Z
+4pei_1_X
+4pei_1_V
+4pei_1_W
+4pei_1_Z
+4pei_1_Y
+4a3c_1_P
+4a3e_1_P
+6lkq_1_U
+7k00_1_B
+6qdw_1_A
+2rdo_1_A
+4v48_1_A9
+4v47_1_A9
+6hcj_1_Q3
+6hcq_1_Q3
+5mmm_1_Z
+4w2e_1_W
+5j4b_1_1Y
+6cfj_1_1W
+5w4k_1_1Y
+5wit_1_1W
+6cfj_1_1Y
+6cfj_1_2W
+5j4c_1_1W
+5wis_1_1Y
+5j4c_1_1Y
+6cfj_1_2Y
+5wis_1_1W
+5j4b_1_1W
+5j4c_1_2W
+5j4b_1_2W
+5j4b_1_2Y
+5j4c_1_2Y
+5w4k_1_1W
+6nd5_1_1Y
+5wis_1_2Y
+5wit_1_2W
+5doy_1_1Y
+5w4k_1_2Y
+4y4p_1_1Y
+4z3s_1_1Y
+5doy_1_1W
+5doy_1_2Y
+6nd5_1_1W
+4z3s_1_2Y
+4z3s_1_1W
+5w4k_1_2W
+6nd5_1_2Y
+4y4p_1_2Y
+6uo1_1_2Y
+6uo1_1_2W
+4y4p_1_1W
+4z3s_1_2W
+6uo1_1_1Y
+6uo1_1_1W
+5wis_1_2W
+5wit_1_1Y
+6nd5_1_2W
+4y4p_1_2W
+5doy_1_2W
+5wit_1_2Y
+6ucq_1_1Y
+4v4i_1_Z
+6ucq_1_1X
+6ucq_1_2Y
+4w2e_1_X
+6ucq_1_2X
+6yss_1_W
+5afi_1_Y
+5uq8_1_Z
+5wdt_1_Y
+5wfs_1_Y
+6ysr_1_W
+5we4_1_Y
+6yst_1_W
+5uq7_1_Z
+5we6_1_Y
+5wfk_1_Y
+5wf0_1_Y
+6o9j_1_V
+6ysu_1_W
+3j46_1_A
+5j8b_1_Y
+5j8b_1_W
+3bbv_1_Z
+5aj0_1_BV
+5aj0_1_BW
+4wt8_1_AB
+4wt8_1_BB
+4v4j_1_Y
+4v4i_1_Y
+5uq8_1_X
+5uq7_1_X
+1jgq_1_A
+4v42_1_AA
+1jgo_1_A
+1jgp_1_A
+1ml5_1_A
+4v4j_1_W
+4v4i_1_W
+4v42_1_BA
+4wt8_1_CS
+4wt8_1_DS
+4v4j_1_X
+4v4i_1_X
+4v42_1_BB
+6uu4_1_333
+6uu0_1_333
+6uuc_1_333
+6uu2_1_333
+6b6h_1_3
+6pb4_1_3
+6d30_1_C
+6j7z_1_C
+3er9_1_D
+5kal_1_Y
+4nia_1_3
+5kal_1_Z
+4nia_1_7
+4nia_1_4
+5new_1_C
+4nia_1_U
+4nia_1_6
+4oq9_1_7
+4nia_1_1
+4oq9_1_4
+4nia_1_8
+4oq9_1_8
+4nia_1_5
+2vrt_1_E
+4nia_1_W
+4oq9_1_6
+4oq8_1_D
+4nia_1_Z
+4oq9_1_W
+4oq9_1_5
+4nia_1_2
+2vrt_1_F
+4oq9_1_U
+4oq9_1_Z
+4oq9_1_2
+4oq9_1_3
+1ddl_1_E
+4oq9_1_1
+6rt5_1_A
+6rt5_1_E
+4qu6_1_B
+6lkq_1_T
+6qdw_1_B
+3jbv_1_B
+3jbu_1_B
+2rdo_1_B
+4v48_1_A0
+4v47_1_A0
+6do8_1_B
+6dpi_1_B
+6dp9_1_B
+6dpb_1_B
+6dmn_1_B
+6dpp_1_B
+6dpk_1_B
+6dpd_1_B
+6dot_1_B
+6dok_1_B
+6dp8_1_B
+6dpl_1_B
+6dpg_1_B
+6dou_1_B
+6dpc_1_B
+6do9_1_B
+6dmv_1_B
+6dp4_1_B
+6dpn_1_B
+6doj_1_B
+6dph_1_B
+6dos_1_B
+6doo_1_B
+6dp6_1_B
+6dox_1_B
+6dp5_1_B
+6dol_1_B
+6dp1_1_B
+6doz_1_B
+6dp7_1_B
+6doq_1_B
+6dpa_1_B
+6dom_1_B
+6dog_1_B
+6dop_1_B
+6doh_1_B
+6doa_1_B
+6don_1_B
+6dov_1_B
+6dpo_1_B
+6dod_1_B
+6dob_1_B
+6dow_1_B
+6dpm_1_B
+6dpf_1_B
+6dp3_1_B
+6dp2_1_B
+6dpe_1_B
+6dpj_1_B
+6dor_1_B
+6dof_1_B
+6dp0_1_B
+6doi_1_B
+6doc_1_B
+6doe_1_B
+6n6g_1_D
+6lkq_1_S
+5h5u_1_H
+5lze_1_Y
+5lze_1_V
+5lze_1_X
+3jcj_1_G
+6o7k_1_G
+4v48_1_BA
+4v47_1_BA
+4b3r_1_W
+4b3t_1_W
+4b3s_1_W
+5o2r_1_X
+5kcs_1_1X
+6fti_1_U
+6fti_1_W
+6ftj_1_U
+6ftj_1_W
+6ftg_1_U
+6ftg_1_W
+6ole_1_T
+6om0_1_T
+6oli_1_T
+6om7_1_T
+6olf_1_T
+6w6l_1_T
+6x1b_1_D
+6x1b_1_F
+5f6c_1_C
+6i0t_1_B
+1b2m_1_C
+1b2m_1_D
+1b2m_1_E
+2uxc_1_Y
+4a3g_1_P
+4a3j_1_P
+7k00_1_5
+5mmi_1_Z
+3j9m_1_U
+6nu2_1_U
+6nu3_1_U
+5c0y_1_C
+6n6f_1_D
+4ohy_1_B
+4oi1_1_B
+4oi0_1_B
+6raz_1_Y
+5ipl_1_3
+6utw_1_333
+5ipm_1_3
+5ipn_1_3
+4ylo_1_3
+4yln_1_6
+4ylo_1_6
+4yln_1_3
+4yln_1_9
+5lzf_1_Y
+1n32_1_Z
+5zsl_1_D
+5zsd_1_C
+5zsd_1_D
+5zsl_1_E
+4nku_1_D
+4nku_1_H
+1cwp_1_E
+6qik_1_Y
+6rzz_1_Y
+6ri5_1_Y
+6qt0_1_Y
+6qtz_1_Y
+6t83_1_1B
+6t83_1_3B
+6t83_1_AA
+6t83_1_CA
+6s05_1_Y
+5jcs_1_X
+5fl8_1_X
+3erc_1_G
+6of1_1_1W
+6cae_1_1Y
+6o97_1_1W
+6of1_1_1Y
+6of1_1_2W
+6o97_1_1Y
+6nd6_1_1Y
+6cae_1_1W
+6of1_1_2Y
+6cae_1_2Y
+6nd6_1_1W
+6cae_1_2W
+6o97_1_2Y
+6nd6_1_2Y
+6o97_1_2W
+6nd6_1_2W
+6xz7_1_G
+6gz5_1_BW
+6gz3_1_BW
+1ls2_1_B
+3ep2_1_Y
+3eq3_1_Y
+4v48_1_A6
+2z9q_1_A
+4hot_1_X
+6d2z_1_C
+4tu0_1_F
+4tu0_1_G
+6r9o_1_B
+6is0_1_C
+5lzc_1_X
+5lzb_1_X
+5lzd_1_Y
+5lzc_1_Y
+5lzb_1_Y
+1gsg_1_T
+6zvi_1_D
+6sv4_1_NB
+6sv4_1_NC
+6i7o_1_NB
+5y88_1_X
+3j6x_1_IR
+3j6y_1_IR
+6tb3_1_N
+6tnu_1_N
+2uxb_1_X
+2x1f_1_B
+2x1a_1_B
+3eq3_1_D
+3ep2_1_D
+1eg0_1_M
+3eq4_1_D
+5o1y_1_B
+3jcr_1_H
+6dzi_1_H
+5zeu_1_A
+6mpi_1_W
+5mfx_1_B
+5w0m_1_J
+5bud_1_E
+5w0m_1_I
+5w0m_1_H
+4j7m_1_B
+5bud_1_D
+6a4e_1_B
+6a4e_1_D
+6hxx_1_AA
+6hxx_1_AB
+6hxx_1_AC
+6hxx_1_AD
+6hxx_1_AE
+6hxx_1_AF
+6hxx_1_AG
+6hxx_1_AH
+6hxx_1_AI
+6hxx_1_AJ
+6hxx_1_AK
+6hxx_1_AL
+6hxx_1_AM
+6hxx_1_AN
+6hxx_1_AO
+6hxx_1_AP
+6hxx_1_AQ
+6hxx_1_AR
+6hxx_1_AS
+6hxx_1_AT
+6hxx_1_AU
+6hxx_1_AV
+6hxx_1_AW
+6hxx_1_AX
+6hxx_1_AY
+6hxx_1_AZ
+6hxx_1_BA
+6hxx_1_BB
+6hxx_1_BC
+6hxx_1_BD
+6hxx_1_BE
+6hxx_1_BF
+6hxx_1_BG
+6hxx_1_BH
+6hxx_1_BI
+5odv_1_A
+5odv_1_B
+5odv_1_C
+5odv_1_D
+5odv_1_E
+5odv_1_F
+5odv_1_G
+5odv_1_H
+5odv_1_I
+5odv_1_J
+5odv_1_K
+5odv_1_L
+5odv_1_M
+5odv_1_N
+5odv_1_O
+5odv_1_P
+5odv_1_Q
+5odv_1_R
+5odv_1_S
+5odv_1_T
+5odv_1_U
+5odv_1_V
+5odv_1_W
+5odv_1_X
+6t34_1_A
+6t34_1_B
+6t34_1_C
+6t34_1_D
+6t34_1_E
+6t34_1_F
+6t34_1_G
+6t34_1_H
+6t34_1_I
+6t34_1_J
+6t34_1_K
+6t34_1_L
+6t34_1_M
+6t34_1_N
+6t34_1_O
+6t34_1_P
+6t34_1_Q
+6t34_1_R
+6t34_1_S
+6ip8_1_ZY
+6ip5_1_ZY
+6ip5_1_ZU
+6ip6_1_ZY
+6ip8_1_ZZ
+6ip6_1_ZZ
+6uu3_1_333
+6uu1_1_333
+1pn8_1_D
+3er8_1_H
+3er8_1_G
+3er8_1_F
+5o3j_1_B
+4dr7_1_B
+1i5l_1_Y
+1i5l_1_U
+4dr6_1_B
+6i2n_1_U
+4v68_1_A0
+6vyu_1_Y
+6vyw_1_Y
+6vz7_1_Y
+6vz5_1_Y
+6vz3_1_Y
+6vyy_1_Y
+6vyx_1_Y
+6vyz_1_Y
+6vz2_1_Y
+1mvr_1_1
+6vyt_1_Y
+1cgm_1_I
+3jb7_1_T
+3jb7_1_M
+3j0o_1_D
+3j0l_1_D
+3j0q_1_D
+3j0p_1_D
+5elt_1_F
+5elt_1_E
+2tmv_1_R
+5a79_1_R
+5a7a_1_R
+2om3_1_R
+2xea_1_R
+4wtl_1_T
+4wtl_1_P
+1xnq_1_W
+1x18_1_C
+1x18_1_B
+1x18_1_D
+1vq6_1_4
+4am3_1_D
+4am3_1_H
+4am3_1_I
+4lj0_1_C
+4lj0_1_D
+4lj0_1_E
+5lzy_1_HH
+4wtj_1_T
+4wtj_1_P
+4xbf_1_D
+6ow3_1_I
+6ovy_1_I
+6oy6_1_I
+6n6d_1_D
+6n6k_1_C
+6n6k_1_D
+3rtj_1_D
+1apg_1_D
+6ty9_1_M
+6tz1_1_N
+4bbl_1_Y
+4bbl_1_Z
+6sce_1_B
+6scf_1_I
+6scf_1_K
+6yud_1_K
+6yud_1_O
+6scf_1_M
+6yud_1_P
+6scf_1_L
+6yud_1_M
+6yud_1_Q
+6o6x_1_D
+4ba2_1_R
+6o6x_1_C
+6o7b_1_C
+6o6v_1_C
+6r7b_1_D
+6r9r_1_D
+6ov0_1_E
+6ov0_1_H
+6ov0_1_G
+6o6v_1_D
+6ov0_1_F
+6o7b_1_D
+5e02_1_C
+6r9r_1_E
+6r7b_1_E
+6o7i_1_I
+6o7h_1_K
+7jyy_1_F
+7jyy_1_E
+7jz0_1_F
+7jz0_1_E
+6rt6_1_A
+6rt6_1_E
+1y1y_1_P
+5zuu_1_I
+5zuu_1_G
+4peh_1_W
+4peh_1_V
+4peh_1_X
+4peh_1_Y
+4peh_1_Z
+6mkn_1_W
+4cxg_1_C
+4cxh_1_C
+1x1l_1_A
+1zc8_1_Z
+2ob7_1_D
+2ob7_1_A
+4eya_1_E
+4eya_1_F
+4eya_1_Q
+4eya_1_R
+2r1g_1_B
+4ht9_1_E
+1cvj_1_M
+6z1p_1_AB
+6z1p_1_AA
+4ii9_1_C
+5mq0_1_3
+5uk4_1_X
+5uk4_1_V
+5uk4_1_W
+5uk4_1_U
+5f6c_1_E
+4rcj_1_B
+1xnr_1_W
+6e0o_1_C
+6o75_1_D
+6o75_1_C
+6e0o_1_B
+3j06_1_R
+1r2x_1_C
+1r2w_1_C
+1eg0_1_L
+4eya_1_G
+4eya_1_H
+4eya_1_S
+4eya_1_T
+4dr4_1_V
+1ibl_1_Z
+1ibm_1_Z
+4dr5_1_V
+4d61_1_J
+1trj_1_B
+1trj_1_C
+6q8y_1_N
+6sv4_1_N
+6i7o_1_N
+5k8h_1_A
+5z4a_1_B
+3jbu_1_V
+1h2c_1_R
+1h2d_1_S
+1h2d_1_R
+6szs_1_X
+5mgp_1_X
+6enu_1_X
+6enf_1_X
+6enj_1_X
+1pvo_1_L
+1pvo_1_G
+1pvo_1_H
+1pvo_1_J
+1pvo_1_K
+2ht1_1_K
+2ht1_1_J
+6eri_1_AX
+1zc8_1_A
+1zc8_1_C
+1zc8_1_B
+1zc8_1_G
+1zc8_1_I
+1zc8_1_H
+1zc8_1_J
+4v8z_1_CX
+6kqe_1_I
+5uh8_1_I
+5vi5_1_Q
+4xln_1_T
+4xlr_1_T
+4xln_1_Q
+5i2d_1_K
+5i2d_1_V
+4xlr_1_Q
+6sty_1_C
+6sty_1_F
+2xs5_1_D
+3ok4_1_N
+3ok4_1_L
+3ok4_1_Z
+3ok4_1_4
+3ok4_1_V
+3ok4_1_X
+3ok4_1_P
+3ok4_1_H
+3ok4_1_J
+3ok4_1_R
+3ok4_1_T
+3ok4_1_2
+6n6h_1_D
+5wnt_1_B
+3b0u_1_B
+3b0u_1_A
+4x9e_1_G
+4x9e_1_H
+6z1p_1_BB
+6z1p_1_BA
+2uxd_1_X
+4qvd_1_H
+4v7e_1_AB
+3ol9_1_D
+3ol9_1_H
+3ol9_1_L
+3ol9_1_P
+3olb_1_L
+3olb_1_P
+3olb_1_D
+3olb_1_H
+3ol6_1_D
+3ol6_1_H
+3ol6_1_L
+3ol6_1_P
+3ol8_1_D
+3ol8_1_H
+3ol7_1_L
+3ol7_1_P
+3ol7_1_D
+3ol7_1_H
+3ol8_1_L
+3ol8_1_P
+1qzc_1_C
+1qzc_1_A
+6ole_1_V
+6om0_1_V
+6oli_1_V
+6om7_1_V
+6w6l_1_V
+6olf_1_V
+1mvr_1_D
+4wtm_1_T
+4wtm_1_P
+5x70_1_E
+5x70_1_G
+6gz5_1_BV
+6gz4_1_BV
+6gz3_1_BV
+6fti_1_Q
+4v7e_1_AE
+4v7e_1_AD
+4x62_1_B
+4x64_1_B
+4x65_1_B
+1xmq_1_W
+4x66_1_B
+3t1h_1_W
+3t1y_1_W
+1xmo_1_W
+4adx_1_9
+6kr6_1_B
+1zn1_1_B
+6z8k_1_X
+1cvj_1_Q
+4csf_1_U
+4csf_1_Q
+4csf_1_G
+4csf_1_M
+4csf_1_K
+4csf_1_A
+4csf_1_I
+4csf_1_S
+4csf_1_C
+4csf_1_W
+4csf_1_O
+4csf_1_E
+1cvj_1_N
+1cvj_1_O
+1cvj_1_S
+1cvj_1_P
+1cvj_1_T
+1cvj_1_R
+6th6_1_AA
+6skg_1_AA
+6skf_1_AA
+6q8y_1_M
+6i7o_1_M
+6zmw_1_W
+6ybv_1_W
+2fz2_1_D
+2xpj_1_D
+2vrt_1_H
+2vrt_1_G
+1emi_1_B
+6r9m_1_B
+4nia_1_C
+4nia_1_A
+4nia_1_H
+4nia_1_N
+4nia_1_G
+4nia_1_D
+4nia_1_B
+4nia_1_I
+4nia_1_E
+4nia_1_M
+4oq9_1_I
+4oq9_1_G
+4oq9_1_C
+4oq9_1_H
+4oq9_1_N
+4oq9_1_A
+4oq9_1_D
+4oq9_1_E
+4oq9_1_M
+4oq9_1_B
+5uhc_1_I
+1uvn_1_F
+1uvn_1_B
+1uvn_1_D
+3iy9_1_A
+4wtk_1_T
+4wtk_1_P
+1vqn_1_4
+4oav_1_C
+4oav_1_A
+3ep2_1_E
+3eq3_1_E
+3eq4_1_E
+3ep2_1_A
+3eq3_1_A
+3eq4_1_A
+3ep2_1_C
+3eq3_1_C
+3eq4_1_C
+3ep2_1_B
+3eq3_1_B
+3eq4_1_B
+4i67_1_B
+3pgw_1_R
+3pgw_1_N
+3cw1_1_X
+3cw1_1_W
+3cw1_1_V
+5it9_1_I
+6k32_1_T
+6k32_1_P
+5mmj_1_A
+5x8r_1_A
+3j2k_1_3
+3j2k_1_2
+3j2k_1_1
+3j2k_1_0
+3j2k_1_4
+3nvk_1_G
+3nvk_1_S
+2iy3_1_B
+1cwp_1_F
+5z4j_1_B
+5gmf_1_E
+5gmf_1_H
+6e4p_1_J
+5gmf_1_F
+5gmf_1_G
+5gmg_1_D
+5gmg_1_C
+6e4p_1_K
+3ie1_1_E
+3ie1_1_H
+3ie1_1_F
+4dr7_1_V
+3ie1_1_G
+3s4g_1_C
+3s4g_1_B
+2qqp_1_R
+2zde_1_E
+2zde_1_F
+2zde_1_H
+2zde_1_G
+1nb7_1_E
+1nb7_1_F
+4hos_1_X
+3p6y_1_T
+3p6y_1_V
+3p6y_1_U
+3p6y_1_Q
+3p6y_1_W
+5dto_1_B
+4cxh_1_X
+1uvj_1_F
+1uvj_1_D
+1uvj_1_E
+6kqd_1_I
+6kqd_1_S
+5uh5_1_I
+1ytu_1_F
+1ytu_1_D
+4kzz_1_J
+5t2c_1_AN
+4v5z_1_BF
+3j6b_1_E
+4v4f_1_B6
+4v4f_1_A5
+4v4f_1_A3
+4v4f_1_B0
+4v4f_1_B9
+4v4f_1_A2
+4v4f_1_A8
+4v4f_1_A1
+4v4f_1_A9
+4v4f_1_BZ
+4v4f_1_B8
+4v4f_1_B7
+4v4f_1_B5
+4v4f_1_A0
+4v4f_1_A7
+4v4f_1_A4
+4v4f_1_AZ
+4v4f_1_B3
+4v4f_1_B1
+4v4f_1_B4
+4v4f_1_A6
+4v4f_1_B2
+5flx_1_Z
+5zsb_1_C
+5zsb_1_D
+5zsn_1_D
+5zsn_1_E
+3jcr_1_N
+6gfw_1_R
+2vaz_1_A
+1qzc_1_B
+1mvr_1_C
+4v5z_1_BP
+6n6e_1_D
+4g7o_1_I
+4g7o_1_S
+5x22_1_S
+5x22_1_I
+5x21_1_I
+5uh6_1_I
+6l74_1_I
+5uh9_1_I
+2ftc_1_R
+6sag_1_R
+4udv_1_R
+2r1g_1_E
+5zsc_1_D
+5zsc_1_C
+6woy_1_I
+6wox_1_I
+6evj_1_N
+6evj_1_M
+4gkk_1_W
+4v9e_1_AG
+4v9e_1_BM
+4v9e_1_AM
+4v9e_1_AA
+4v9e_1_BA
+4v9e_1_BG
+5lzs_1_II
+6fqr_1_C
+6ha1_1_X
+5kcr_1_1X
+2r1g_1_X
+3m7n_1_Z
+3m85_1_X
+3m85_1_Z
+3m85_1_Y
+1e8s_1_C
+5wnp_1_B
+5wnv_1_B
+5yts_1_B
+1utd_1_6
+1utd_1_Z
+1utd_1_4
+1utd_1_7
+1utd_1_9
+1utd_1_5
+1utd_1_3
+1utd_1_2
+1utd_1_8
+1utd_1_1
+6n6i_1_C
+6n6i_1_D
+6n6a_1_D
+6ij2_1_F
+6ij2_1_G
+6ij2_1_H
+6ij2_1_E
+3u2e_1_D
+3u2e_1_C
+5uef_1_C
+5uef_1_D
+4x4u_1_H
+4afy_1_D
+6oy5_1_I
+6owl_1_B
+6owl_1_C
+4afy_1_C
+4lq3_1_R
+6s0m_1_C
+6gx6_1_B
+4k4s_1_D
+4k4s_1_H
+4k4t_1_H
+4k4t_1_D
+1zn1_1_C
+1zn0_1_C
+1xpu_1_G
+1xpu_1_L
+1xpr_1_L
+1xpu_1_H
+1xpo_1_K
+1xpo_1_J
+1xpu_1_J
+1xpo_1_H
+1xpr_1_J
+1xpu_1_K
+1xpr_1_K
+1xpo_1_M
+1xpo_1_L
+1xpu_1_M
+1xpr_1_M
+1xpo_1_G
+1xpr_1_H
+1xpr_1_G
+6gc5_1_F
+6gc5_1_H
+6gc5_1_G
+4v7e_1_AA
+4v7e_1_AC
+1n1h_1_B
+4ohz_1_B
+6t83_1_6B
+4gv6_1_C
+4gv6_1_B
+4gv3_1_C
+4gv3_1_B
+4gv9_1_E
+6i7o_1_L
+2a8v_1_D
+6qx3_1_G
+2xnr_1_C
+4gkj_1_W
+4v5z_1_BC
+4v5z_1_BB
+4v5z_1_BH
+3j0o_1_F
+3j0l_1_F
+3j0p_1_F
+3j0q_1_F
+3j0o_1_B
+3j0l_1_B
+3j0o_1_C
+3j0l_1_C
+3j0q_1_C
+3j0p_1_C
+3j0o_1_A
+3j0l_1_A
+3j0q_1_A
+3j0p_1_A
+1cwp_1_D
+4v5z_1_BJ
+5sze_1_C
+6wre_1_D
+6i0u_1_B
+5zsa_1_C
+5zsa_1_D
+1n34_1_Z
+3pf5_1_S
+6ppn_1_A
+6ppn_1_I
+6qdw_1_V
+5hk0_1_F
+4qm6_1_D
+4qm6_1_C
+4jzu_1_C
+4jzv_1_C
+5ytv_1_B
+4k4z_1_P
+4k4z_1_D
+4k4x_1_L
+4k4z_1_L
+4k4x_1_D
+4k4z_1_H
+4k4x_1_H
+4k4x_1_P
+1t1m_1_A
+1t1m_1_B
+4a3b_1_P
+4a3m_1_P
+6u6y_1_E
+6u6y_1_G
+6u6y_1_F
+6u6y_1_H
+6qik_1_X
+6rzz_1_X
+6ri5_1_X
+6qt0_1_X
+6qtz_1_X
+6s05_1_X
+6t83_1_BB
+6t83_1_4B
+5fl8_1_Z
+5jcs_1_Z
+5mrc_1_BB
+5mre_1_BB
+5mrf_1_BB
+6gz4_1_BW
+3j46_1_P
+3jcr_1_M
+4e6b_1_A
+4e6b_1_B
+6a6l_1_D
+4v5z_1_BS
+4v8t_1_1
+1uvi_1_D
+1uvi_1_F
+1uvi_1_E
+4m7d_1_P
+4k4u_1_D
+4k4u_1_H
+6rt7_1_E
+6rt7_1_A
+2voo_1_C
+2voo_1_D
+5k78_1_X
+5k78_1_Y
+4ylo_1_9
+4kzy_1_I
+4kzz_1_I
+4kzx_1_I
+5vyc_1_I2
+5vyc_1_I3
+5vyc_1_I5
+5vyc_1_I1
+5vyc_1_I6
+5vyc_1_I4
+6ip8_1_2M
+6ip5_1_2M
+6ip6_1_2M
+6qcs_1_M
+486d_1_G
+2r1g_1_C
+486d_1_F
+4v5z_1_B0
+4nia_1_O
+4nia_1_J
+4nia_1_K
+4nia_1_L
+4nia_1_F
+4oq9_1_K
+4oq9_1_O
+4oq9_1_J
+4oq9_1_F
+4oq9_1_L
+5tbw_1_SR
+6hhq_1_SR
+6zvi_1_H
+6sv4_1_2B
+6sv4_1_2C
+6t83_1_2B
+6t83_1_A
+6i7o_1_2B
+6r9q_1_B
+6v3a_1_SN1
+6v3b_1_SN1
+6v39_1_SN1
+6v3e_1_SN1
+1pn7_1_C
+1mj1_1_Q
+1mj1_1_R
+4dr6_1_V
+6kql_1_I
+4eya_1_M
+4eya_1_N
+4eya_1_A
+4eya_1_B
+2wj8_1_D
+2wj8_1_I
+2wj8_1_L
+2wj8_1_F
+2wj8_1_C
+2wj8_1_Q
+2wj8_1_J
+2wj8_1_P
+2wj8_1_K
+2wj8_1_E
+2wj8_1_T
+2wj8_1_B
+2wj8_1_O
+2wj8_1_N
+2wj8_1_A
+2wj8_1_H
+2wj8_1_R
+2wj8_1_M
+2wj8_1_S
+2wj8_1_G
+4e6b_1_E
+4e6b_1_F
+6p71_1_I
+3pdm_1_R
+5det_1_P
+5els_1_I
+4n2s_1_B
+4yoe_1_E
+3j0o_1_H
+3j0l_1_H
+3j0p_1_H
+3j0q_1_H
+5gxi_1_B
+3iy8_1_A
+6tnu_1_M
+5mc6_1_M
+5mc6_1_N
+4eya_1_O
+4eya_1_P
+4eya_1_C
+4eya_1_D
+6htq_1_V
+6htq_1_W
+6htq_1_U
+6uu6_1_333
+6v3a_1_V
+6v39_1_V
+5a0v_1_F
+3avt_1_T
+6d1v_1_C
+4s2x_1_B
+4s2y_1_B
+5wnu_1_B
+1zc8_1_F
+1vtm_1_R
+4v5z_1_BA
+4v5z_1_BE
+4v5z_1_BD
+4v5z_1_BG
+4v5z_1_BI
+4v5z_1_BK
+4v5z_1_BM
+4v5z_1_BL
+4v5z_1_BV
+4v5z_1_BO
+4v5z_1_BN
+4v5z_1_BQ
+4v5z_1_BR
+4v5z_1_BT
+4v5z_1_BU
+4v5z_1_BW
+4v5z_1_BY
+4v5z_1_BX
+4v5z_1_BZ
+6u9x_1_H
+6u9x_1_K
+5elk_1_R
+6okk_1_G
+4cxg_1_A
+4cxh_1_A
+6bk8_1_I
+4cxg_1_B
+4cxh_1_B
+4v5z_1_B1
+5z4d_1_B
+6o78_1_E
+6ha8_1_X
+1m8w_1_E
+1m8w_1_F
+5udi_1_B
+5udl_1_B
+5udk_1_B
+5udj_1_B
+5w5i_1_B
+5w5i_1_D
+5w5h_1_B
+5w5h_1_D
+4eya_1_K
+4eya_1_L
+4eya_1_I
+4eya_1_J
+4g9z_1_E
+4g9z_1_F
+3nma_1_B
+3nma_1_C
+6een_1_G
+6een_1_I
+6een_1_H
+4wti_1_T
+4wti_1_P
+5l3p_1_Y
+4hor_1_X
+3rzo_1_R
+2f4v_1_Z
+1qln_1_R
+2xs7_1_B
+6zvi_1_E
+6sv4_1_MC
+6sv4_1_MB
+6i7o_1_MB
+6ogy_1_M
+6ogy_1_N
+6uej_1_B
+1x18_1_A
+5ytx_1_B
+6o8w_1_U
+4g0a_1_H
+6r9p_1_B
+3koa_1_C
+4n48_1_D
+4n48_1_G
+6kug_1_B
+6ktc_1_V
+6ole_1_U
+6om0_1_U
+6olg_1_BV
+6oli_1_U
+6om7_1_U
+6w6l_1_U
+6olz_1_BV
+6olf_1_U
+5lzd_1_X
+6m7k_1_B
+3cd6_1_4
+3cma_1_5
+6n9e_1_2W
+1vqo_1_4
+1qvg_1_3
+3cme_1_5
+5lzd_1_W
+5lze_1_W
+5lzc_1_W
+5lzb_1_W
+3wzi_1_C
+1mvr_1_E
+1mvr_1_B
+1mvr_1_A
+4adx_1_0
+4adx_1_8
+1n33_1_Z
+6dti_1_W
+3d2s_1_F
+3d2s_1_H
+5mrc_1_AA
+5mre_1_AA
+5mrf_1_AA
+5fl8_1_Y
+5jcs_1_Y
+2r1g_1_A
+2r1g_1_D
+2r1g_1_F
+3eq4_1_Y
+4wkr_1_C
+4v99_1_EC
+4v99_1_AC
+4v99_1_BH
+4v99_1_CH
+4v99_1_AM
+4v99_1_DC
+4v99_1_JW
+4v99_1_EH
+4v99_1_BW
+4v99_1_FW
+4v99_1_AW
+4v99_1_BC
+4v99_1_BM
+4v99_1_IC
+4v99_1_EM
+4v99_1_ER
+4v99_1_IW
+4v99_1_JH
+4v99_1_JR
+4v99_1_AH
+4v99_1_GR
+4v99_1_IR
+4v99_1_BR
+4v99_1_CW
+4v99_1_HR
+4v99_1_FH
+4v99_1_HC
+4v99_1_DW
+4v99_1_GC
+4v99_1_JC
+4v99_1_DM
+4v99_1_EW
+4v99_1_AR
+4v99_1_CR
+4v99_1_JM
+4v99_1_CC
+4v99_1_IH
+4v99_1_FR
+4v99_1_CM
+4v99_1_IM
+4v99_1_FM
+4v99_1_FC
+4v99_1_GH
+4v99_1_HM
+4v99_1_HH
+4v99_1_DR
+4v99_1_HW
+4v99_1_GW
+4v99_1_DH
+4v99_1_GM
+6rt4_1_D
+6rt4_1_C
+6zvh_1_X
+4dwa_1_D
+6n6c_1_D
+6n6j_1_C
+6n6j_1_D
+6p7q_1_E
+6p7q_1_F
+6p7q_1_D
+6rcl_1_C
+5jju_1_C
+4ejt_1_G
+5ceu_1_C
+5ceu_1_D
+6lkq_1_W
+3qsu_1_P
+3qsu_1_R
+1n38_1_B
+4qvc_1_G
+6q1h_1_D
+6q1h_1_H
+6p7p_1_F
+6p7p_1_E
+6p7p_1_D
+6vm6_1_J
+6vm6_1_G
+6wan_1_K
+6wan_1_H
+6wan_1_G
+6wan_1_L
+6wan_1_I
+6ywo_1_F
+6wan_1_J
+4oau_1_A
+6ywo_1_E
+6ywo_1_K
+6vm6_1_I
+6vm6_1_H
+6ywo_1_I
+2a1r_1_C
+2a1r_1_D
+3gpq_1_E
+3gpq_1_F
+6o79_1_C
+6vm6_1_K
+6hyu_1_D
+1laj_1_R
+6ybv_1_K
+6mpf_1_W
+6spc_1_A
+6spe_1_A
+6fti_1_V
+6ftj_1_V
+6ftg_1_V
+4g0a_1_G
+4g0a_1_F
+4g0a_1_E
+2b2d_1_S
+5hkc_1_C
+1rmv_1_B
+4qu7_1_X
+4qu7_1_V
+4qu7_1_U
+4v5z_1_AH
+4v5z_1_AA
+4v5z_1_AB
+4v5z_1_AC
+4v5z_1_AD
+4v5z_1_AE
+4v5z_1_AF
+4v5z_1_AG
+6pmi_1_3
+6pmj_1_3
+5hjz_1_C
--- a/known_issues_reasons.txt
View file @7196427
+++ b/known_issues_reasons.txt
View file @7196427
--- a/regression.py
View file @7196427
+++ b/regression.py
View file @7196427
@@ -11,7 +11,7 @@
 #   - Use a specialised database (SILVA) : better alignments (we guess?), but two kinds of jobs
 #   - Use cmalign --small everywhere (homogeneity)
 # Moreover, --small requires --nonbanded --cyk, which means the output alignment is the optimally scored one.
-# To date, we trust Infernal as the best tool to realign RNA. Is it ?
+# To date, we trust Infernal as the best tool to realign ncRNA. Is it ?
 # Contact: louis.becquey@univ-evry.fr (PhD student), fariza.tahi@univ-evry.fr (PI)
@@ -28,7 +28,7 @@ pd.set_option('display.max_rows', None)
 LSU_set = ["RF00002", "RF02540", "RF02541", "RF02543", "RF02546"]   # From Rfam CLAN 00112
 SSU_set = ["RF00177", "RF02542",  "RF02545", "RF01959", "RF01960"]  # From Rfam CLAN 00111
-with sqlite3.connect("results/RNANet.db") as conn:
+with sqlite3.connect(os.getcwd()+"/results/RNANet.db") as conn:
     df = pd.read_sql("SELECT rfam_acc, max_len, nb_total_homol, comput_time, comput_peak_mem FROM family;", conn)
 to_remove = [ f for f in df.rfam_acc if f in LSU_set+SSU_set ]
@@ -74,7 +74,7 @@ ax.set_ylabel("Maximum length of sequences ")
 ax.set_zlabel("Computation time (s)")
 plt.subplots_adjust(wspace=0.4)
-plt.savefig("results/cmalign_jobs_performance.png")
+plt.savefig(os.getcwd()+"/results/cmalign_jobs_performance.png")
 # # ========================================================
 # # Linear Regression of max_mem as function of max_length
--- a/statistics.py
View file @7196427
+++ b/statistics.py
View file @7196427
@@ -3,7 +3,6 @@
 # This file computes additional statistics over the produced dataset.
 # Run this file if you want the base counts, pair-type counts, identity percents, etc
 # in the database.
-# This should be run from the folder where the file is (to access the database with path "results/RNANet.db")
 import getopt, os, pickle, sqlite3, shlex, subprocess, sys
 import numpy as np
@@ -22,34 +21,35 @@ from multiprocessing import Pool, Manager
 from os import path
 from tqdm import tqdm
 from collections import Counter
-from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
+from setproctitle import setproctitle
+from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions
 path_to_3D_data = "tobedefinedbyoptions"
 path_to_seq_data = "tobedefinedbyoptions"
+runDir = os.getcwd()
 res_thr = 20.0 # default: all structures
 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546")   # From Rfam CLAN 00112
 SSU_set = ("RF00177", "RF02542",  "RF02545", "RF01959", "RF01960")  # From Rfam CLAN 00111
-def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
+@trace_unhandled_exceptions
+def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
     """
     Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph.
-    See Wadley & Pyle (2007)
+    See Wadley & Pyle (2007).
+    Only unique unmapped chains with a resolution value <= the res argument (and issue = 0) are considered.
     Arguments:
-    show: True or False, call plt.show() at this end or not
-    filter_helical: None, "form", "zone", or "both"
-                    None: do not remove helical nucleotide
-                    "form": remove nucleotides if they belong to a A, B or Z form stem
-                    "zone": remove nucleotides falling in an arbitrary zone (see zone argument)
-                    "both": remove nucleotides fulfilling one or both of the above conditions
     carbon:     1 or 4, use C4' (eta and theta) or C1' (eta_prime and theta_prime)
+    show:       True or False, call plt.show() at the end or not
     sd_range:   tuple, set values below avg + sd_range[0] * stdev to 0,
                     and values above avg + sd_range[1] * stdev to avg + sd_range[1] * stdev.
                     This removes noise and cuts too high peaks, to clearly see the clusters.
+    res:        Resolution threshold: only nucleotides from structures whose resolution value
+                    is <= res are considered.
     """
-    os.makedirs("results/figures/wadley_plots/", exist_ok=True)
+    os.makedirs(runDir + "/results/figures/wadley_plots/", exist_ok=True)
     if carbon == 4:
         angle = "eta"
@@ -63,30 +63,32 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
         exit("You overestimate my capabilities !")
-    if not path.isfile(f"data/wadley_kernel_{angle}_{res}A.npz"):
+    if not path.isfile(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz"):
         # Get a worker number to position the progress bar
         global idxQueue
         thr_idx = idxQueue.get()
+        setproctitle(f"RNANet statistics.py Worker {thr_idx+1} reproduce_wadley_results(carbon={carbon})")
+
         pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False)
         # Extract the angle values of c2'-endo and c3'-endo nucleotides
-        with sqlite3.connect("results/RNANet.db") as conn:
+        with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
             df = pd.read_sql(f"""SELECT {angle}, th{angle} 
-                                 FROM nucleotide JOIN (
+                                 FROM (
-                                    SELECT chain_id FROM chain JOIN structure
+                                    SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
-                                    WHERE structure.resolution <= {res}
+                                    WHERE chain.rfam_acc = 'unmappd' AND structure.resolution <= {res} AND issue = 0
-                                 ) AS c
+                                 ) AS c NATURAL JOIN nucleotide
                                  WHERE puckering="C2'-endo" 
                                     AND {angle} IS NOT NULL 
                                     AND th{angle} IS NOT NULL;""", conn)
             c2_endo_etas = df[angle].values.tolist()
             c2_endo_thetas = df["th"+angle].values.tolist()
             df = pd.read_sql(f"""SELECT {angle}, th{angle} 
-                                 FROM nucleotide JOIN (
+                                 FROM (
-                                    SELECT chain_id FROM chain JOIN structure
+                                    SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
-                                    WHERE structure.resolution <= {res}
+                                    WHERE chain.rfam_acc = 'unmappd' AND structure.resolution <= {res} AND issue = 0
-                                 ) AS c
+                                 ) AS c NATURAL JOIN nucleotide 
                                  WHERE form = '.' 
                                     AND puckering="C3'-endo" 
                                     AND {angle} IS NOT NULL 
@@ -111,14 +113,16 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
         pbar.update(1)
         # Save the data to an archive for later use without the need to recompute
-        np.savez(f"data/wadley_kernel_{angle}_{res}A.npz",
+        np.savez(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz",
                   c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
                   c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
                   kernel_c3=f_c3, kernel_c2=f_c2)
         pbar.close()
         idxQueue.put(thr_idx)
     else:
-        f = np.load(f"data/wadley_kernel_{angle}_{res}A.npz")
+        setproctitle(f"RNANet statistics.py reproduce_wadley_results(carbon={carbon})")
+
+        f = np.load(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz")
         c2_endo_etas = f["c2_endo_e"]
         c3_endo_etas = f["c3_endo_e"]
         c2_endo_thetas = f["c2_endo_t"]
@@ -148,7 +152,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
         f_low_thr = f.mean() + sd_range[0]*f.std()
         f_cut = np.where(f > f_sup_thr, f_sup_thr, f)
         f_cut = np.where(f_cut < f_low_thr, 0, f_cut)
-        levels = [f.mean()+f.std(), f.mean()+2*f.std(), f.mean()+4*f.std()]
+        levels = [ f.mean()+f.std(), f.mean()+2*f.std(), f.mean()+4*f.std()]
         # histogram:
         fig = plt.figure()
@@ -157,7 +161,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
         ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max")
         ax.set_xlabel(xlabel)
         ax.set_ylabel(ylabel)
-        fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png")
+        fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png")
         if show:
             fig.show()
         plt.close()
@@ -168,7 +172,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
         ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True)
         ax.set_xlabel(xlabel)
         ax.set_ylabel(ylabel)
-        fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png")
+        fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png")
         if show:
             fig.show()
         plt.close()
@@ -177,10 +181,10 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
         fig = plt.figure(figsize=(5,5))
         ax = fig.gca()
         ax.scatter(x, y, s=1, alpha=0.1)
-        ax.contourf(xx, yy, f_cut, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max")
+        ax.contourf(xx, yy, f, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max")
         ax.set_xlabel(xlabel)
         ax.set_ylabel(ylabel)
-        fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png")
+        fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png")
         if show:
             fig.show()
         plt.close()
@@ -188,10 +192,13 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
 def stats_len():
     """Plots statistics on chain lengths in RNA families.
+    Uses all chains mapped to a family including copies, inferred or not.
     REQUIRES tables chain, nucleotide up to date.
     """
+    setproctitle(f"RNANet statistics.py stats_len({res_thr})")
+    
     # Get a worker number to position the progress bar
     global idxQueue
     thr_idx = idxQueue.get()
@@ -214,7 +221,7 @@ def stats_len():
     cols = []
     lengths = []
-    for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False)):
+    for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False):
         # Define a color for that family in the plot
         if f in LSU_set:
@@ -229,7 +236,7 @@ def stats_len():
             cols.append("grey")
         # Get the lengths of chains
-        with sqlite3.connect("results/RNANet.db") as conn:
+        with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
             l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain) 
                                                             FROM (
                                                                 SELECT chain_id 
@@ -239,8 +246,6 @@ def stats_len():
                                                             GROUP BY chain_id;""", warn_every=0) ]
         lengths.append(l) # list of chain lengths from the family
-        # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
-
     # Plot the figure
     fig = plt.figure(figsize=(10,3))
     ax = fig.gca()
@@ -267,7 +272,7 @@ def stats_len():
                 ncol=1, fontsize='small', bbox_to_anchor=(1.3, 0.5))
     # Save the figure
-    fig.savefig(f"results/figures/lengths_{res_thr}A.png")
+    fig.savefig(runDir + f"/results/figures/lengths_{res_thr}A.png")
     idxQueue.put(thr_idx) # replace the thread index in the queue
     # notify("Computed sequence length statistics and saved the figure.")
@@ -285,6 +290,7 @@ def format_percentage(tot, x):
 def stats_freq():
     """Computes base frequencies in all RNA families.
+    Uses all chains mapped to a family including copies, inferred or not.
     Outputs results/frequencies.csv
     REQUIRES tables chain, nucleotide up to date."""
@@ -293,17 +299,18 @@ def stats_freq():
     global idxQueue
     thr_idx = idxQueue.get()
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} stats_freq()")
+
     # Initialize a Counter object for each family
     freqs = {}
     for f in fam_list:
         freqs[f] = Counter()
     # List all nt_names happening within a RNA family and store the counts in the Counter
-    for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False)):
+    for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False):
-        with sqlite3.connect("results/RNANet.db") as conn:
+        with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
             counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0))
         freqs[f].update(counts)
-        # notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.")
     # Create a pandas DataFrame, and save it to CSV.
     df = pd.DataFrame()
@@ -311,7 +318,7 @@ def stats_freq():
         tot = sum(freqs[f].values())
         df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
     df = df.fillna(0)
-    df.to_csv("results/frequencies.csv")    
+    df.to_csv(runDir + "/results/frequencies.csv")    
     idxQueue.put(thr_idx) # replace the thread index in the queue
     # notify("Saved nucleotide frequencies to CSV file.")
@@ -327,11 +334,13 @@ def parallel_stats_pairs(f):
     global idxQueue
     thr_idx = idxQueue.get()
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} p_stats_pairs({f})")
+
     chain_id_list = mappings_list[f]
     data = []
     sqldata = []
     for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False):
-        with sqlite3.connect("results/RNANet.db") as conn:
+        with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
             # Get comma separated lists of basepairs per nucleotide
             interactions = pd.DataFrame(
                             sql_ask_database(conn, 
@@ -398,7 +407,7 @@ def parallel_stats_pairs(f):
         data.append(expanded_list)
     # Update the database
-    with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
+    with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn:
         conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
         sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
                                 pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?, 
@@ -416,8 +425,8 @@ def parallel_stats_pairs(f):
     # Create an output DataFrame
     f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f])
-    f_df.to_csv(f"data/{f}_counts.csv")
+    f_df.to_csv(runDir + f"/data/{f}_counts.csv")
-    expanded_list.to_csv(f"data/{f}_pairs.csv")
+    expanded_list.to_csv(runDir + f"/data/{f}_pairs.csv")
     idxQueue.put(thr_idx) # replace the thread index in the queue
@@ -430,28 +439,34 @@ def to_dist_matrix(f):
     global idxQueue
     thr_idx = idxQueue.get()
-    # notify(f"Computing {f} distance matrix from alignment...")
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_dist_matrix({f})")
-    command = f"esl-alipid --rna --noheader --informat stockholm {f}_3d_only.stk"
     # Prepare a file
     with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file:
         al = AlignIO.read(al_file, "fasta")
         names = [ x.id for x in al if '[' in x.id ]
         al = al[-len(names):]
-    with open(f + "_3d_only.stk", "w") as only_3d:
+    with open(path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk", "w") as only_3d:
+        try:
             only_3d.write(al.format("stockholm"))
+        except ValueError as e:
+            warn(e)
     del al
+    subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", "stockholm",  path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"])
     # Prepare the job
-    process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
+    process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"), 
+                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     id_matrix = np.zeros((len(names), len(names)))
     pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False)
-    while process.poll() is None:
+    cnt = 0
-        output = process.stdout.readline()
+    while not cnt or process.poll() is None:
+        output = process.stdout.read()
         if output:
             lines = output.strip().split(b'\n')
             for l in lines:
+                cnt += 1
                 line = l.split()
                 s1 = line[0].decode('utf-8')
                 s2 = line[1].decode('utf-8')
@@ -460,9 +475,14 @@ def to_dist_matrix(f):
                 id2 = names.index(s2)
                 id_matrix[id1, id2] = float(score)
                 pbar.update(1)
+    if cnt != len(names)*(len(names)-1)*0.5:
+        warn(f"{f} got {cnt} updates on {len(names)*(len(names)-1)*0.5}")
+    if process.poll() != 0:
+        l = process.stderr.read().strip().split(b'\n')
+        warn("\n".join([ line.decode('utf-8') for line in l ]))
     pbar.close()
-    subprocess.run(["rm", "-f", f + "_3d_only.stk"])
+    subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"])
     np.save("data/"+f+".npy", id_matrix)
     idxQueue.put(thr_idx) # replace the thread index in the queue
     return 0
@@ -471,21 +491,26 @@ def seq_idty():
     """Computes identity matrices for each of the RNA families.
     REQUIRES temporary results files in data/*.npy
-    REQUIRES tables chain, family un to date."""
+    REQUIRES tables chain, family up to date."""
     # load distance matrices
+    fams_to_plot = [ f for f in famlist if f not in ignored ]
     fam_arrays = []
-    for f in famlist:
+    for f in fams_to_plot:
         if path.isfile("data/"+f+".npy"):
-            fam_arrays.append(np.load("data/"+f+".npy"))
+            fam_arrays.append(np.load("data/"+f+".npy") / 100.0)  # normalize percentages in [0,1]
         else:
-            fam_arrays.append([])
+            warn("data/"+f+".npy not found !")
+            fam_arrays.append(np.array([]))
     # Update database with identity percentages
-    conn = sqlite3.connect("results/RNANet.db")
+    conn = sqlite3.connect(runDir + "/results/RNANet.db")
-    for f, D in zip(famlist, fam_arrays):
+    for f, D in zip(fams_to_plot, fam_arrays):
         if not len(D): continue
-        a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
+        if D.shape[0] > 1:
+            a = np.sum(D) * 2 / D.shape[0] / (D.shape[0] - 1)    # SUM(D) / (n(n-1)/2)
+        else:
+            a = D[0][0]
         conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
     conn.commit()
     conn.close()
@@ -495,10 +520,11 @@ def seq_idty():
     axs = axs.ravel()
     [axi.set_axis_off() for axi in axs]
     im = "" # Just to declare the variable, it will be set in the loop
-    for f, D, ax in zip(famlist, fam_arrays, axs):
+    for f, D, ax in zip(fams_to_plot, fam_arrays, axs):
-        if not len(D): continue
-        if D.shape[0] > 2:  # Cluster only if there is more than 2 sequences to organize
         D = D + D.T         # Copy the lower triangle to upper, to get a symetrical matrix
+        if D.shape[0] > 2:  # Cluster only if there is more than 2 sequences to organize
+            D = 1.0 - D
+            np.fill_diagonal(D, 0.0)
             condensedD = squareform(D)
             # Compute basic dendrogram by Ward's method
@@ -507,15 +533,20 @@ def seq_idty():
             # Reorganize rows and cols
             idx1 = Z['leaves']
-            D = D[idx1,:]
+            D = D[idx1[::-1],:]
             D = D[:,idx1[::-1]]
-        im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
+            D = 1.0 - D
-        ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
+        elif D.shape[0] == 2:
+            np.fill_diagonal(D, 1.0) # the diagonal has been ignored until now
+        ax.text(np.floor(D.shape[0]/2.0)-(0.5 if not D.shape[0]%2 else 0), -0.5, f + "\n(" + str(D.shape[0]) + " chains)", 
+                fontsize=9, horizontalalignment = 'center', verticalalignment='bottom')
+        im = ax.matshow(D, vmin=0, vmax=1)
+
     fig.tight_layout()
-    fig.subplots_adjust(wspace=0.1, hspace=0.3)
+    fig.subplots_adjust(hspace=0.3, wspace=0.1)
-    fig.colorbar(im, ax=axs[-1], shrink=0.8)
+    fig.colorbar(im, ax=axs[-4], shrink=0.8)
-    fig.savefig(f"results/figures/distances.png")
+    fig.savefig(runDir + f"/results/figures/distances.png")
-    notify("Computed all identity matrices and saved the figure.")
+    print("> Computed all identity matrices and saved the figure.", flush=True)
 def stats_pairs():
     """Counts occurrences of intra-chain base-pair types in RNA families
@@ -523,6 +554,8 @@ def stats_pairs():
     Creates a temporary results file in data/pair_counts.csv, and a results file in results/pairings.csv.
     REQUIRES tables chain, nucleotide up-to-date.""" 
+    setproctitle(f"RNANet statistics.py stats_pairs()")
+
     def line_format(family_data):
         return family_data.apply(partial(format_percentage, sum(family_data)))
@@ -530,12 +563,12 @@ def stats_pairs():
         results = []
         allpairs = []
         for f in fam_list:
-            newpairs = pd.read_csv(f"data/{f}_pairs.csv", index_col=0)
+            newpairs = pd.read_csv(runDir + f"/data/{f}_pairs.csv", index_col=0)
-            fam_df = pd.read_csv(f"data/{f}_counts.csv", index_col=0)
+            fam_df = pd.read_csv(runDir + f"/data/{f}_counts.csv", index_col=0)
             results.append(fam_df)
             allpairs.append(newpairs)
-            subprocess.run(["rm", "-f", f"data/{f}_pairs.csv"])
+            subprocess.run(["rm", "-f", runDir + f"/data/{f}_pairs.csv"])
-            subprocess.run(["rm", "-f", f"data/{f}_counts.csv"])
+            subprocess.run(["rm", "-f", runDir + f"/data/{f}_counts.csv"])
         all_pairs = pd.concat(allpairs)
         df = pd.concat(results).fillna(0)
         df.to_csv("data/pair_counts.csv")
@@ -573,14 +606,14 @@ def stats_pairs():
     crosstab = crosstab[["AU", "GC", "Wobble", "Other"]]
     # Save to CSV
-    df.to_csv("results/pair_types.csv")
+    df.to_csv(runDir + "/results/pair_types.csv")
     # Plot barplot of overall types
     ax = crosstab.plot(figsize=(8,5), kind='bar', stacked=True, log=False, fontsize=13)
     ax.set_ylabel("Number of observations (millions)", fontsize=13)
     ax.set_xlabel(None)
     plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
-    plt.savefig("results/figures/pairings.png")
+    plt.savefig(runDir + "/results/figures/pairings.png")
     notify("Computed nucleotide statistics and saved CSV and PNG file.")
@@ -589,7 +622,9 @@ def per_chain_stats():
     REQUIRES tables chain, nucleotide up to date. """
-    with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
+    setproctitle(f"RNANet statistics.py per_chain_stats()")
+
+    with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn:
         # Compute per-chain nucleotide frequencies
         df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn)
         df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64)
@@ -600,25 +635,36 @@ def per_chain_stats():
         conn.execute('pragma journal_mode=wal')
         sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;",
                           many=True, data=list(df.to_records(index=False)), warn_every=10)
-    notify("Updated the database with per-chain base frequencies")
+    print("> Updated the database with per-chain base frequencies", flush=True)
 def general_stats():
     """
     Number of structures as function of the resolution threshold
     Number of Rfam families as function of the resolution threshold
     """
-    with sqlite3.connect("results/RNANet.db") as conn:
+
-        df_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
+    setproctitle(f"RNANet statistics.py general_stats()")
+
+    reqs = [
+        # unique unmapped chains with no issues
+        """ SELECT distinct pdb_id, chain_name, exp_method, resolution
             FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
-                                        WHERE rfam_acc = 'unmappd' AND ISSUE=0;""", conn)
+            WHERE rfam_acc = 'unmappd' AND ISSUE=0;""",
-        df_mapped_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
+
+        # unique mapped chains with no issues
+        """ SELECT distinct pdb_id, chain_name, exp_method, resolution
             FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
-                                            WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
+            WHERE rfam_acc != 'unmappd' AND ISSUE=0;""",
-        df_mapped_copies = pd.read_sql(f"""SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
+
+        # mapped chains with no issues
+        """ SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
             FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
-                                            WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
+            WHERE rfam_acc != 'unmappd' AND ISSUE=0;""",
-        df_inferred_only_unique = pd.read_sql(f"""SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
+
-                                                    FROM (SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
+        # mapped chains with no issues that are all inferred
+        """ SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
+            FROM (
+                SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
                 FROM chain 
                 JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy 
                         FROM chain 
@@ -627,8 +673,105 @@ def general_stats():
                 ) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name 
                 WHERE r.redundancy=r.inf_redundancy AND rfam_acc != 'unmappd' and issue=0
             ) AS c
-                                                    JOIN structure ON c.structure_id=structure.pdb_id;""", conn)
+            JOIN structure ON c.structure_id=structure.pdb_id;""",
-    print("> found", len(df_inferred_only_unique.index), "chains which are mapped only by inference using BGSU NR Lists.")
+
+        # Number of mapped chains (not inferred)
+        """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0);""",
+
+        # Number of unique mapped chains (not inferred)
+        """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0);""",
+
+        # Number of mapped chains (inferred)
+        """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1);""",
+
+        # Number of unique mapped chains (inferred)
+        """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1);""",
+
+        # Number of mapped chains inferred once
+        """SELECT count(*) FROM (
+                SELECT structure_id, chain_name, COUNT(DISTINCT rfam_acc) as c 
+                FROM chain where rfam_acc!='unmappd' and inferred=1 
+                GROUP BY structure_id, chain_name
+            ) WHERE c=1;""",
+
+        # Number of mapped chains inferred twice
+        """select count(*) from (
+                select structure_id, chain_name, count(distinct rfam_acc) as c 
+                from chain where rfam_acc!='unmappd' and inferred=1 
+                group by structure_id, chain_name
+            ) where c=2;""",
+
+        # Number of mapped chains inferred 3 times or more
+        """select count(*) from (
+                select structure_id, chain_name, count(distinct rfam_acc) as c 
+                from chain where rfam_acc!='unmappd' and inferred=1 
+                group by structure_id, chain_name
+            ) where c>2;""",
+
+        # Number of chains both mapped with and without inference
+        """ SELECT COUNT(*) FROM (
+                SELECT structure_id, chain_name, sum(inferred) AS s, COUNT(rfam_acc) AS c 
+                FROM chain 
+                WHERE rfam_acc!='unmappd' 
+                GROUP BY structure_id, chain_name
+            ) 
+            WHERE s < c AND s > 0;""",
+        
+        # Number of mapped chains (total)
+        """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd');""",
+
+        # Number of unique mapped chains
+        """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd');""",
+
+        # Number of unmapped chains
+        """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc = 'unmappd');""",
+        
+        # Number of mapped chains without issues (not inferred)
+        """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0 AND issue = 0);""",
+
+        # Number of unique mapped chains without issues (not inferred)
+        """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0 AND issue = 0);""",
+
+        # Number of mapped chains without issues (inferred)
+        """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1 AND issue=0);""",
+
+        # Number of unique mapped chains without issues (inferred)
+        """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1 AND issue=0);""",
+
+        # Number of mapped chains without issues (total)
+        """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND issue=0);""",
+
+        # Number of unique mapped chains without issues
+        """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND issue=0);""",
+
+        # Number of unmapped chains without issues
+        """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc = 'unmappd' AND issue=0);"""
+    ]
+
+    answers = []
+    with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+        for r in reqs:
+            answers.append(pd.read_sql(r, conn))
+    df_unique = answers[0]
+    df_mapped_unique = answers[1]
+    df_mapped_copies = answers[2]
+    df_inferred_only_unique = answers[3]
+    print()
+    print("> found", answers[4].iloc[0][0], f"chains ({answers[5].iloc[0][0]} unique chains) that are mapped thanks to Rfam. Removing chains with issues, only {answers[15].iloc[0][0]} ({answers[16].iloc[0][0]} unique)")
+    if answers[4].iloc[0][0] != answers[5].iloc[0][0]:
+        print("\t> This happens because different parts of the same chain can be mapped to different families.")
+    print("> found", answers[6].iloc[0][0], f"chains ({answers[7].iloc[0][0]} unique chains) that are mapped by inferrence. Removing chains with issues, only {answers[17].iloc[0][0]} ({answers[18].iloc[0][0]} unique).")
+    print("\t> ", answers[8].iloc[0][0], "chains are mapped only once,")
+    print("\t> ", answers[9].iloc[0][0], "are mapped to 2 families,")
+    print("\t> ", answers[10].iloc[0][0], "are mapped to 3 or more.")
+    print("> Among them,", answers[11].iloc[0][0], "chains are mapped both with families found on Rfam and by inferrence.")
+    if answers[11].iloc[0][0]:
+        print("\t> this is normal if you used option -f (--full-inference). Otherwise, there might be a problem.")
+    print("> TOTAL:", answers[12].iloc[0][0], f"chains ({answers[13].iloc[0][0]} unique chains) mapped to a family. Removing chains with issues, only {answers[19].iloc[0][0]} ({answers[20].iloc[0][0]} unique).")
+    print("> TOTAL:", answers[14].iloc[0][0], f"unmapped chains. Removing chains with issues, {answers[21].iloc[0][0]}.")
+    if answers[14].iloc[0][0]:
+        print("\t> this is normal if you used option --no-homology. Otherwise, there might be a problem.")
+    print()
     ##########################################
     # plot N = f(resolution, exp_method)
@@ -642,7 +785,7 @@ def general_stats():
     df_inferred_only_unique.sort_values('resolution', inplace=True, ignore_index=True)
     df_mapped_copies.sort_values('resolution', inplace=True, ignore_index=True)
     max_res = max(df_unique.resolution)
-    max_structs = len(df_mapped_copies.index.tolist())
+    max_structs = max(len(df_mapped_copies.index), len(df_unique.index))
     colors = np.linspace(0,1,1+len(methods))
     plt.xticks( np.arange(0, max_res+2, 2.0).tolist(),  np.arange(0, max_res+2, 2.0).tolist() )
@@ -654,7 +797,7 @@ def general_stats():
     axs[0][0].set_ylabel("ALL", fontsize=14)
     axs[0][0].set_title("Number of unique RNA chains", fontsize=14)
     axs[0][0].set_ylim((0, max_structs * 1.05))
-    axs[0][0].legend(loc="best", fontsize=14)
+    axs[0][0].legend(loc="lower right", fontsize=14)
     axs[0][1].grid(axis='y', ls='dotted', lw=1)
     axs[0][1].set_yticklabels([])
@@ -663,9 +806,9 @@ def general_stats():
     axs[0][1].hist(df_inferred_only_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='only by inference')
     axs[0][1].text(0.95*max_res, 0.95*len(df_mapped_unique.resolution), "%d " %  len(df_mapped_unique.resolution), 
                          horizontalalignment='right', verticalalignment='top', fontsize=14)
-    axs[0][1].set_title("Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14)
+    axs[0][1].set_title(r"Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14)
     axs[0][1].set_ylim((0, max_structs * 1.05))
-    axs[0][1].legend(loc="best", fontsize=14)
+    axs[0][1].legend(loc="upper left", fontsize=14)
     axs[0][2].grid(axis='y', ls='dotted', lw=1)
     axs[0][2].set_yticklabels([])
@@ -675,7 +818,7 @@ def general_stats():
     axs[0][2].text(0.95*max_res, 0.95*len(df_mapped_copies.resolution), "%d " %  len(df_mapped_copies.resolution), 
                          horizontalalignment='right', verticalalignment='top', fontsize=14)
     axs[0][2].set_title("Number of RNA chains mapped to a\nfamily (with copies)", fontsize=14)
-    axs[0][2].legend(loc="right", fontsize=14)
+    axs[0][2].legend(loc="upper left", fontsize=14)
     axs[0][2].set_ylim((0, max_structs * 1.05))
     for i,m in enumerate(methods):
@@ -683,7 +826,7 @@ def general_stats():
         df_mapped_unique_m = df_mapped_unique[df_mapped_unique.exp_method == m]
         df_inferred_only_unique_m = df_inferred_only_unique[df_inferred_only_unique.exp_method == m]
         df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m]
-        max_structs = len(df_mapped_copies_m.resolution.tolist())
+        max_structs = max(len(df_mapped_copies_m.index), len(df_unique_m.index))
         print("> found", max_structs, "structures with method", m, flush=True)
         axs[1+i][0].grid(axis='y', ls='dotted', lw=1)
@@ -693,7 +836,7 @@ def general_stats():
                          horizontalalignment='right', verticalalignment='top', fontsize=14)
         axs[1+i][0].set_ylim((0, max_structs * 1.05))
         axs[1+i][0].set_ylabel(m, fontsize=14)
-        axs[1+i][0].legend(loc="best", fontsize=14)
+        axs[1+i][0].legend(loc="lower right", fontsize=14)
         axs[1+i][1].grid(axis='y', ls='dotted', lw=1)
         axs[1+i][1].set_yticklabels([])
@@ -703,7 +846,7 @@ def general_stats():
         axs[1+i][1].text(0.95*max_res, 0.95*len(df_mapped_unique_m.resolution), "%d " %  len(df_mapped_unique_m.resolution), 
                          horizontalalignment='right', verticalalignment='top', fontsize=14)
         axs[1+i][1].set_ylim((0, max_structs * 1.05))
-        axs[1+i][1].legend(loc="best", fontsize=14)
+        axs[1+i][1].legend(loc="upper left", fontsize=14)
         axs[1+i][2].grid(axis='y', ls='dotted', lw=1)
         axs[1+i][2].set_yticklabels([])
@@ -713,7 +856,7 @@ def general_stats():
         axs[1+i][2].text(0.95*max_res, 0.95*len(df_mapped_copies_m.resolution), "%d " %  len(df_mapped_copies_m.resolution), 
                          horizontalalignment='right', verticalalignment='top', fontsize=14)
         axs[1+i][2].set_ylim((0, max_structs * 1.05))
-        axs[1+i][2].legend(loc="right", fontsize=14)
+        axs[1+i][2].legend(loc="upper left", fontsize=14)
     axs[-1][0].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
     axs[-1][1].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
@@ -722,7 +865,7 @@ def general_stats():
     fig.suptitle("Number of RNA chains by experimental method and resolution", fontsize=16)
     fig.subplots_adjust(left=0.07, right=0.98, wspace=0.05, 
                         hspace=0.05, bottom=0.05, top=0.92)
-    fig.savefig("results/figures/resolutions.png")
+    fig.savefig(runDir + "/results/figures/resolutions.png")
     plt.close()
     ##########################################
@@ -765,7 +908,7 @@ def general_stats():
     fig.suptitle("Number of RNA families used by experimental method and resolution", fontsize=16)
     fig.subplots_adjust(left=0.05, right=0.98, wspace=0.05, 
                         hspace=0.05, bottom=0.12, top=0.84)
-    fig.savefig("results/figures/Nfamilies.png")
+    fig.savefig(runDir + "/results/figures/Nfamilies.png")
     plt.close()
 def log_to_pbar(pbar):
@@ -776,8 +919,10 @@ def log_to_pbar(pbar):
 if __name__ == "__main__":
     # parse options
+    DELETE_OLD_DATA = False
+    DO_WADLEY_ANALYSIS = False
     try:
-        opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ])
+        opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "resolution=", "3d-folder=", "seq-folder=" ])
     except getopt.GetoptError as err:
         print(err)
         sys.exit(2)
@@ -795,6 +940,7 @@ if __name__ == "__main__":
                     "\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.")
             print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:"
                     "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
+            print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything")
             sys.exit()
         elif opt == '--version':
             print("RNANet statistics 1.1 beta")
@@ -810,25 +956,37 @@ if __name__ == "__main__":
             path_to_seq_data = path.abspath(arg)
             if path_to_seq_data[-1] != '/':
                 path_to_seq_data += '/'
+        elif opt=='--from-scratch':
+            DELETE_OLD_DATA = True
+            DO_WADLEY_ANALYSIS = True
+            subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
+        elif opt=='--wadley':
+            DO_WADLEY_ANALYSIS = True
     # Load mappings
     print("Loading mappings list...")
-    with sqlite3.connect("results/RNANet.db") as conn:
+    with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
         fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
         mappings_list = {}
         for k in fam_list:
-            mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;") ]
+            mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain JOIN structure ON chain.structure_id=structure.pdb_id WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};") ]
     # List the families for which we will compute sequence identity matrices
-    with sqlite3.connect("results/RNANet.db") as conn:
+    with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
-        famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
+        famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
-        ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
+        ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains < 3 ORDER BY rfam_acc ASC;") ]
+        n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0]
     if len(ignored):
         print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
+    if DELETE_OLD_DATA:
+        for f in fam_list:
+            subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"])
+
+
     # Prepare the multiprocessing execution environment
-    nworkers = max(read_cpu_number()-1, 32)
+    nworkers = min(read_cpu_number()-1, 32)
     thr_idx_mgr = Manager()
     idxQueue = thr_idx_mgr.Queue()
     for i in range(nworkers):
@@ -836,14 +994,15 @@ if __name__ == "__main__":
     # Define the tasks
     joblist = []
-    # joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0)))   # res threshold is 4.0 Angstroms by default
+    if n_unmapped_chains and DO_WADLEY_ANALYSIS:
-    # joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0)))   #
+        joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 20.0)))   # res threshold passed here is 20.0 Angstroms (the previous, commented-out call used 4.0)
+        joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 20.0)))   #
     joblist.append(Job(function=stats_len)) # Computes figures
-    # joblist.append(Job(function=stats_freq)) # updates the database
+    joblist.append(Job(function=stats_freq)) # updates the database
-    # for f in famlist:
+    for f in famlist:
-    #     joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
+        joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
-    #     if f not in ignored:
+        if f not in ignored:
-    #         joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database
+            joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database
     p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
     pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)
@@ -867,7 +1026,8 @@ if __name__ == "__main__":
     print()
     # finish the work after the parallel portions
-    # per_chain_stats()
+    per_chain_stats()
-    # seq_idty()
+    seq_idty()
-    # stats_pairs()
+    stats_pairs()
+    if n_unmapped_chains:
         general_stats()