Louis BECQUEY

Revision 1 for Bioinformatics completed

......@@ -13,3 +13,4 @@ esl*
# environment stuff
.vscode/
*.pyc
__pycache__/
\ No newline at end of file
......
......@@ -94,6 +94,8 @@ The detailed list of options is below:
-h [ --help ] Print this help message
--version Print the program version
-f [ --full-inference ] Infer new 3D->family mappings even if Rfam already provides some. Yields more copies of chains
mapped to different families.
-r 4.0 [ --resolution=4.0 ] Maximum 3D structure resolution to consider an RNA chain.
-s Run statistics computations after completion
--extract Extract the portions of 3D RNA chains to individual mmCIF files.
......@@ -105,7 +107,7 @@ The detailed list of options is below:
RNAcifs/ Full structures containing RNA, in mmCIF format
rna_mapped_to_Rfam/ Extracted 'pure' RNA chains
datapoints/ Final results in CSV file format.
--seq-folder=… Path to a folder to store the sequence and alignment files.
--seq-folder=… Path to a folder to store the sequence and alignment files. Subfolders will be:
rfam_sequences/fasta/ Compressed hits to Rfam families
realigned/ Sequences, covariance models, and alignments by family
--no-homology Do not try to compute PSSMs and do not align sequences.
......@@ -117,11 +119,12 @@ The detailed list of options is below:
--update-homologous Re-download Rfam and SILVA databases, realign all families, and recompute all CSV files
--from-scratch Delete database, local 3D and sequence files, and known issues, and recompute.
--archive Create a tar.gz archive of the datapoints text files, and update the link to the latest archive
--no-logs Do not save per-chain logs of the numbering modifications
```
Typical usage:
```
nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &
nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &
```
## Post-computation task: estimate quality
......
#!/usr/bin/python3.8
import Bio
import concurrent.futures
import getopt
import gzip
import io
import json
import numpy as np
import os
import pandas as pd
import concurrent.futures, getopt, gzip, io, json, os, pickle, psutil, re, requests, signal, sqlalchemy, sqlite3, subprocess, sys, time, traceback, warnings
from Bio import AlignIO, SeqIO
from Bio.PDB import MMCIFParser
from Bio.PDB.mmcifio import MMCIFIO
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from Bio.PDB.PDBExceptions import PDBConstructionWarning, BiopythonWarning
from Bio.PDB.Dice import ChainSelector
from Bio.Alphabet import generic_rna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment, AlignInfo
from collections import OrderedDict, defaultdict
import pickle
import psutil
import re
import requests
import signal
import sqlalchemy
import sqlite3
import subprocess
import sys
import time
import traceback
import warnings
from functools import partial, wraps
from os import path, makedirs
from multiprocessing import Pool, Manager, set_start_method
from multiprocessing import Pool, Manager
from time import sleep
from tqdm import tqdm
from setproctitle import setproctitle
def trace_unhandled_exceptions(func):
@wraps(func)
def wrapped_func(*args, **kwargs):
......@@ -36,10 +43,11 @@ def trace_unhandled_exceptions(func):
print(s)
return wrapped_func
pd.set_option('display.max_rows', None)
sqlite3.enable_callback_tracebacks(True)
sqlite3.register_adapter(np.int64, lambda val: int(val)) # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int
sqlite3.register_adapter(np.float64, lambda val: float(val)) # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int
sqlite3.register_adapter(np.float64, lambda val: float(val)) # Tell Sqlite what to do with <class numpy.float64> objects ---> convert to float
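# Note: numpy scalar types coming out of pandas (e.g. numpy.int64) may not be accepted as SQL
# parameters and can raise sqlite3.InterfaceError; these adapters convert them to plain Python
# int/float before binding.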
m = Manager()
running_stats = m.list()
......@@ -52,11 +60,14 @@ validsymb = '\U00002705'
warnsymb = '\U000026A0'
errsymb = '\U0000274C'
LSU_set = {"RF00002", "RF02540", "RF02541", "RF02543", "RF02546"} # From Rfam CLAN 00112
SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam CLAN 00111
LSU_set = {"RF00002", "RF02540", "RF02541",
"RF02543", "RF02546"} # From Rfam CLAN 00112
SSU_set = {"RF00177", "RF02542", "RF02545",
"RF01959", "RF01960"} # From Rfam CLAN 00111
no_nts_set = set()
weird_mappings = set()
class SelectivePortionSelector(object):
"""Class passed to MMCIFIO to select some chain portions in an MMCIF file.
......@@ -101,7 +112,7 @@ class SelectivePortionSelector(object):
return 1
class BufferingSummaryInfo(AlignInfo.SummaryInfo):
class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo):
def get_pssm(self, family, index):
"""Create a position specific score matrix object for the alignment.
......@@ -128,7 +139,7 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo):
score_dict[this_residue] = 1.0
pssm_info.append(('*', score_dict))
return AlignInfo.PSSM(pssm_info)
return Bio.Align.AlignInfo.PSSM(pssm_info)
class Chain:
......@@ -187,11 +198,11 @@ class Chain:
with warnings.catch_warnings():
# Ignore the PDB problems. This mostly warns that some chain is discontinuous.
warnings.simplefilter('ignore', PDBConstructionWarning)
warnings.simplefilter('ignore', BiopythonWarning)
warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
# Load the whole mmCIF into a Biopython structure object:
mmcif_parser = MMCIFParser()
mmcif_parser = Bio.PDB.MMCIFParser()
try:
s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif")
except ValueError as e:
......@@ -212,7 +223,7 @@ class Chain:
sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm)
# Save that selection on the mmCIF object s to file
ioobj = MMCIFIO()
ioobj = Bio.PDB.mmcifio.MMCIFIO()
ioobj.set_structure(s)
ioobj.save(self.file, sel)
......@@ -253,7 +264,7 @@ class Chain:
# Create the Pandas DataFrame for the nucleotides of the right chain
nts = json_object["nts"] # sub-json-object
df = pd.DataFrame(nts) # conversion to dataframe
df = df[ df.chain_name == self.pdb_chain_id ] # keeping only this chain's nucleotides
df = df[df.chain_name == self.pdb_chain_id] # keeping only this chain's nucleotides
# Assert nucleotides of the chain are found
if df.empty:
......@@ -266,12 +277,12 @@ class Chain:
# Remove low-pertinence or undocumented descriptors, convert angle values
cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta",
"epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base",
"v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering" ]
"v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering"]
df = df[cols_we_keep]
df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians
'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] *= np.pi/180.0
df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4', # mapping [-pi, pi] into [0, 2pi]
'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] %= (2.0*np.pi)
df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians
'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] *= np.pi/180.0
df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # mapping [-pi, pi] into [0, 2pi]
'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] %= (2.0*np.pi)
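# Worked example: an angle of -60 degrees becomes -pi/3 after the conversion to radians,
# then 5*pi/3 after the modulo, so all angles end up in [0, 2*pi) instead of [-pi, pi].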
except KeyError as e:
warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True)
......@@ -295,14 +306,14 @@ class Chain:
# Duplicate residue numbers : shift numbering
while True in df.duplicated(['nt_resnum']).values:
i = df.duplicated(['nt_resnum']).values.tolist().index(True)
duplicates = df[df.nt_resnum == df.iloc[i,1]]
duplicates = df[df.nt_resnum == df.iloc[i, 1]]
n_dup = len(duplicates.nt_resnum)
index_last_dup = duplicates.index_chain.iloc[-1] - 1
if self.mapping is not None:
self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}")
try:
if i > 0 and index_last_dup +1 < len(df.index) and df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
if i > 0 and index_last_dup + 1 < len(df.index) and df.iloc[i, 1] == df.iloc[i-1, 1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
# The redundant nts are consecutive in the chain (at the beginning at least), and there is a gap at the end
if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
......@@ -314,15 +325,15 @@ class Chain:
else:
# We solve the problem continuous component by continuous component
for j in range(1, n_dup+1):
if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
df.iloc[i+j-1,1] += 1
if duplicates.iloc[j, 0] == 1 + duplicates.iloc[j-1, 0]: # continuous
df.iloc[i+j-1, 1] += 1
else:
break
elif df.iloc[i,1] == df.iloc[i-1,1]:
elif df.iloc[i, 1] == df.iloc[i-1, 1]:
# Common 4v9q-DV case (and similar ones): e.g. the chain contains 17 and 17A, which are both read as 17 by DSSR.
# Solution: we shift the numbering of 17A (to 18) and of the following residues.
df.iloc[i:, 1] += 1
elif duplicates.iloc[0,0] == 1 and df.iloc[i,0] == 3:
elif duplicates.iloc[0, 0] == 1 and df.iloc[i, 0] == 3:
# 4wzo_1_1J case, there is a residue numbered -1 and read as 1 before the number 0.
df.iloc[1:, 1] += 1
df.iloc[0, 1] = 0
......@@ -340,12 +351,16 @@ class Chain:
# Search for ligands at the end of the selection
# Drop ligands detected as residues by DSSR, using several markers
while ( len(df.index_chain) and df.iloc[-1,2] not in ["A", "C", "G", "U"] and (
(df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all()
or (df.iloc[[-1]].puckering=='').any()
while (
len(df.index_chain) and df.iloc[-1, 2] not in ["A", "C", "G", "U"]
and (
(df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon",
"zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all()
or (df.iloc[[-1]].puckering == '').any()
)
or ( len(df.index_chain) >= 2 and df.iloc[-1,1] > 50 + df.iloc[-2,1] ) # large nt_resnum gap between the two last residues
or ( len(df.index_chain) and df.iloc[-1,2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"] )
# large nt_resnum gap between the last two residues
or (len(df.index_chain) >= 2 and df.iloc[-1, 1] > 50 + df.iloc[-2, 1])
or (len(df.index_chain) and df.iloc[-1, 2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"])
):
if self.mapping is not None:
self.mapping.log("Droping ligand:")
......@@ -390,17 +405,19 @@ class Chain:
break
if found:
self.mapping.log(f"Residue {i+1+self.mapping.st}-{self.mapping.st} = {i+1} has been saved and renumbered {df.iloc[i,1]} instead of {found['nt_id'].replace(found['chain_name']+ '.' + found['nt_name'], '').replace('^','')}")
df_row = pd.DataFrame([found], index=[i])[df.columns.values]
df_row.iloc[0,0] = i+1 # index_chain
df_row.iloc[0,1] = df.iloc[i,1] # nt_resnum
df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ])
df_row = pd.DataFrame([found], index=[i])[
df.columns.values]
df_row.iloc[0, 0] = i+1 # index_chain
df_row.iloc[0, 1] = df.iloc[i, 1] # nt_resnum
df = pd.concat([df.iloc[:i], df_row, df.iloc[i:]])
df.iloc[i+1:, 1] += 1
else:
warn(f"Missing index_chain {i} in {self.chain_label} !")
# Assert some nucleotides still exist
try:
l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view
# update length of chain from nt_resnum point of view
l = df.iloc[-1, 1] - df.iloc[0, 1] + 1
except IndexError:
warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.mapping.nt_start} and "
f"{self.mapping.nt_end} ({'not ' if not self.mapping.inferred else ''}inferred). Ignoring chain {self.chain_label}.")
......@@ -426,14 +443,17 @@ class Chain:
# index_chain 1 |-------------|77 83|------------| 154
# expected data point 1 |--------------------------------| 154
#
if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l
resnum_start = df.iloc[0,1]
diff = set(range(l)).difference(df['nt_resnum'] - resnum_start) # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start)
resnum_start = df.iloc[0, 1]
# the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start)
diff = set(range(l)).difference(df['nt_resnum'] - resnum_start)
for i in sorted(diff):
# Add a row at position i
df = pd.concat([ df.iloc[:i],
pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start, "nt_id":"not resolved", "nt_code":'-', "nt_name":'-'}, index=[i]),
df.iloc[i:] ])
df = pd.concat([df.iloc[:i],
pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start,
"nt_id": "not resolved", "nt_code": '-', "nt_name": '-'}, index=[i]),
df.iloc[i:]])
# Increase the index_chain of all following lines
df.iloc[i+1:, 0] += 1
df = df.reset_index(drop=True)
......@@ -444,27 +464,27 @@ class Chain:
#######################################
# Add a sequence column just for the alignments
df['nt_align_code'] = [ str(x).upper()
df['nt_align_code'] = [str(x).upper()
.replace('NAN', '-') # Unresolved nucleotides are gaps
.replace('?', '-') # Unidentified residues, let's delete them
.replace('T', 'U') # 5MU are modified to t, which gives T
.replace('P', 'U') # Pseudo-uridines, but it is not really right to change them to U, see DSSR paper, Fig 2
for x in df['nt_code'] ]
for x in df['nt_code']]
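# e.g. an unresolved residue (NaN nt_code) becomes '-', a 5MU read as 't' becomes 'U',
# and standard residues are simply uppercased.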
# One-hot encoding sequence
df["is_A"] = [ 1 if x=="A" else 0 for x in df["nt_code"] ]
df["is_C"] = [ 1 if x=="C" else 0 for x in df["nt_code"] ]
df["is_G"] = [ 1 if x=="G" else 0 for x in df["nt_code"] ]
df["is_U"] = [ 1 if x=="U" else 0 for x in df["nt_code"] ]
df["is_other"] = [ 0 if x in "ACGU" else 1 for x in df["nt_code"] ]
df["is_A"] = [1 if x == "A" else 0 for x in df["nt_code"]]
df["is_C"] = [1 if x == "C" else 0 for x in df["nt_code"]]
df["is_G"] = [1 if x == "G" else 0 for x in df["nt_code"]]
df["is_U"] = [1 if x == "U" else 0 for x in df["nt_code"]]
df["is_other"] = [0 if x in "ACGU" else 1 for x in df["nt_code"]]
df["nt_position"] = [ float(i+1)/self.full_length for i in range(self.full_length) ]
# Iterate over pairs to identify base-base interactions
res_ids = list(df['nt_id']) # things like "chainID.C4, chainID.U5"
paired = [ '' ] * self.full_length
pair_type_LW = [ '' ] * self.full_length
pair_type_DSSR = [ '' ] * self.full_length
interacts = [ 0 ] * self.full_length
paired = [''] * self.full_length
pair_type_LW = [''] * self.full_length
pair_type_DSSR = [''] * self.full_length
interacts = [0] * self.full_length
if "pairs" in json_object.keys():
pairs = json_object["pairs"]
for p in pairs:
......@@ -506,17 +526,19 @@ class Chain:
paired[nt2_idx] += ',' + str(nt1_idx + 1)
# transform nt_id to shorter values
df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^','').replace('/','') for n, name in zip(df.nt_id, df.nt_name) ]
df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^', '').replace('/', '') for n, name in zip(df.nt_id, df.nt_name) ]
df['paired'] = paired
df['pair_type_LW'] = pair_type_LW
df['pair_type_DSSR'] = pair_type_DSSR
df['nb_interact'] = interacts
df = df.drop(['nt_id', 'nt_resnum'], axis=1) # remove now useless descriptors
# remove now useless descriptors
df = df.drop(['nt_id', 'nt_resnum'], axis=1)
self.seq = "".join(df.nt_code)
self.seq_to_align = "".join(df.nt_align_code)
self.length = len([ x for x in self.seq_to_align if x != "-" ])
self.length = len([x for x in self.seq_to_align if x != "-"])
# Remove too short chains
if self.length < 5:
......@@ -559,7 +581,8 @@ class Chain:
WHERE structure_id='{self.pdb_id}'
AND chain_name='{self.pdb_chain_id}'
AND rfam_acc='{self.mapping.rfam_acc}'
AND eq_class='{self.eq_class}';""")[0][0]
AND eq_class='{self.eq_class}';"""
)[0][0]
else:
sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, 'unmappd', ?, ?)
ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""",
......@@ -568,19 +591,18 @@ class Chain:
WHERE structure_id='{self.pdb_id}'
AND chain_name='{self.pdb_chain_id}'
AND eq_class='{self.eq_class}'
AND rfam_acc = 'unmappd';""")[0][0]
AND rfam_acc = 'unmappd';"""
)[0][0]
# Add the nucleotides if the chain is not an issue
if df is not None and not self.delete_me: # double condition is theoretically redundant here, but you never know
sql_execute(conn, f"""
INSERT OR IGNORE INTO nucleotide
sql_execute(conn, f"""INSERT OR IGNORE INTO nucleotide
(chain_id, index_chain, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta,
epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position,
old_nt_resnum, paired, pair_type_LW, pair_type_DSSR, nb_interact)
VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
many=True, data=list(df.to_records(index=False)), warn_every=10)
def remap(self, columns_to_save, s_seq):
......@@ -598,40 +620,39 @@ class Chain:
# Save colums in the appropriate positions
i = 0
j = 0
while i<self.full_length and j<alilen:
while i < self.full_length and j < alilen:
# Here we try to map self.seq_to_align (the sequence of the 3D chain, including gaps when residues are missing),
# with s_seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and .
if self.seq_to_align[i] == s_seq[j].upper(): # alignment and sequence correspond (incl. gaps)
re_mappings.append( (self.db_chain_id, i+1, j+1) ) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1.
re_mappings.append((self.db_chain_id, i+1, j+1)) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1.
columns_to_save.add(j+1) # it's a set, duplicates are automatically ignored
i += 1
j += 1
elif self.seq_to_align[i] == '-': # gap in the chain, but not in the aligned sequence
# search for a gap to the consensus nearby
k = 0 # Search must start at zero to ensure the difference comes from a '-' in front of a '.'
while j+k<alilen and s_seq[j+k] == '.':
while j+k < alilen and s_seq[j+k] == '.':
k += 1
# if found, set j to that position
if j+k<alilen and s_seq[j+k] == '-':
re_mappings.append( (self.db_chain_id, i+1, j+k+1) )
if j+k < alilen and s_seq[j+k] == '-':
re_mappings.append((self.db_chain_id, i+1, j+k+1))
columns_to_save.add(j+k+1)
i += 1
j += k+1
continue
# if not, take the insertion gap ('.') if there is one
if j<alilen and s_seq[j] == '.':
re_mappings.append( (self.db_chain_id, i+1, j+1) )
if j < alilen and s_seq[j] == '.':
re_mappings.append((self.db_chain_id, i+1, j+1))
columns_to_save.add(j+1)
i += 1
j += 1
continue
# else, just mark the gap as unknown (there is an alignment mismatch)
re_mappings.append( (self.db_chain_id, i+1, 0) )
re_mappings.append((self.db_chain_id, i+1, 0))
i += 1
elif s_seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain
j += 1 # ignore the column
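# Illustrative example (toy sequences): with seq_to_align = "AC-G" and s_seq = "AC.-G",
# positions 1 and 2 map to columns 1 and 2, the unresolved position 3 is matched to the '-'
# in column 4 (skipping the insertion column '.'), and position 4 maps to column 5.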
......@@ -672,7 +693,7 @@ class Chain:
l = letters[freq.index(max(freq))]
c_seq_to_align[i] = l
c_seq[i] = l
gaps.append((l, l=='A', l=='C', l=='G', l=='U', l=='N', self.db_chain_id, i+1 ))
gaps.append((l, l == 'A', l == 'C', l == 'G', l == 'U', l == 'N', self.db_chain_id, i+1))
self.seq_to_align = ''.join(c_seq_to_align)
self.seq = ''.join(c_seq)
return gaps
......@@ -684,6 +705,7 @@ class Job:
This could be a system command or the execution of a Python function.
Time and memory usage of a job can be monitored.
"""
def __init__(self, results="", command=[], function=None, args=[], how_many_in_parallel=0, priority=1, timeout=None, checkFunc=None, checkArgs=[], label=""):
self.cmd_ = command # A system command to run
self.func_ = function # A python function to run
......@@ -709,7 +731,8 @@ class Job:
if self.func_ is None:
s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}" + " ".join(self.cmd_)
else:
s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}{self.func_.__name__}(" + " ".join([str(a) for a in self.args_]) + ")"
s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}{self.func_.__name__}(" \
+ " ".join([ str(a) for a in self.args_ ]) + ")"
return s
......@@ -767,13 +790,14 @@ class Downloader:
print("> Fetching latest PDB mappings from Rfam..." + " " * 29, end='', flush=True)
try:
db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam')
mappings = pd.read_sql('SELECT rfam_acc, pdb_id, chain, pdb_start, pdb_end, bit_score, evalue_score, cm_start, cm_end, hex_colour FROM pdb_full_region WHERE is_significant=1;', con=db_connection)
mappings = pd.read_sql('SELECT rfam_acc, pdb_id, chain, pdb_start, pdb_end, bit_score, evalue_score, cm_start, cm_end, hex_colour FROM pdb_full_region WHERE is_significant=1;',
con=db_connection)
mappings.to_csv(runDir + "/data/Rfam-PDB-mappings.csv")
print(f"\t{validsymb}")
except sqlalchemy.exc.OperationalError: # Cannot connect :'(
print(f"\t{errsymb}")
# Check if a previous run succeeded (if file exists, use it)
if path.isfile(runDir + "/data/Rfam-PDB-mappings.csv"):
if os.path.isfile(runDir + "/data/Rfam-PDB-mappings.csv"):
print("\t> Using previous version.")
mappings = pd.read_csv(runDir + "/data/Rfam-PDB-mappings.csv")
else: # otherwise, abort.
......@@ -791,7 +815,7 @@ class Downloader:
setproctitle(f"RNANet.py download_Rfam_cm()")
print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True)
if not path.isfile(path_to_seq_data + "Rfam.cm"):
if not os.path.isfile(path_to_seq_data + "Rfam.cm"):
try:
subprocess.run(["wget", "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz", "-O", path_to_seq_data + "Rfam.cm.gz"])
print(f"\t{validsymb}", flush=True)
......@@ -815,7 +839,6 @@ class Downloader:
try:
db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam')
# Prepare the SQL query. It computes the length of the chains and gets the maximum length by family.
q = """SELECT stats.rfam_acc, k.description, stats.maxlength FROM
(SELECT fr.rfam_acc, MAX(
......@@ -838,15 +861,17 @@ class Downloader:
d = pd.read_sql(q, con=db_connection)
# filter the results to families we are interested in
d = d[ d["rfam_acc"].isin(list_of_families) ]
d = d[d["rfam_acc"].isin(list_of_families)]
print(d)
with sqlite3.connect(runDir + "/results/RNANet.db", timeout=20.0) as conn:
sql_execute(conn, """
INSERT OR REPLACE INTO family (rfam_acc, description, max_len)
VALUES (?, ?, ?);""", many=True, data=list(d.to_records(index=False))
) # We use the replace keyword to get the latest information
# We use the REPLACE keyword to get the latest information
sql_execute(conn, """INSERT OR REPLACE INTO family (rfam_acc, description, max_len)
VALUES (?, ?, ?);""",
many=True,
data=list(d.to_records(index=False))
)
except sqlalchemy.exc.OperationalError:
warn("Something's wrong with the SQL database. Check mysql-rfam-public.ebi.ac.uk status and try again later. Not printing statistics.")
......@@ -858,10 +883,11 @@ class Downloader:
setproctitle(f"RNANet.py download_Rfam_sequences({rfam_acc})")
if not path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"):
if not os.path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"):
for _ in range(10): # retry 10 times if it fails
try:
subprocess.run(["wget", f'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/{rfam_acc}.fa.gz', "-O", path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"], stdout=subprocess.DEVNULL)
subprocess.run(["wget", f'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/{rfam_acc}.fa.gz', "-O",
path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
notify(f"Downloaded {rfam_acc}.fa.gz from Rfam")
return # if it worked, no need to retry
except Exception as e:
......@@ -881,8 +907,9 @@ class Downloader:
setproctitle(f"RNANet.py download_BGSU_NR_list({res})")
nr_code = min([ i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res ])
nr_code = min([i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res])
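# e.g. a requested resolution of 3.2 A falls back to the 3.5 A non-redundant list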
print(f"> Fetching latest list of RNA files at {nr_code} A resolution from BGSU website...", end='', flush=True)
# Download latest BGSU non-redundant list
try:
s = requests.get(f"http://rna.bgsu.edu/rna3dhub/nrlist/download/current/{nr_code}A/csv").content
......@@ -894,13 +921,13 @@ class Downloader:
warn("Error downloading NR list !\t", error=True)
# Try to read previous file
if path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
print("\t> Use of the previous version.\t", end = "", flush=True)
if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
print("\t> Use of the previous version.\t", end="", flush=True)
else:
return pd.DataFrame([], columns=["class", "class_members"])
nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv")
full_structures_list = [ tuple(i[1]) for i in nrlist[['class','class_members']].iterrows() ]
full_structures_list = [ tuple(i[1]) for i in nrlist[['class', 'class_members']].iterrows() ]
print(f"\t{validsymb}", flush=True)
# The beginning of an adventure.
......@@ -910,14 +937,15 @@ class Downloader:
setproctitle(f"RNANet.py download_from_SILVA({unit})")
if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
if not os.path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
try:
print(f"Downloading {unit} from SILVA...", end='', flush=True)
if unit=="LSU":
subprocess.run(["wget", "http://www.arb-silva.de/fileadmin/arb_web_db/release_132/ARB_files/SILVA_132_LSURef_07_12_17_opt.arb.gz", "-O", path_to_seq_data + "realigned/LSU.arb.gz"])
if unit == "LSU":
subprocess.run(["wget", "-nv", "http://www.arb-silva.de/fileadmin/arb_web_db/release_132/ARB_files/SILVA_132_LSURef_07_12_17_opt.arb.gz",
"-O", path_to_seq_data + "realigned/LSU.arb.gz"])
else:
subprocess.run(["wget", "http://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_05_01_20_opt.arb.gz", "-O", path_to_seq_data + "realigned/SSU.arb.gz"])
subprocess.run(["wget", "-nv", "http://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_05_01_20_opt.arb.gz",
"-O", path_to_seq_data + "realigned/SSU.arb.gz"])
except:
warn(f"Error downloading the {unit} database from SILVA", error=True)
exit(1)
......@@ -949,7 +977,8 @@ class Mapping:
def filter_df(self, df):
newdf = df.drop(df[(df.nt_resnum < self.nt_start) | (df.nt_resnum > self.nt_end)].index)
newdf = df.drop(df[(df.nt_resnum < self.nt_start) |
(df.nt_resnum > self.nt_end)].index)
if len(newdf.index_chain) > 0:
# everything's okay
......@@ -961,19 +990,20 @@ class Mapping:
# index_chain and not nt_resnum.
warn(f"Assuming mapping to {self.rfam_acc} is an absolute position interval.")
weird_mappings.add(self.chain_label + "." + self.rfam_acc)
df = df.drop(df[(df.index_chain < self.nt_start) | (df.index_chain > self.nt_end)].index)
df = df.drop(df[(df.index_chain < self.nt_start) |
(df.index_chain > self.nt_end)].index)
# If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one
self.st = 0
if len(df.index_chain) and df.iloc[0,0] != 1:
self.st = df.iloc[0,0] -1
if len(df.index_chain) and df.iloc[0, 0] != 1:
self.st = df.iloc[0, 0] - 1
df.iloc[:, 0] -= self.st
self.log(f"Shifting index_chain of {self.st}")
# Check that some residues are not included by mistake:
# e.g. 4v4t-AA.RF00382-20-55 contains 4 residues numbered 30 but actually far beyond the mapped part,
# because the insertion codes (icodes) are not read by DSSR.
toremove = df[ df.index_chain > self.nt_end ]
toremove = df[df.index_chain > self.nt_end]
if not toremove.empty:
df = df.drop(toremove.index)
self.log(f"Some nt_resnum values are likely to be wrong, not considering residues:")
......@@ -991,9 +1021,9 @@ class Mapping:
if self.logs == []:
return # Do not create a log file if there is nothing to log
if not path.exists("logs"):
os.makedirs("logs", exist_ok=True)
with open("logs/"+filename, "w") as f:
if not os.path.exists(runDir+"/logs"):
os.makedirs(runDir+"/logs", exist_ok=True)
with open(runDir+"/logs/"+filename, "w") as f:
f.writelines(self.logs)
......@@ -1019,20 +1049,23 @@ class Pipeline:
self.SELECT_ONLY = None
self.ARCHIVE = False
self.SAVELOGS = True
self.FULLINFERENCE = False
def process_options(self):
"""Sets the paths and options of the pipeline"""
"""Sets the paths and options of the pipeline
"""
global path_to_3D_data
global path_to_seq_data
setproctitle("RNANet.py process_options()")
try:
opts, _ = getopt.getopt( sys.argv[1:], "r:hs",
[ "help", "resolution=", "keep-hetatm=", "from-scratch",
opts, _ = getopt.getopt(sys.argv[1:], "r:fhs",
["help", "resolution=", "keep-hetatm=", "from-scratch", "full-inference,"
"fill-gaps=", "3d-folder=", "seq-folder=",
"no-homology", "ignore-issues", "extract", "only=", "all", "no-logs",
"archive", "update-homologous" ])
"archive", "update-homologous"])
except getopt.GetoptError as err:
print(err)
sys.exit(2)
......@@ -1044,13 +1077,15 @@ class Pipeline:
exit()
if opt == "-h" or opt == "--help":
print( "RNANet, a script to build a multiscale RNA dataset from public data\n"
print("RNANet, a script to build a multiscale RNA dataset from public data\n"
"Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020")
print()
print("Options:")
print("-h [ --help ]\t\t\tPrint this help message")
print("--version\t\t\tPrint the program version")
print()
print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of chains"
"\n\t\t\t\tmapped to different families.")
print("-r 4.0 [ --resolution=4.0 ]\tMaximum 3D structure resolution to consider a RNA chain.")
print("-s\t\t\t\tRun statistics computations after completion")
print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.")
......@@ -1062,7 +1097,7 @@ class Pipeline:
"\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format"
"\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains"
"\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.")
print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files."
print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:"
"\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families"
"\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
print("--no-homology\t\t\tDo not try to compute PSSMs and do not align sequences."
......@@ -1077,7 +1112,7 @@ class Pipeline:
print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications")
print()
print("Typical usage:")
print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &")
print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &")
sys.exit()
elif opt == '--version':
print("RNANet 1.1 beta")
......@@ -1087,21 +1122,21 @@ class Pipeline:
self.CRYSTAL_RES = float(arg)
elif opt == "-s":
self.RUN_STATS = True
elif opt=="--keep-hetatm":
assert arg in [ "True", "False" ]
elif opt == "--keep-hetatm":
assert arg in ["True", "False"]
self.KEEP_HETATM = (arg == "True")
elif opt=="--fill-gaps":
assert arg in [ "True", "False" ]
elif opt == "--fill-gaps":
assert arg in ["True", "False"]
self.FILL_GAPS = (arg == "True")
elif opt=="--no-homology":
elif opt == "--no-homology":
self.HOMOLOGY = False
elif opt=='--3d-folder':
path_to_3D_data = path.abspath(arg)
elif opt == '--3d-folder':
path_to_3D_data = os.path.abspath(arg)
if path_to_3D_data[-1] != '/':
path_to_3D_data += '/'
print("> Storing 3D data into", path_to_3D_data)
elif opt=='--seq-folder':
path_to_seq_data = path.abspath(arg)
elif opt == '--seq-folder':
path_to_seq_data = os.path.abspath(arg)
if path_to_seq_data[-1] != '/':
path_to_seq_data += '/'
print("> Storing sequences into", path_to_seq_data)
......@@ -1138,6 +1173,8 @@ class Pipeline:
self.ARCHIVE = True
elif opt == "--no-logs":
self.SAVELOGS = False
elif opt == "-f" or opt == "--full-inference":
self.FULLINFERENCE = True
if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions":
print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
......@@ -1149,17 +1186,19 @@ class Pipeline:
"""List 3D chains with available Rfam mappings.
Return a list of Chain() objects with the mappings set up.
If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains."""
If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.
"""
setproctitle("RNANet.py list_available_mappings()")
# List all 3D RNA chains below given resolution
full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members )
full_structures_list = self.dl.download_BGSU_NR_list(
self.CRYSTAL_RES) # list of tuples ( class, class_members )
# Check for a list of known problems:
if path.isfile(runDir + "/known_issues.txt"):
if os.path.isfile(runDir + "/known_issues.txt"):
with open(runDir + "/known_issues.txt", 'r') as issues:
self.known_issues = [ x[:-1] for x in issues.readlines() ]
self.known_issues = [x[:-1] for x in issues.readlines()]
if self.USE_KNOWN_ISSUES:
print("\t> Ignoring known issues:")
for x in self.known_issues:
......@@ -1175,9 +1214,18 @@ class Pipeline:
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores)
try:
pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, desc="Eq. classes", bar_format="{desc}:{percentage:3.0f}%|{bar}|")
for _, newchains in enumerate(p.imap_unordered(partial(work_infer_mappings, not self.REUSE_ALL, allmappings), full_structures_list, chunksize=1)):
pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1,
desc="Eq. classes", bar_format="{desc}:{percentage:3.0f}%|{bar}|")
for _, newchains in enumerate(p.imap_unordered(partial(
work_infer_mappings,
not self.REUSE_ALL,
allmappings,
self.FULLINFERENCE
),
full_structures_list,
chunksize=1)):
self.update += newchains
pbar.update(1) # Everytime the iteration finishes, update the global progress bar
pbar.close()
......@@ -1192,7 +1240,7 @@ class Pipeline:
else:
conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0)
for eq_class, codelist in tqdm(full_structures_list, desc="Eq. classes"):
codes = codelist.replace('+',',').split(',')
codes = codelist.replace('+', ',').split(',')
# Simply convert the list of codes to Chain() objects
for c in codes:
......@@ -1201,40 +1249,48 @@ class Pipeline:
pdb_model = int(nr[1])
pdb_chain_id = nr[2].upper()
chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc = 'unmappd' AND issue=0""")
res = sql_ask_database(conn, f"""SELECT chain_id from chain
WHERE structure_id='{pdb_id}'
AND chain_name='{pdb_chain_id}'
AND rfam_acc = 'unmappd'
AND issue=0""")
if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database, or this is a known issue
self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class))
conn.close()
if self.SELECT_ONLY is not None:
self.update = [ c for c in self.update if c.chain_label == self.SELECT_ONLY ]
self.update = [
c for c in self.update if c.chain_label == self.SELECT_ONLY]
self.n_chains = len(self.update)
print(str(self.n_chains) + " RNA chains of interest.")
@trace_unhandled_exceptions
def dl_and_annotate(self, retry=False, coeff_ncores = 0.75):
def dl_and_annotate(self, retry=False, coeff_ncores=0.75):
"""
Gets mmCIF files from the PDB, and runs DSSR on them.
Ignores a structure if the file already exists (not if we are retrying).
REQUIRES the previous definition of self.update, so call list_available_mappings() before.
SETS table structure"""
SETS table structure
"""
# setproctitle(f"RNANet.py dl_and_annotate(retry={retry})")
setproctitle(f"RNANet.py dl_and_annotate(retry={retry})")
# Prepare the results folders
if not path.isdir(path_to_3D_data + "RNAcifs"):
os.makedirs(path_to_3D_data + "RNAcifs") # for the whole structures
if not path.isdir(path_to_3D_data + "annotations"):
os.makedirs(path_to_3D_data + "annotations") # for DSSR analysis of the whole structures
if not os.path.isdir(path_to_3D_data + "RNAcifs"):
# for the whole structures
os.makedirs(path_to_3D_data + "RNAcifs")
if not os.path.isdir(path_to_3D_data + "annotations"):
# for DSSR analysis of the whole structures
os.makedirs(path_to_3D_data + "annotations")
# Download and annotate
print("> Downloading and annotating structures (or checking previous results if they exist)...", flush=True)
if retry:
mmcif_list = sorted(set([ c.pdb_id for c in self.retry ]))
mmcif_list = sorted(set([c.pdb_id for c in self.retry]))
else:
mmcif_list = sorted(set([ c.pdb_id for c in self.update ]))
mmcif_list = sorted(set([c.pdb_id for c in self.update]))
try:
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores))
pbar = tqdm(mmcif_list, maxinterval=1.0, miniters=1, desc="mmCIF files")
......@@ -1255,16 +1311,19 @@ class Pipeline:
and extract their information from the JSON files into the database.
REQUIRES the previous definition of self.update, so call list_available_mappings() before.
SETS self.loaded_chains"""
SETS self.loaded_chains
"""
setproctitle(f"RNANet.py build_chains(retry={retry})")
# Prepare folders
if self.EXTRACT_CHAINS:
if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") # for the portions mapped to Rfam
if (not self.HOMOLOGY) and not path.isdir(path_to_3D_data + "rna_only"):
os.makedirs(path_to_3D_data + "rna_only") # extract chains of pure RNA
if self.HOMOLOGY and not os.path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
# for the portions mapped to Rfam
os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam")
if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"):
# extract chains of pure RNA
os.makedirs(path_to_3D_data + "rna_only")
# define and run jobs
joblist = []
......@@ -1296,44 +1355,48 @@ class Pipeline:
issues += 1
issues_names.append(c[1].chain_label)
ki.write(c[1].chain_label + '\n')
kir.write(c[1].chain_label + '\n' + c[1].error_messages + '\n\n')
kir.write(c[1].chain_label + '\n' +
c[1].error_messages + '\n\n')
with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,))
ki.close()
kir.close()
if issues:
warn(f"Added {issues} newly discovered issues to known issues:")
print("\033[33m"+ " ".join(issues_names) + "\033[0m", flush=True)
print("\033[33m" + " ".join(issues_names) + "\033[0m", flush=True)
# Add successfully built chains to list
self.loaded_chains += [ c[1] for c in results if not c[1].delete_me ]
self.loaded_chains += [c[1] for c in results if not c[1].delete_me]
# Identify errors due to empty JSON files (this happens when RAM is full, we believe).
# Retrying often solves the issue... so retry once with half the cores to limit the RAM usage.
self.to_retry = [ c[1] for c in results if "Could not load existing" in c[1].error_messages ]
def checkpoint_save_chains(self):
"""Saves self.loaded_chains to data/loaded_chains.picke"""
with open(runDir + "/data/loaded_chains.pickle","wb") as pick:
"""Saves self.loaded_chains to data/loaded_chains.picke
"""
with open(runDir + "/data/loaded_chains.pickle", "wb") as pick:
pickle.dump(self.loaded_chains, pick)
def checkpoint_load_chains(self):
"""Load self.loaded_chains from data/loaded_chains.pickle"""
with open(runDir + "/data/loaded_chains.pickle","rb") as pick:
"""Load self.loaded_chains from data/loaded_chains.pickle
"""
with open(runDir + "/data/loaded_chains.pickle", "rb") as pick:
self.loaded_chains = pickle.load(pick)
def prepare_sequences(self):
"""Downloads homologous sequences and covariance models required to compute MSAs.
REQUIRES that self.loaded_chains is defined.
SETS family (partially, through call)"""
SETS family (partially, through call)
"""
setproctitle("RNANet.py prepare_sequences()")
# Preparing a results folder
if not os.access(path_to_seq_data + "realigned/", os.F_OK):
os.makedirs(path_to_seq_data + "realigned/")
if not path.isdir(path_to_seq_data + "rfam_sequences/fasta/"):
if not os.path.isdir(path_to_seq_data + "rfam_sequences/fasta/"):
os.makedirs(path_to_seq_data + "rfam_sequences/fasta/", exist_ok=True)
# Update the family table (rfam_acc, description, max_len)
......@@ -1344,7 +1407,8 @@ class Pipeline:
joblist = []
for f in self.fam_list:
joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[self.dl, f, rfam_acc_to_download[f]]))
joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[
self.dl, f, rfam_acc_to_download[f]]))
try:
execute_joblist(joblist)
......@@ -1360,14 +1424,16 @@ class Pipeline:
"""Perform multiple sequence alignments.
REQUIRES self.fam_list to be defined
SETS family (partially)"""
SETS family (partially)
"""
setproctitle("RNANet.py realign()")
# Prepare the job list
joblist = []
for f in self.fam_list:
joblist.append( Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f)) # the function already uses all CPUs so launch them one by one
# the function already uses all CPUs so launch them one by one (how_many_in_parallel=1)
joblist.append(Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f))
# Execute the jobs
try:
......@@ -1379,8 +1445,8 @@ class Pipeline:
# Update the database
data = []
for r in results:
align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
nb_3d_chains = len([ 1 for r in align if '[' in r.id ])
align = Bio.AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
nb_3d_chains = len([1 for r in align if '[' in r.id])
if r[0] in SSU_set: # SSU v138 is used
nb_homologs = 2225272 # source: https://www.arb-silva.de/documentation/release-138/
nb_total_homol = nb_homologs + nb_3d_chains
......@@ -1390,7 +1456,7 @@ class Pipeline:
else:
nb_total_homol = len(align)
nb_homologs = nb_total_homol - nb_3d_chains
data.append( (nb_homologs, nb_3d_chains, nb_total_homol, r[2], r[3], r[0]) )
data.append((nb_homologs, nb_3d_chains, nb_total_homol, r[2], r[3], r[0]))
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
sql_execute(conn, """UPDATE family SET nb_homologs = ?, nb_3d_chains = ?, nb_total_homol = ?, comput_time = ?, comput_peak_mem = ?
......@@ -1399,13 +1465,14 @@ class Pipeline:
def remap(self):
"""Compute nucleotide frequencies of some alignments and save them in the database
REQUIRES self.fam_list to be defined"""
REQUIRES self.fam_list to be defined
"""
setproctitle("RNANet.py remap()")
print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)")
print("Check your CPU and disk I/O activity before deciding if the job failed.")
nworkers =max(min(ncores, len(self.fam_list)), 1)
nworkers = max(min(ncores, len(self.fam_list)), 1)
# Prepare the architecture of a shiny multi-progress-bars design
# Push the number of workers to a queue.
......@@ -1419,8 +1486,10 @@ class Pipeline:
try:
fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True)
for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)): # Apply work_pssm to each RNA family
fam_pbar.update(1) # Everytime the iteration finishes on a family, update the global progress bar over the RNA families
# Apply work_pssm to each RNA family
for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)):
# Everytime the iteration finishes on a family, update the global progress bar over the RNA families
fam_pbar.update(1)
fam_pbar.close()
p.close()
p.join()
......@@ -1434,23 +1503,24 @@ class Pipeline:
def output_results(self):
"""Produces CSV files, archive them, and additional metadata files
REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)"""
REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)
"""
setproctitle("RNANet.py output_results()")
time_str = time.strftime("%Y%m%d")
#Prepare folders:
if not path.isdir(path_to_3D_data + "datapoints/"):
# Prepare folders:
if not os.path.isdir(path_to_3D_data + "datapoints/"):
os.makedirs(path_to_3D_data + "datapoints/")
if not path.isdir(runDir + "/results/archive/"):
if not os.path.isdir(runDir + "/results/archive/"):
os.makedirs(runDir + "/results/archive/")
# Save to by-chain CSV files
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3)
try:
pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True)
for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains, chunksize=2)):
for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains)):
pbar.update(1)
pbar.close()
p.close()
......@@ -1465,36 +1535,44 @@ class Pipeline:
# Run statistics
if self.RUN_STATS:
# Remove previous precomputed data
subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
subprocess.run(["rm", "-f", runDir + "/data/wadley_kernel_eta.npz",
runDir + "/data/wadley_kernel_eta_prime.npz",
runDir + "/data/pair_counts.csv"])
for f in self.fam_list:
subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"])
subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy",
runDir + f"/data/{f}_pairs.csv",
runDir + f"/data/{f}_counts.csv"])
# Run statistics files
os.chdir(runDir)
subprocess.run(["python3.8", "regression.py"])
subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data])
subprocess.run(["python3.8", fileDir+"/regression.py"])
subprocess.run(["python3.8", fileDir+"/statistics.py", "--3d-folder", path_to_3D_data,
"--seq-folder", path_to_seq_data, "-r", str(self.CRYSTAL_RES)])
# Save additional informations
with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
pd.read_sql_query("""SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem
FROM family ORDER BY nb_3d_chains DESC;""",
conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue
FROM structure
JOIN chain ON structure.pdb_id = chain.structure_id
ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
ORDER BY structure_id, chain_name, rfam_acc ASC;""",
conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
# Archive the results
if self.SELECT_ONLY is None:
os.makedirs("results/archive", exist_ok=True)
subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."])
if self.ARCHIVE:
os.makedirs(runDir + "/results/archive", exist_ok=True)
subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf",
runDir + f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", "."])
# Update shortcuts to latest versions
subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz",
runDir + "/results/summary_latest.csv",
runDir + "/results/families_latest.csv"
])
subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
subprocess.run(['ln', "-s", runDir + f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
subprocess.run(['ln', "-s", runDir + f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
subprocess.run(['ln', "-s", runDir + f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
def sanitize_database(self):
"""Searches for issues in the database and correct them"""
......@@ -1518,7 +1596,9 @@ class Pipeline:
if self.HOMOLOGY:
# check if chains have been re_mapped:
r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain
WHERE issue = 0 AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping)
WHERE issue = 0
AND rfam_acc != 'unmappd'
AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping)
GROUP BY rfam_acc;""")
try:
if len(r) and r[0][0] is not None:
......@@ -1545,22 +1625,25 @@ class Pipeline:
def read_cpu_number():
# As one shall not use os.cpu_count() on LXC containers,
# because it reads info from /sys wich is not the VM resources but the host resources.
# This function reads it from /proc/cpuinfo instead.
"""This function reads the number of CPU cores available from /proc/cpuinfo.
One shall not use os.cpu_count() on LXC containers,
because it reads info from /sys, which holds the host resources rather than the VM's.
"""
p = subprocess.run(['grep', '-Ec', '(Intel|AMD)', '/proc/cpuinfo'], stdout=subprocess.PIPE)
return int(int(p.stdout.decode('utf-8')[:-1])/2)
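# The grep above counts logical processors; dividing by two assumes hyper-threading
# (two threads per physical core), so this may undercount on CPUs without SMT.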
def init_worker(tqdm_lock=None):
signal.signal(signal.SIGINT, signal.SIG_IGN)
if tqdm_lock is not None:
tqdm.set_lock(tqdm_lock)
def warn(message, error=False):
"""Pretty-print warnings and error messages.
"""
# Cut if too long
if len(message)>66:
if len(message) > 66:
x = message.find(' ', 50, 66)
if x != -1:
warn(message[:x], error=error)
......@@ -1574,11 +1657,13 @@ def warn(message, error=False):
else:
print(f"\t> \033[33mWARN: {message:64s}\033[0m\t{warnsymb}", flush=True)
def notify(message, post=''):
if len(post):
post = '(' + post + ')'
print(f"\t> {message:70s}\t{validsymb}\t{post}", flush=True)
def sql_define_tables(conn):
conn.executescript(
""" PRAGMA foreign_keys = on;
......@@ -1684,8 +1769,9 @@ def sql_define_tables(conn):
""")
conn.commit()
@trace_unhandled_exceptions
def sql_ask_database(conn, sql, warn_every = 10):
def sql_ask_database(conn, sql, warn_every=10):
"""
Reads the SQLite database.
Returns a list of tuples.
......@@ -1698,11 +1784,13 @@ def sql_ask_database(conn, sql, warn_every = 10):
return result # if it worked, no need to retry
except sqlite3.OperationalError as e:
if warn_every and not (_+1) % warn_every:
warn(str(e) + ", retrying in 0.2s (worker " + str(os.getpid()) + f', try {_+1}/100)')
warn(str(e) + ", retrying in 0.2s (worker " +
str(os.getpid()) + f', try {_+1}/100)')
time.sleep(0.2)
warn("Tried to reach database 100 times and failed. Aborting.", error=True)
return []
@trace_unhandled_exceptions
def sql_execute(conn, sql, many=False, data=None, warn_every=10):
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
......@@ -1721,10 +1809,12 @@ def sql_execute(conn, sql, many=False, data=None, warn_every=10):
return # if it worked, no need to retry
except sqlite3.OperationalError as e:
if warn_every and not (_+1) % warn_every:
warn(str(e) + ", retrying in 0.2s (worker " + str(os.getpid()) + f', try {_+1}/100)')
warn(str(e) + ", retrying in 0.2s (worker " +
str(os.getpid()) + f', try {_+1}/100)')
time.sleep(0.2)
warn("Tried to reach database 100 times and failed. Aborting.", error=True)
@trace_unhandled_exceptions
def execute_job(j, jobcount):
"""Run a Job object.
......@@ -1741,7 +1831,8 @@ def execute_job(j, jobcount):
print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.label}")
# Add the command to logfile
logfile = open(runDir + "/log_of_the_run.sh", 'a')
os.makedirs(runDir+"/logs", exist_ok=True)
logfile = open(runDir + "/logs/log_of_the_run.sh", 'a')
logfile.write(" ".join(j.cmd_))
logfile.write("\n")
logfile.close()
......@@ -1753,7 +1844,8 @@ def execute_job(j, jobcount):
# run the command. subprocess.run will be a child of this process, and stays monitored.
start_time = time.time()
r = subprocess.run(j.cmd_, timeout=j.timeout_, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
r = subprocess.run(j.cmd_, timeout=j.timeout_,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
end_time = time.time()
# Stop the Monitor, then get its result
......@@ -1782,7 +1874,8 @@ def execute_job(j, jobcount):
# return time and memory statistics, plus the job results
t = end_time - start_time
return (t,m,r)
return (t, m, r)
def execute_joblist(fulljoblist):
""" Run a list of job objects.
......@@ -1815,8 +1908,9 @@ def execute_joblist(fulljoblist):
# Process the jobs from priority 1 to nprio
results = []
for i in range(1,nprio+1):
if i not in jobs.keys(): continue # no job has the priority level i
for i in range(1, nprio+1):
if i not in jobs.keys():
continue # no job has the priority level i
print("processing jobs of priority", i)
different_thread_numbers = sorted(jobs[i].keys())
......@@ -1825,7 +1919,8 @@ def execute_joblist(fulljoblist):
for n in different_thread_numbers:
# get the bunch of jobs of same priority and thread number
bunch = jobs[i][n]
if not len(bunch): continue # no jobs should be processed n by n
if not len(bunch):
continue # no jobs should be processed n by n
print("using", n, "processes:")
# execute jobs of priority i that should be processed n by n:
......@@ -1843,13 +1938,14 @@ def execute_joblist(fulljoblist):
for j, r in zip(bunch, raw_results):
j.comp_time = round(r[0], 2) # seconds
j.max_mem = int(r[1]/1000000) # MB
results.append( (j.label, r[2], round(r[0], 2), int(r[1]/1000000)))
results.append((j.label, r[2], round(r[0], 2), int(r[1]/1000000)))
# throw back the money
return results
@trace_unhandled_exceptions
def work_infer_mappings(update_only, allmappings, codelist) -> list:
def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list:
"""Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list,
build a list of Chain() objects mapped to Rfam families, by expanding available mappings
of any element of the list to all the list elements.
......@@ -1862,13 +1958,13 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
# Split the comma-separated list of chain codes into chain codes:
eq_class = codelist[0]
codes = codelist[1].replace('+',',').split(',')
codes = codelist[1].replace('+', ',').split(',')
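# e.g. a class member list like "1J5E|1|A+4V4Q|1|AA" (illustrative codes) is split into
# ["1J5E|1|A", "4V4Q|1|AA"]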
# Search for mappings that apply to an element of this PDB chains list:
for c in codes:
# search for Rfam mappings with this chain c:
m_row_indices = allmappings.pdb_id + "|1|" + allmappings.chain == c[:4].lower()+c[4:]
m = allmappings.loc[m_row_indices].drop(['bit_score','evalue_score','cm_start','cm_end','hex_colour'], axis=1)
m = allmappings.loc[m_row_indices].drop(['bit_score', 'evalue_score', 'cm_start', 'cm_end', 'hex_colour'], axis=1)
if len(m):
# remove the found mappings from the dataframe
allmappings = allmappings.loc[m_row_indices == False]
......@@ -1881,7 +1977,7 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
families = set(known_mappings['rfam_acc'])
# generalize
inferred_mappings = known_mappings.drop(['pdb_id','chain'], axis=1).drop_duplicates()
inferred_mappings = known_mappings.drop(['pdb_id', 'chain'], axis=1).drop_duplicates()
# check for approximate redundancy:
if len(inferred_mappings) != len(inferred_mappings.drop_duplicates(subset="rfam_acc")):
......@@ -1890,11 +1986,11 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
# ==> Summarize them in one mapping but with the largest window.
for rfam in families:
sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end'])
thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam ) & sel_5_to_3
thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam ) & (sel_5_to_3 == False)
thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam) & sel_5_to_3
thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam) & (sel_5_to_3 == False)
if (
len(inferred_mappings[thisfam_5_3]) != len(inferred_mappings[ inferred_mappings['rfam_acc'] == rfam ])
len(inferred_mappings[thisfam_5_3]) != len(inferred_mappings[inferred_mappings['rfam_acc'] == rfam])
and len(inferred_mappings[thisfam_5_3]) > 0
):
# there are mappings in both directions... wtf Rfam ?!
......@@ -1908,8 +2004,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
# We keep only the 5->3 sense.
inferred_mappings = inferred_mappings.drop(index=inferred_mappings.index[thisfam_3_5])
sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end'])
thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam ) & sel_5_to_3
thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam ) & (sel_5_to_3 == False)
thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam) & sel_5_to_3
thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam) & (sel_5_to_3 == False)
print()
warn(f"Found mappings to {rfam} in both directions on the same interval, keeping only the 5'->3' one.")
else:
......@@ -1919,35 +2015,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
# Compute consensus for chains in 5' -> 3' sense
if len(inferred_mappings[thisfam_5_3]):
pdb_start_min = min(inferred_mappings[ thisfam_5_3]['pdb_start'])
pdb_end_max = max(inferred_mappings[ thisfam_5_3]['pdb_end'])
pdb_start_max = max(inferred_mappings[ thisfam_5_3]['pdb_start'])
pdb_end_min = min(inferred_mappings[ thisfam_5_3]['pdb_end'])
pdb_start_min = min(inferred_mappings[thisfam_5_3]['pdb_start'])
pdb_end_max = max(inferred_mappings[thisfam_5_3]['pdb_end'])
pdb_start_max = max(inferred_mappings[thisfam_5_3]['pdb_start'])
pdb_end_min = min(inferred_mappings[thisfam_5_3]['pdb_end'])
if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100):
# the variation is only a few nucleotides, so we take the largest window.
inferred_mappings.loc[ thisfam_5_3, 'pdb_start'] = pdb_start_min
inferred_mappings.loc[ thisfam_5_3, 'pdb_end'] = pdb_end_max
inferred_mappings.loc[thisfam_5_3, 'pdb_start'] = pdb_start_min
inferred_mappings.loc[thisfam_5_3, 'pdb_end'] = pdb_end_max
else:
# there probably is an outlier. We choose the median value from the whole list of known_mappings.
known_sel_5_to_3 = (known_mappings['rfam_acc'] == rfam ) & (known_mappings['pdb_start'] < known_mappings['pdb_end'])
inferred_mappings.loc[ thisfam_5_3, 'pdb_start'] = known_mappings.loc[known_sel_5_to_3, 'pdb_start'].median()
inferred_mappings.loc[ thisfam_5_3, 'pdb_end'] = known_mappings.loc[known_sel_5_to_3, 'pdb_end'].median()
known_sel_5_to_3 = (known_mappings['rfam_acc'] == rfam) & (known_mappings['pdb_start'] < known_mappings['pdb_end'])
inferred_mappings.loc[thisfam_5_3, 'pdb_start'] = known_mappings.loc[known_sel_5_to_3, 'pdb_start'].median()
inferred_mappings.loc[thisfam_5_3, 'pdb_end'] = known_mappings.loc[known_sel_5_to_3, 'pdb_end'].median()
# Compute consensus for chains in 3' -> 5' sense
if len(inferred_mappings[thisfam_3_5]):
pdb_start_min = min(inferred_mappings[ thisfam_3_5]['pdb_start'])
pdb_end_max = max(inferred_mappings[ thisfam_3_5]['pdb_end'])
pdb_start_max = max(inferred_mappings[ thisfam_3_5]['pdb_start'])
pdb_end_min = min(inferred_mappings[ thisfam_3_5]['pdb_end'])
pdb_start_min = min(inferred_mappings[thisfam_3_5]['pdb_start'])
pdb_end_max = max(inferred_mappings[thisfam_3_5]['pdb_end'])
pdb_start_max = max(inferred_mappings[thisfam_3_5]['pdb_start'])
pdb_end_min = min(inferred_mappings[thisfam_3_5]['pdb_end'])
if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100):
# the variation is only a few nucleotides, so we take the largest window.
inferred_mappings.loc[ thisfam_3_5, 'pdb_start'] = pdb_start_max
inferred_mappings.loc[ thisfam_3_5, 'pdb_end'] = pdb_end_min
inferred_mappings.loc[thisfam_3_5, 'pdb_start'] = pdb_start_max
inferred_mappings.loc[thisfam_3_5, 'pdb_end'] = pdb_end_min
else:
# there probably is an outlier. We choose the median value from the whole list of known_mappings.
known_sel_3_to_5 = (known_mappings['rfam_acc'] == rfam ) & (known_mappings['pdb_start'] > known_mappings['pdb_end'])
inferred_mappings.loc[ thisfam_3_5, 'pdb_start'] = known_mappings.loc[known_sel_3_to_5, 'pdb_start'].median()
inferred_mappings.loc[ thisfam_3_5, 'pdb_end'] = known_mappings.loc[known_sel_3_to_5, 'pdb_end'].median()
known_sel_3_to_5 = (known_mappings['rfam_acc'] == rfam) & (known_mappings['pdb_start'] > known_mappings['pdb_end'])
inferred_mappings.loc[thisfam_3_5, 'pdb_start'] = known_mappings.loc[known_sel_3_to_5, 'pdb_start'].median()
inferred_mappings.loc[thisfam_3_5, 'pdb_end'] = known_mappings.loc[known_sel_3_to_5, 'pdb_end'].median()
inferred_mappings.drop_duplicates(inplace=True)
# Now build Chain() objects for the mapped chains
......@@ -1958,7 +2054,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
pdb_chain_id = nr[2]
for rfam in families:
# if a known mapping of this chain on this family exists, apply it
m = known_mappings.loc[ (known_mappings.pdb_id + "|1|" + known_mappings.chain == c[:4].lower()+c[4:]) & (known_mappings['rfam_acc'] == rfam ) ]
this_chain_idxs = (known_mappings.pdb_id + "|1|" + known_mappings.chain == c[:4].lower()+c[4:])
m = known_mappings.loc[this_chain_idxs & (known_mappings['rfam_acc'] == rfam)]
if len(m) and len(m) < 2:
pdb_start = int(m.pdb_start)
pdb_end = int(m.pdb_end)
......@@ -1969,23 +2066,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
pdb_start = int(m.pdb_start.min())
pdb_end = int(m.pdb_end.max())
inferred = False
elif not(pdb_id in known_mappings.pdb_id and pdb_chain_id in known_mappings.chain): # if no known mapping on another family, use the inferred mapping
pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start)
pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end)
elif (fullinference or not(this_chain_idxs.any())):
# if no known mapping on another family, use the inferred mapping
# idem if the user said to do so with --full-inference
pdb_start = int(inferred_mappings.loc[(inferred_mappings['rfam_acc'] == rfam)].pdb_start)
pdb_end = int(inferred_mappings.loc[(inferred_mappings['rfam_acc'] == rfam)].pdb_end)
inferred = True
else:
# skip this family, we cannot map this chain to it.
continue
chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}_{pdb_start}-{pdb_end}"
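# For illustration, with pdb_id="1ml5", pdb_model=1, pdb_chain_id="a", pdb_start=1, pdb_end=2914,
# the label is "1ml5_1_a_1-2914", i.e. the pdbid_model_chain_start-end convention used for the
# chain identifiers found elsewhere in the results.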
# Check if the chain exists in the database
if update_only:
with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc='{rfam}' AND issue=0""")
res = sql_ask_database(conn, f"""SELECT chain_id from chain
WHERE structure_id='{pdb_id}'
AND chain_name='{pdb_chain_id}'
AND rfam_acc='{rfam}'
AND issue=0""")
if not len(res): # the chain is NOT yet in the database, or this is a known issue
newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class,
rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
else:
newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class,
rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
return newchains
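# Illustration of the inference above (hypothetical PDB codes): if the BGSU equivalence class
# contains 1abc|1|A, mapped by Rfam to RF00005 over residues 1-76, and 2xyz|1|B, which has no
# mapping, then 2xyz|1|B gets an inferred Chain() on RF00005 with a consensus interval built
# from the known mappings of the class. With --full-inference, chains that already have a
# mapping to another family also receive such inferred copies.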
@trace_unhandled_exceptions
def work_mmcif(pdb_id):
""" Look for a CIF file (with all chains) from RCSB
......@@ -1999,8 +2108,11 @@ def work_mmcif(pdb_id):
# Attempt to download it if not present
try:
if not path.isfile(final_filepath):
subprocess.run(["wget", f'http://files.rcsb.org/download/{pdb_id}.cif', "-O", final_filepath], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
if not os.path.isfile(final_filepath):
subprocess.run(
["wget", f'http://files.rcsb.org/download/{pdb_id}.cif', "-O", final_filepath],
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
)
except:
warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True)
return
......@@ -2012,7 +2124,7 @@ def work_mmcif(pdb_id):
# if not, read the CIF header and register the structure
if not len(r):
# Load the MMCIF file with Biopython
mmCif_info = MMCIF2Dict(final_filepath)
mmCif_info = Bio.PDB.MMCIF2Dict.MMCIF2Dict(final_filepath)
# Get info about that structure
try:
......@@ -2036,9 +2148,9 @@ def work_mmcif(pdb_id):
# Save into the database
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution)
VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso))
VALUES (?, ?, DATE(?), ?, ?);""", data=(pdb_id, 1, date, exp_meth, reso))
if not path.isfile(path_to_3D_data + "annotations/" + pdb_id + ".json"):
if not os.path.isfile(path_to_3D_data + "annotations/" + pdb_id + ".json"):
# run DSSR (you need to have it in your $PATH, follow x3dna installation instructions)
output = subprocess.run(["x3dna-dssr", f"-i={final_filepath}", "--json", "--auxfile=no"],
......@@ -2052,22 +2164,23 @@ def work_mmcif(pdb_id):
return 1
# save the analysis to file only if we can load it :/
json_file = open(path_to_3D_data + "annotations/" + pdb_id + ".json", "w")
json_file = open(path_to_3D_data + "annotations/" +
pdb_id + ".json", "w")
json_file.write(stdout)
json_file.close()
return 0
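# In short, the function above: downloads <pdb_id>.cif from RCSB if it is missing,
# registers (date, experimental method, resolution) in the 'structure' table,
# and runs x3dna-dssr to produce annotations/<pdb_id>.json, which work_build_chain() reads.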
@trace_unhandled_exceptions
def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
"""Reads information from JSON and save it to database.
If asked, also extracts the 3D chains from their original structure files.
"""
setproctitle(f"RNAnet.py work_build_chain({c.chain_label})")
if not path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"):
if not os.path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"):
warn(f"Could not find annotations for {c.chain_label}, ignoring it.", error=True)
c.delete_me = True
c.error_messages += f"Could not download and/or find annotations for {c.chain_label}."
......@@ -2094,25 +2207,28 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
return c
@trace_unhandled_exceptions
def work_prepare_sequences(dl, rfam_acc, chains):
"""Prepares FASTA files of homologous sequences to realign with cmalign or SINA."""
"""Prepares FASTA files of homologous sequences to realign with cmalign or SINA.
"""
setproctitle("RNAnet.py work_prepare_sequences()")
if rfam_acc in LSU_set | SSU_set: # rRNA
if path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
# Detect duplicates and remove them
existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
existing_ids = [ r.id for r in existing_afa ]
existing_afa = Bio.AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
existing_ids = [r.id for r in existing_afa]
del existing_afa
new_ids = [ str(c) for c in chains ]
doublons = [ i for i in existing_ids if i in new_ids ]
new_ids = [str(c) for c in chains]
doublons = [i for i in existing_ids if i in new_ids]
del existing_ids, new_ids
if len(doublons):
fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version")
seqfile = SeqIO.parse(fasta, "fasta")
fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
seqfile = Bio.SeqIO.parse(fasta, "fasta")
# remove it and rewrite it with its own content filtered
os.remove(fasta)
with open(fasta, 'w') as f:
for rec in seqfile:
......@@ -2123,16 +2239,15 @@ def work_prepare_sequences(dl, rfam_acc, chains):
with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f:
for c in chains:
if len(c.seq_to_align):
f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n')
f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n')
status = f"{rfam_acc}: {len(chains)} new PDB sequences to align (with SINA)"
elif not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.stk"):
elif not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.stk"):
# there were no previously aligned sequences, so we use cmalign.
# So, we need to download homologous sequences from Rfam.
# Extracting covariance model for this family
if not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}.cm"):
if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}.cm"):
with open(path_to_seq_data + f"realigned/{rfam_acc}.cm", "w") as f:
subprocess.run(["cmfetch", path_to_seq_data + "Rfam.cm", rfam_acc], stdout=f)
notify(f"Extracted {rfam_acc} covariance model (cmfetch)")
......@@ -2141,7 +2256,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
dl.download_Rfam_sequences(rfam_acc)
# Prepare a FASTA file containing Rfamseq hits for that family
if path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): # test if download succeeded
if os.path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): # test if download succeeded
# gunzip the file
with gzip.open(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz", 'rb') as gz:
......@@ -2153,14 +2268,14 @@ def work_prepare_sequences(dl, rfam_acc, chains):
with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus:
ids = set()
# Remove doublons from the Rfam hits
for r in SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
for r in Bio.SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
if r.id not in ids:
ids.add(r.id)
plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n')
# Add the 3D chains sequences
for c in chains:
if len(c.seq_to_align):
plusplus.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n')
plusplus.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n')
del file_content
# os.remove(path_to_seq_data + f"realigned/{rfam_acc}.fa")
......@@ -2175,12 +2290,13 @@ def work_prepare_sequences(dl, rfam_acc, chains):
with open(path_to_seq_data + f"realigned/{rfam_acc}_new.fa", "w") as f:
for c in chains:
if len(c.seq_to_align):
f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n')
f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n')
status = f"{rfam_acc}: {len(chains)} new PDB sequences to realign (with existing cmalign alignment)"
# print some stats
notify(status)
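# Summary of the three cases handled above:
#  - rRNA families (LSU_set | SSU_set): the new 3D sequences are appended to {rfam_acc}++.fa,
#    to be aligned with SINA;
#  - no {rfam_acc}++.stk yet: the covariance model is extracted with cmfetch and the Rfam hits
#    are concatenated with the 3D sequences into {rfam_acc}++.fa for a full cmalign run;
#  - an alignment already exists: only the new 3D sequences go to {rfam_acc}_new.fa,
#    to be aligned separately and merged into the existing alignment by work_realign().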
@trace_unhandled_exceptions
def work_realign(rfam_acc):
""" Runs multiple sequence alignements by RNA family.
......@@ -2209,10 +2325,10 @@ def work_realign(rfam_acc):
else:
# Align using Infernal for most RNA families
if path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
# Alignment exists. We just want to add new sequences into it.
if not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
# there are no new sequences to align...
return
......@@ -2227,13 +2343,13 @@ def work_realign(rfam_acc):
notify("Aligned new sequences together")
# Detect duplicates and remove them
existing_stk = AlignIO.read(existing_ali_path, "stockholm")
existing_ids = [ r.id for r in existing_stk ]
existing_stk = Bio.AlignIO.read(existing_ali_path, "stockholm")
existing_ids = [r.id for r in existing_stk]
del existing_stk
new_stk = AlignIO.read(new_ali_path, "stockholm")
new_ids = [ r.id for r in new_stk ]
new_stk = Bio.AlignIO.read(new_ali_path, "stockholm")
new_ids = [r.id for r in new_stk]
del new_stk
doublons = [ i for i in existing_ids if i in new_ids ]
doublons = [i for i in existing_ids if i in new_ids]
del existing_ids, new_ids
if len(doublons):
warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
......@@ -2241,12 +2357,13 @@ def work_realign(rfam_acc):
toremove.write('\n'.join(doublons)+'\n')
p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
os.remove(path_to_seq_data + "realigned/toremove.txt")
# And we merge the two alignments
p2= subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
"--rna", existing_ali_path, new_ali_path ],
p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
"--rna", existing_ali_path, new_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
......@@ -2263,7 +2380,7 @@ def work_realign(rfam_acc):
p = subprocess.run(["cmalign", "--small", "--cyk", "--noprob", "--nonbanded", "--notrunc",
'-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}++.fa" ],
path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p.stderr.decode("utf-8")
......@@ -2277,7 +2394,9 @@ def work_realign(rfam_acc):
print('\t'+validsymb, flush=True)
# Convert Stockholm to aligned FASTA
subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", "--informat", "stockholm", "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
"--informat", "stockholm",
"afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part.
# Assert everything worked, or save an error
......@@ -2288,6 +2407,7 @@ def work_realign(rfam_acc):
with open(runDir + "/errors.txt", "a") as er:
er.write(f"Failed to realign {rfam_acc} (killed)")
def summarize_position(counts):
""" Counts the number of nucleotides at a given position, given a "column" from a MSA.
"""
......@@ -2303,15 +2423,15 @@ def summarize_position(counts):
N += counts[char] # number of ungapped residues
if N: # prevent division by zero if the column is only gaps
return ( counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N) # other residues, or consensus (N, K, Y...)
return (counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N) # other residues, or consensus (N, K, Y...)
else:
return (0, 0, 0, 0, 0)
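# Worked example (assuming, as the comments above suggest, that gap characters are not
# counted in N): for a column counted as {'A': 3, 'C': 1, '-': 6}, N = 4 and the function
# returns (0.75, 0.25, 0.0, 0.0, 0.0), i.e. (freq_A, freq_C, freq_G, freq_U, freq_other).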
@trace_unhandled_exceptions
def work_pssm(f, fill_gaps):
""" Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
Also saves every chain of the family to file.
Uses only 1 core, so this function can be called in parallel.
"""
......@@ -2323,18 +2443,17 @@ def work_pssm(f, fill_gaps):
# get the chains of this family
list_of_chains = rfam_acc_to_download[f]
chains_ids = [ str(c) for c in list_of_chains ]
chains_ids = [str(c) for c in list_of_chains]
# Open the alignment
try:
align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
align = Bio.AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
except:
warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True)
with open(runDir + "/errors.txt", "a") as errf:
errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n")
return 1
# Compute statistics per column
pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx)
frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ]
......@@ -2378,10 +2497,13 @@ def work_pssm(f, fill_gaps):
# Save the re_mappings
conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
sql_execute(conn, "INSERT INTO re_mapping (chain_id, index_chain, index_ali) VALUES (?, ?, ?) ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;", many=True, data=re_mappings)
sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali)
VALUES (?, ?, ?)
ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
many=True, data=re_mappings)
# Save the useful columns in the database
data = [ (f, j) + frequencies[j-1] for j in sorted(columns_to_save) ]
data = [(f, j) + frequencies[j-1] for j in sorted(columns_to_save)]
sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other)
VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, freq_other=excluded.freq_other;""", many=True, data=data)
......@@ -2412,12 +2534,13 @@ def work_pssm(f, fill_gaps):
pbar.close()
sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?,
is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ?
WHERE chain_id = ? AND index_chain = ?;""", many=True, data = gaps)
WHERE chain_id = ? AND index_chain = ?;""", many=True, data=gaps)
conn.close()
idxQueue.put(thr_idx) # replace the thread index in the queue
return 0
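# To sum up, work_pssm() stores three things per family: the re_mapping rows
# (chain_id, index_chain, index_ali), the align_column rows built from summarize_position()
# (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other), and (depending on the
# fill_gaps option, as its name suggests) updated nt_align_code / is_A..is_other values for
# gapped positions in the nucleotide table.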
@trace_unhandled_exceptions
def work_save(c, homology=True):
......@@ -2451,9 +2574,11 @@ def work_save(c, homology=True):
df.to_csv(filename, float_format="%.2f", index=False)
if __name__ == "__main__":
runDir = path.dirname(path.realpath(__file__))
runDir = os.getcwd()
fileDir = os.path.dirname(os.path.realpath(__file__))
ncores = read_cpu_number()
pp = Pipeline()
pp.process_options()
......@@ -2502,7 +2627,6 @@ if __name__ == "__main__":
print("Completed.")
exit(0)
# At this point, structure, chain and nucleotide tables of the database are up to date.
# (Modulo some statistics computed by statistics.py)
......@@ -2511,13 +2635,14 @@ if __name__ == "__main__":
# ===========================================================================
if pp.SELECT_ONLY is None:
pp.checkpoint_load_chains() # If your job failed, you can comment all the "3D information" part and start from here.
# If your job failed, you can comment all the "3D information" part and start from here.
pp.checkpoint_load_chains()
# Get the list of Rfam families found
rfam_acc_to_download = {}
for c in pp.loaded_chains:
if c.mapping.rfam_acc not in rfam_acc_to_download:
rfam_acc_to_download[c.mapping.rfam_acc] = [ c ]
rfam_acc_to_download[c.mapping.rfam_acc] = [c]
else:
rfam_acc_to_download[c.mapping.rfam_acc].append(c)
......@@ -2546,5 +2671,5 @@ if __name__ == "__main__":
print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove
# # so i can sleep for the end of the night
# so i can sleep for the end of the night
# subprocess.run(["poweroff"])
......
1ml5_1_a_1-2914
1ml5_1_a_151-2903
1ml5_1_A_7-1515
1ml5_1_A_2-1520
1ml5_1_A_7-1518
1ml5_1_b_5-121
1eg0_1_O_1-73
2rdo_1_A_3-118
4v48_1_A9_3-118
4v47_1_A9_3-118
6zmi_1_L8_1267-4755
6zm7_1_L8_1267-4755
6y6x_1_L8_1267-4755
6z6n_1_L8_1267-4755
6qzp_1_L8_1267-4755
6zme_1_L8_1267-4755
6z6l_1_L8_1267-4755
6ek0_1_L8_1267-4755
6zmo_1_L8_1267-4755
6z6m_1_L8_1267-4755
6ole_1_D_1267-4755
6om0_1_D_1267-4755
6y2l_1_L8_1267-4755
6lqm_1_8_1267-4755
6y0g_1_L8_1267-4755
6lu8_1_8_1267-4755
6lsr_1_8_1267-4755
6lss_1_8_1267-4755
6oli_1_D_1267-4755
6olg_1_A3_1267-4755
6y57_1_L8_1267-4755
5t2c_1_C_1267-4755
6om7_1_D_1267-4755
4ug0_1_L8_1267-4755
6olf_1_D_1267-4755
6ip5_1_1C_1267-4755
6ip8_1_1C_1267-4755
6olz_1_A3_1267-4755
5aj0_1_A3_1267-4755
5lks_1_L8_1267-4755
6ip6_1_1C_1267-4755
4v6x_1_A8_1267-4755
1vy7_1_AY_1-73
1vy7_1_CY_1-73
4w2h_1_CY_1-73
2z9q_1_A_1-72
1jgq_1_A_2-1520
4v42_1_AA_2-1520
1jgo_1_A_2-1520
1jgp_1_A_2-1520
1ml5_1_A_2-1520
4v42_1_BA_1-2914
1ml5_1_a_1-2914
4v42_1_BB_5-121
1ml5_1_b_5-121
2rdo_1_B_1-2904
4v48_1_A0_1-2904
4v47_1_A0_1-2904
4v48_1_BA_1-1543
4v47_1_BA_1-1542
1ls2_1_B_1-73
3ep2_1_Y_1-72
3eq3_1_Y_1-72
4v48_1_A6_1-73
1eg0_1_O_1-73
2z9q_1_A_1-72
1gsg_1_T_1-72
3jcr_1_H_1-115
4v42_1_BA_1-2914
4v42_1_BA_151-2903
4v48_1_BA_1-91
4v48_1_BA_6-1541
4v48_1_BA_1-1543
4v48_1_BA_6-1538
4v47_1_BA_1-91
4v47_1_BA_6-1540
4v47_1_BA_1-1542
4v47_1_BA_6-1537
2rdo_1_B_1-2903
2rdo_1_B_6-1460
2rdo_1_B_1-1528
2rdo_1_B_6-1457
2rdo_1_B_160-2893
2rdo_1_B_1-2904
2rdo_1_B_6-1522
4v48_1_A0_1-2903
4v48_1_A0_6-1460
4v48_1_A0_1-1528
4v48_1_A0_6-1457
4v48_1_A0_160-2893
4v48_1_A0_1-2904
4v48_1_A0_6-1522
4v47_1_A0_1-2903
4v47_1_A0_6-1460
4v47_1_A0_1-1528
4v47_1_A0_6-1457
4v47_1_A0_160-2893
4v47_1_A0_1-2904
4v47_1_A0_6-1522
1x1l_1_A_1-132
1zc8_1_Z_1-93
2ob7_1_D_1-132
2ob7_1_A_10-319
1x1l_1_A_1-130
1zc8_1_Z_1-130
1zc8_1_Z_1-91
2ob7_1_D_1-130
6rxu_1_C2_588-2386
6rxu_1_C2_583-2388
6rxu_1_C2_588-2383
5oql_1_2_588-2386
5oql_1_2_583-2388
5oql_1_2_588-2383
6rxv_1_C2_588-2386
6rxv_1_C2_583-2388
6rxv_1_C2_588-2383
6rxz_1_C2_588-2386
6rxz_1_C2_583-2388
6rxz_1_C2_588-2383
6rxy_1_C2_588-2386
6rxy_1_C2_583-2388
6rxy_1_C2_588-2383
6rxt_1_C2_588-2386
6rxt_1_C2_583-2388
6rxt_1_C2_588-2383
1r2x_1_C_1-58
1r2w_1_C_1-58
1eg0_1_L_1-57
1eg0_1_L_1-56
1jgq_1_A_7-1518
1jgq_1_A_20-55
1jgq_1_A_2-1520
1jgq_1_A_7-1515
4v42_1_AA_7-1518
4v42_1_AA_20-55
4v42_1_AA_2-1520
4v42_1_AA_7-1515
1jgo_1_A_7-1518
1jgo_1_A_20-55
1jgo_1_A_2-1520
1jgo_1_A_7-1515
1jgp_1_A_7-1518
1jgp_1_A_20-55
1jgp_1_A_2-1520
1jgp_1_A_7-1515
1zc8_1_A_1-59
1mvr_1_D_1-59
4c9d_1_D_29-1
4c9d_1_C_29-1
4adx_1_9_1-121
1mvr_1_D_1-61
4adx_1_9_1-123
1zn1_1_B_1-59
1emi_1_B_1-108
3iy9_1_A_498-1027
......@@ -143,25 +49,1558 @@
3cw1_1_V_1-138
3cw1_1_v_1-138
2iy3_1_B_9-105
3jcr_1_N_1-188
3jcr_1_N_1-106
3jcr_1_N_1-107
2vaz_1_A_64-177
2ftc_1_R_1-1568
2ftc_1_R_792-1568
2ftc_1_R_81-1466
3jcr_1_M_1-141
3jcr_1_M_1-188
3jcr_1_M_1-107
4v5z_1_B0_1-2899
4v5z_1_B0_1-2902
4v5z_1_B0_1-2840
5g2x_1_A_595-692
3iy8_1_A_1-540
4v5z_1_BY_2-113
4v5z_1_BZ_1-70
1mvr_1_B_1-96
4adx_1_0_1-2923
4adx_1_0_132-2915
4v5z_1_B1_2-125
1mvr_1_B_3-96
4adx_1_0_1-2925
3eq4_1_Y_1-69
6uz7_1_8_2140-2827
4v5z_1_AA_1-1563
4v5z_1_AA_1-1562
6cfj_1_1X
6cfj_1_2X
5hcq_1_1X
6cae_1_1X
5hcq_1_2X
5hcr_1_1X
4z8c_1_1X
5j4b_1_1X
5j4b_1_2X
4z8c_1_2X
6cae_1_2X
5j4c_1_1X
5w4k_1_1X
6of1_1_1X
5hcr_1_2X
5hd1_1_1X
5hcp_1_1X
6of1_1_2X
5hau_1_1W
5j4c_1_2X
5wis_1_1X
6xqd_1_1X
6nd5_1_1X
5w4k_1_2X
5hau_1_2W
6xqd_1_2X
4y4p_1_1X
6o97_1_1X
5hcp_1_2X
5doy_1_1X
4zer_1_1X
5wit_1_1X
5hd1_1_2X
6nd5_1_2X
4z3s_1_1X
7jql_1_1X
7jqm_1_1X
7jql_1_2X
5wis_1_2X
6nd6_1_1X
6o97_1_2X
4y4p_1_2X
7jqm_1_2X
4z3s_1_2X
4zer_1_2X
6uo1_1_2X
6uo1_1_1X
5doy_1_2X
5wit_1_2X
5f8k_1_1X
6nd6_1_2X
6xqe_1_1X
6xqe_1_2X
6n9e_1_1X
6n9e_1_2X
6n9f_1_1X
5f8k_1_2X
6n9f_1_2X
6xz7_1_F
6y69_1_W
5afi_1_V
5afi_1_W
6h4n_1_W
5wdt_1_V
5wfs_1_V
5wdt_1_W
5wfs_1_W
5we4_1_V
5we4_1_W
5uq8_1_Y
6c4i_1_Y
6c4i_1_X
5zeb_1_V
5zep_1_W
5lzd_1_V
5we6_1_V
5wfk_1_V
5wfk_1_W
5we6_1_W
5u4i_1_Y
5uq7_1_Y
5u4i_1_X
5lza_1_V
5wf0_1_V
5wf0_1_W
5zeu_1_V
5l3p_1_X
3jcj_1_V
6gxm_1_X
6gwt_1_X
6gxn_1_X
6gxo_1_X
3j9y_1_V
6o9k_1_Y
6o7k_1_V
5lzf_1_V
3jcn_1_V
5lzc_1_V
5u4j_1_X
5u4j_1_Z
5lzb_1_V
6h58_1_W
6h58_1_WW
1eg0_1_O
5j8b_1_X
4v7j_1_AV
4v7j_1_BV
4v7k_1_BV
4v7k_1_AV
4v7k_1_BW
4v7k_1_AW
4v7j_1_AW
4v7j_1_BW
4v4j_1_Z
6i0v_1_B
5k77_1_X
5k77_1_V
5k77_1_Y
5k77_1_W
5k77_1_Z
4pei_1_X
4pei_1_V
4pei_1_W
4pei_1_Z
4pei_1_Y
4a3c_1_P
4a3e_1_P
6lkq_1_U
7k00_1_B
6qdw_1_A
2rdo_1_A
4v48_1_A9
4v47_1_A9
6hcj_1_Q3
6hcq_1_Q3
5mmm_1_Z
4w2e_1_W
5j4b_1_1Y
6cfj_1_1W
5w4k_1_1Y
5wit_1_1W
6cfj_1_1Y
6cfj_1_2W
5j4c_1_1W
5wis_1_1Y
5j4c_1_1Y
6cfj_1_2Y
5wis_1_1W
5j4b_1_1W
5j4c_1_2W
5j4b_1_2W
5j4b_1_2Y
5j4c_1_2Y
5w4k_1_1W
6nd5_1_1Y
5wis_1_2Y
5wit_1_2W
5doy_1_1Y
5w4k_1_2Y
4y4p_1_1Y
4z3s_1_1Y
5doy_1_1W
5doy_1_2Y
6nd5_1_1W
4z3s_1_2Y
4z3s_1_1W
5w4k_1_2W
6nd5_1_2Y
4y4p_1_2Y
6uo1_1_2Y
6uo1_1_2W
4y4p_1_1W
4z3s_1_2W
6uo1_1_1Y
6uo1_1_1W
5wis_1_2W
5wit_1_1Y
6nd5_1_2W
4y4p_1_2W
5doy_1_2W
5wit_1_2Y
6ucq_1_1Y
4v4i_1_Z
6ucq_1_1X
6ucq_1_2Y
4w2e_1_X
6ucq_1_2X
6yss_1_W
5afi_1_Y
5uq8_1_Z
5wdt_1_Y
5wfs_1_Y
6ysr_1_W
5we4_1_Y
6yst_1_W
5uq7_1_Z
5we6_1_Y
5wfk_1_Y
5wf0_1_Y
6o9j_1_V
6ysu_1_W
3j46_1_A
5j8b_1_Y
5j8b_1_W
3bbv_1_Z
5aj0_1_BV
5aj0_1_BW
4wt8_1_AB
4wt8_1_BB
4v4j_1_Y
4v4i_1_Y
5uq8_1_X
5uq7_1_X
1jgq_1_A
4v42_1_AA
1jgo_1_A
1jgp_1_A
1ml5_1_A
4v4j_1_W
4v4i_1_W
4v42_1_BA
4wt8_1_CS
4wt8_1_DS
4v4j_1_X
4v4i_1_X
4v42_1_BB
6uu4_1_333
6uu0_1_333
6uuc_1_333
6uu2_1_333
6b6h_1_3
6pb4_1_3
6d30_1_C
6j7z_1_C
3er9_1_D
5kal_1_Y
4nia_1_3
5kal_1_Z
4nia_1_7
4nia_1_4
5new_1_C
4nia_1_U
4nia_1_6
4oq9_1_7
4nia_1_1
4oq9_1_4
4nia_1_8
4oq9_1_8
4nia_1_5
2vrt_1_E
4nia_1_W
4oq9_1_6
4oq8_1_D
4nia_1_Z
4oq9_1_W
4oq9_1_5
4nia_1_2
2vrt_1_F
4oq9_1_U
4oq9_1_Z
4oq9_1_2
4oq9_1_3
1ddl_1_E
4oq9_1_1
6rt5_1_A
6rt5_1_E
4qu6_1_B
6lkq_1_T
6qdw_1_B
3jbv_1_B
3jbu_1_B
2rdo_1_B
4v48_1_A0
4v47_1_A0
6do8_1_B
6dpi_1_B
6dp9_1_B
6dpb_1_B
6dmn_1_B
6dpp_1_B
6dpk_1_B
6dpd_1_B
6dot_1_B
6dok_1_B
6dp8_1_B
6dpl_1_B
6dpg_1_B
6dou_1_B
6dpc_1_B
6do9_1_B
6dmv_1_B
6dp4_1_B
6dpn_1_B
6doj_1_B
6dph_1_B
6dos_1_B
6doo_1_B
6dp6_1_B
6dox_1_B
6dp5_1_B
6dol_1_B
6dp1_1_B
6doz_1_B
6dp7_1_B
6doq_1_B
6dpa_1_B
6dom_1_B
6dog_1_B
6dop_1_B
6doh_1_B
6doa_1_B
6don_1_B
6dov_1_B
6dpo_1_B
6dod_1_B
6dob_1_B
6dow_1_B
6dpm_1_B
6dpf_1_B
6dp3_1_B
6dp2_1_B
6dpe_1_B
6dpj_1_B
6dor_1_B
6dof_1_B
6dp0_1_B
6doi_1_B
6doc_1_B
6doe_1_B
6n6g_1_D
6lkq_1_S
5h5u_1_H
5lze_1_Y
5lze_1_V
5lze_1_X
3jcj_1_G
6o7k_1_G
4v48_1_BA
4v47_1_BA
4b3r_1_W
4b3t_1_W
4b3s_1_W
5o2r_1_X
5kcs_1_1X
6fti_1_U
6fti_1_W
6ftj_1_U
6ftj_1_W
6ftg_1_U
6ftg_1_W
6ole_1_T
6om0_1_T
6oli_1_T
6om7_1_T
6olf_1_T
6w6l_1_T
6x1b_1_D
6x1b_1_F
5f6c_1_C
6i0t_1_B
1b2m_1_C
1b2m_1_D
1b2m_1_E
2uxc_1_Y
4a3g_1_P
4a3j_1_P
7k00_1_5
5mmi_1_Z
3j9m_1_U
6nu2_1_U
6nu3_1_U
5c0y_1_C
6n6f_1_D
4ohy_1_B
4oi1_1_B
4oi0_1_B
6raz_1_Y
5ipl_1_3
6utw_1_333
5ipm_1_3
5ipn_1_3
4ylo_1_3
4yln_1_6
4ylo_1_6
4yln_1_3
4yln_1_9
5lzf_1_Y
1n32_1_Z
5zsl_1_D
5zsd_1_C
5zsd_1_D
5zsl_1_E
4nku_1_D
4nku_1_H
1cwp_1_E
6qik_1_Y
6rzz_1_Y
6ri5_1_Y
6qt0_1_Y
6qtz_1_Y
6t83_1_1B
6t83_1_3B
6t83_1_AA
6t83_1_CA
6s05_1_Y
5jcs_1_X
5fl8_1_X
3erc_1_G
6of1_1_1W
6cae_1_1Y
6o97_1_1W
6of1_1_1Y
6of1_1_2W
6o97_1_1Y
6nd6_1_1Y
6cae_1_1W
6of1_1_2Y
6cae_1_2Y
6nd6_1_1W
6cae_1_2W
6o97_1_2Y
6nd6_1_2Y
6o97_1_2W
6nd6_1_2W
6xz7_1_G
6gz5_1_BW
6gz3_1_BW
1ls2_1_B
3ep2_1_Y
3eq3_1_Y
4v48_1_A6
2z9q_1_A
4hot_1_X
6d2z_1_C
4tu0_1_F
4tu0_1_G
6r9o_1_B
6is0_1_C
5lzc_1_X
5lzb_1_X
5lzd_1_Y
5lzc_1_Y
5lzb_1_Y
1gsg_1_T
6zvi_1_D
6sv4_1_NB
6sv4_1_NC
6i7o_1_NB
5y88_1_X
3j6x_1_IR
3j6y_1_IR
6tb3_1_N
6tnu_1_N
2uxb_1_X
2x1f_1_B
2x1a_1_B
3eq3_1_D
3ep2_1_D
1eg0_1_M
3eq4_1_D
5o1y_1_B
3jcr_1_H
6dzi_1_H
5zeu_1_A
6mpi_1_W
5mfx_1_B
5w0m_1_J
5bud_1_E
5w0m_1_I
5w0m_1_H
4j7m_1_B
5bud_1_D
6a4e_1_B
6a4e_1_D
6hxx_1_AA
6hxx_1_AB
6hxx_1_AC
6hxx_1_AD
6hxx_1_AE
6hxx_1_AF
6hxx_1_AG
6hxx_1_AH
6hxx_1_AI
6hxx_1_AJ
6hxx_1_AK
6hxx_1_AL
6hxx_1_AM
6hxx_1_AN
6hxx_1_AO
6hxx_1_AP
6hxx_1_AQ
6hxx_1_AR
6hxx_1_AS
6hxx_1_AT
6hxx_1_AU
6hxx_1_AV
6hxx_1_AW
6hxx_1_AX
6hxx_1_AY
6hxx_1_AZ
6hxx_1_BA
6hxx_1_BB
6hxx_1_BC
6hxx_1_BD
6hxx_1_BE
6hxx_1_BF
6hxx_1_BG
6hxx_1_BH
6hxx_1_BI
5odv_1_A
5odv_1_B
5odv_1_C
5odv_1_D
5odv_1_E
5odv_1_F
5odv_1_G
5odv_1_H
5odv_1_I
5odv_1_J
5odv_1_K
5odv_1_L
5odv_1_M
5odv_1_N
5odv_1_O
5odv_1_P
5odv_1_Q
5odv_1_R
5odv_1_S
5odv_1_T
5odv_1_U
5odv_1_V
5odv_1_W
5odv_1_X
6t34_1_A
6t34_1_B
6t34_1_C
6t34_1_D
6t34_1_E
6t34_1_F
6t34_1_G
6t34_1_H
6t34_1_I
6t34_1_J
6t34_1_K
6t34_1_L
6t34_1_M
6t34_1_N
6t34_1_O
6t34_1_P
6t34_1_Q
6t34_1_R
6t34_1_S
6ip8_1_ZY
6ip5_1_ZY
6ip5_1_ZU
6ip6_1_ZY
6ip8_1_ZZ
6ip6_1_ZZ
6uu3_1_333
6uu1_1_333
1pn8_1_D
3er8_1_H
3er8_1_G
3er8_1_F
5o3j_1_B
4dr7_1_B
1i5l_1_Y
1i5l_1_U
4dr6_1_B
6i2n_1_U
4v68_1_A0
6vyu_1_Y
6vyw_1_Y
6vz7_1_Y
6vz5_1_Y
6vz3_1_Y
6vyy_1_Y
6vyx_1_Y
6vyz_1_Y
6vz2_1_Y
1mvr_1_1
6vyt_1_Y
1cgm_1_I
3jb7_1_T
3jb7_1_M
3j0o_1_D
3j0l_1_D
3j0q_1_D
3j0p_1_D
5elt_1_F
5elt_1_E
2tmv_1_R
5a79_1_R
5a7a_1_R
2om3_1_R
2xea_1_R
4wtl_1_T
4wtl_1_P
1xnq_1_W
1x18_1_C
1x18_1_B
1x18_1_D
1vq6_1_4
4am3_1_D
4am3_1_H
4am3_1_I
4lj0_1_C
4lj0_1_D
4lj0_1_E
5lzy_1_HH
4wtj_1_T
4wtj_1_P
4xbf_1_D
6ow3_1_I
6ovy_1_I
6oy6_1_I
6n6d_1_D
6n6k_1_C
6n6k_1_D
3rtj_1_D
1apg_1_D
6ty9_1_M
6tz1_1_N
4bbl_1_Y
4bbl_1_Z
6sce_1_B
6scf_1_I
6scf_1_K
6yud_1_K
6yud_1_O
6scf_1_M
6yud_1_P
6scf_1_L
6yud_1_M
6yud_1_Q
6o6x_1_D
4ba2_1_R
6o6x_1_C
6o7b_1_C
6o6v_1_C
6r7b_1_D
6r9r_1_D
6ov0_1_E
6ov0_1_H
6ov0_1_G
6o6v_1_D
6ov0_1_F
6o7b_1_D
5e02_1_C
6r9r_1_E
6r7b_1_E
6o7i_1_I
6o7h_1_K
7jyy_1_F
7jyy_1_E
7jz0_1_F
7jz0_1_E
6rt6_1_A
6rt6_1_E
1y1y_1_P
5zuu_1_I
5zuu_1_G
4peh_1_W
4peh_1_V
4peh_1_X
4peh_1_Y
4peh_1_Z
6mkn_1_W
4cxg_1_C
4cxh_1_C
1x1l_1_A
1zc8_1_Z
2ob7_1_D
2ob7_1_A
4eya_1_E
4eya_1_F
4eya_1_Q
4eya_1_R
2r1g_1_B
4ht9_1_E
1cvj_1_M
6z1p_1_AB
6z1p_1_AA
4ii9_1_C
5mq0_1_3
5uk4_1_X
5uk4_1_V
5uk4_1_W
5uk4_1_U
5f6c_1_E
4rcj_1_B
1xnr_1_W
6e0o_1_C
6o75_1_D
6o75_1_C
6e0o_1_B
3j06_1_R
1r2x_1_C
1r2w_1_C
1eg0_1_L
4eya_1_G
4eya_1_H
4eya_1_S
4eya_1_T
4dr4_1_V
1ibl_1_Z
1ibm_1_Z
4dr5_1_V
4d61_1_J
1trj_1_B
1trj_1_C
6q8y_1_N
6sv4_1_N
6i7o_1_N
5k8h_1_A
5z4a_1_B
3jbu_1_V
1h2c_1_R
1h2d_1_S
1h2d_1_R
6szs_1_X
5mgp_1_X
6enu_1_X
6enf_1_X
6enj_1_X
1pvo_1_L
1pvo_1_G
1pvo_1_H
1pvo_1_J
1pvo_1_K
2ht1_1_K
2ht1_1_J
6eri_1_AX
1zc8_1_A
1zc8_1_C
1zc8_1_B
1zc8_1_G
1zc8_1_I
1zc8_1_H
1zc8_1_J
4v8z_1_CX
6kqe_1_I
5uh8_1_I
5vi5_1_Q
4xln_1_T
4xlr_1_T
4xln_1_Q
5i2d_1_K
5i2d_1_V
4xlr_1_Q
6sty_1_C
6sty_1_F
2xs5_1_D
3ok4_1_N
3ok4_1_L
3ok4_1_Z
3ok4_1_4
3ok4_1_V
3ok4_1_X
3ok4_1_P
3ok4_1_H
3ok4_1_J
3ok4_1_R
3ok4_1_T
3ok4_1_2
6n6h_1_D
5wnt_1_B
3b0u_1_B
3b0u_1_A
4x9e_1_G
4x9e_1_H
6z1p_1_BB
6z1p_1_BA
2uxd_1_X
4qvd_1_H
4v7e_1_AB
3ol9_1_D
3ol9_1_H
3ol9_1_L
3ol9_1_P
3olb_1_L
3olb_1_P
3olb_1_D
3olb_1_H
3ol6_1_D
3ol6_1_H
3ol6_1_L
3ol6_1_P
3ol8_1_D
3ol8_1_H
3ol7_1_L
3ol7_1_P
3ol7_1_D
3ol7_1_H
3ol8_1_L
3ol8_1_P
1qzc_1_C
1qzc_1_A
6ole_1_V
6om0_1_V
6oli_1_V
6om7_1_V
6w6l_1_V
6olf_1_V
1mvr_1_D
4wtm_1_T
4wtm_1_P
5x70_1_E
5x70_1_G
6gz5_1_BV
6gz4_1_BV
6gz3_1_BV
6fti_1_Q
4v7e_1_AE
4v7e_1_AD
4x62_1_B
4x64_1_B
4x65_1_B
1xmq_1_W
4x66_1_B
3t1h_1_W
3t1y_1_W
1xmo_1_W
4adx_1_9
6kr6_1_B
1zn1_1_B
6z8k_1_X
1cvj_1_Q
4csf_1_U
4csf_1_Q
4csf_1_G
4csf_1_M
4csf_1_K
4csf_1_A
4csf_1_I
4csf_1_S
4csf_1_C
4csf_1_W
4csf_1_O
4csf_1_E
1cvj_1_N
1cvj_1_O
1cvj_1_S
1cvj_1_P
1cvj_1_T
1cvj_1_R
6th6_1_AA
6skg_1_AA
6skf_1_AA
6q8y_1_M
6i7o_1_M
6zmw_1_W
6ybv_1_W
2fz2_1_D
2xpj_1_D
2vrt_1_H
2vrt_1_G
1emi_1_B
6r9m_1_B
4nia_1_C
4nia_1_A
4nia_1_H
4nia_1_N
4nia_1_G
4nia_1_D
4nia_1_B
4nia_1_I
4nia_1_E
4nia_1_M
4oq9_1_I
4oq9_1_G
4oq9_1_C
4oq9_1_H
4oq9_1_N
4oq9_1_A
4oq9_1_D
4oq9_1_E
4oq9_1_M
4oq9_1_B
5uhc_1_I
1uvn_1_F
1uvn_1_B
1uvn_1_D
3iy9_1_A
4wtk_1_T
4wtk_1_P
1vqn_1_4
4oav_1_C
4oav_1_A
3ep2_1_E
3eq3_1_E
3eq4_1_E
3ep2_1_A
3eq3_1_A
3eq4_1_A
3ep2_1_C
3eq3_1_C
3eq4_1_C
3ep2_1_B
3eq3_1_B
3eq4_1_B
4i67_1_B
3pgw_1_R
3pgw_1_N
3cw1_1_X
3cw1_1_W
3cw1_1_V
5it9_1_I
6k32_1_T
6k32_1_P
5mmj_1_A
5x8r_1_A
3j2k_1_3
3j2k_1_2
3j2k_1_1
3j2k_1_0
3j2k_1_4
3nvk_1_G
3nvk_1_S
2iy3_1_B
1cwp_1_F
5z4j_1_B
5gmf_1_E
5gmf_1_H
6e4p_1_J
5gmf_1_F
5gmf_1_G
5gmg_1_D
5gmg_1_C
6e4p_1_K
3ie1_1_E
3ie1_1_H
3ie1_1_F
4dr7_1_V
3ie1_1_G
3s4g_1_C
3s4g_1_B
2qqp_1_R
2zde_1_E
2zde_1_F
2zde_1_H
2zde_1_G
1nb7_1_E
1nb7_1_F
4hos_1_X
3p6y_1_T
3p6y_1_V
3p6y_1_U
3p6y_1_Q
3p6y_1_W
5dto_1_B
4cxh_1_X
1uvj_1_F
1uvj_1_D
1uvj_1_E
6kqd_1_I
6kqd_1_S
5uh5_1_I
1ytu_1_F
1ytu_1_D
4kzz_1_J
5t2c_1_AN
4v5z_1_BF
3j6b_1_E
4v4f_1_B6
4v4f_1_A5
4v4f_1_A3
4v4f_1_B0
4v4f_1_B9
4v4f_1_A2
4v4f_1_A8
4v4f_1_A1
4v4f_1_A9
4v4f_1_BZ
4v4f_1_B8
4v4f_1_B7
4v4f_1_B5
4v4f_1_A0
4v4f_1_A7
4v4f_1_A4
4v4f_1_AZ
4v4f_1_B3
4v4f_1_B1
4v4f_1_B4
4v4f_1_A6
4v4f_1_B2
5flx_1_Z
5zsb_1_C
5zsb_1_D
5zsn_1_D
5zsn_1_E
3jcr_1_N
6gfw_1_R
2vaz_1_A
1qzc_1_B
1mvr_1_C
4v5z_1_BP
6n6e_1_D
4g7o_1_I
4g7o_1_S
5x22_1_S
5x22_1_I
5x21_1_I
5uh6_1_I
6l74_1_I
5uh9_1_I
2ftc_1_R
6sag_1_R
4udv_1_R
2r1g_1_E
5zsc_1_D
5zsc_1_C
6woy_1_I
6wox_1_I
6evj_1_N
6evj_1_M
4gkk_1_W
4v9e_1_AG
4v9e_1_BM
4v9e_1_AM
4v9e_1_AA
4v9e_1_BA
4v9e_1_BG
5lzs_1_II
6fqr_1_C
6ha1_1_X
5kcr_1_1X
2r1g_1_X
3m7n_1_Z
3m85_1_X
3m85_1_Z
3m85_1_Y
1e8s_1_C
5wnp_1_B
5wnv_1_B
5yts_1_B
1utd_1_6
1utd_1_Z
1utd_1_4
1utd_1_7
1utd_1_9
1utd_1_5
1utd_1_3
1utd_1_2
1utd_1_8
1utd_1_1
6n6i_1_C
6n6i_1_D
6n6a_1_D
6ij2_1_F
6ij2_1_G
6ij2_1_H
6ij2_1_E
3u2e_1_D
3u2e_1_C
5uef_1_C
5uef_1_D
4x4u_1_H
4afy_1_D
6oy5_1_I
6owl_1_B
6owl_1_C
4afy_1_C
4lq3_1_R
6s0m_1_C
6gx6_1_B
4k4s_1_D
4k4s_1_H
4k4t_1_H
4k4t_1_D
1zn1_1_C
1zn0_1_C
1xpu_1_G
1xpu_1_L
1xpr_1_L
1xpu_1_H
1xpo_1_K
1xpo_1_J
1xpu_1_J
1xpo_1_H
1xpr_1_J
1xpu_1_K
1xpr_1_K
1xpo_1_M
1xpo_1_L
1xpu_1_M
1xpr_1_M
1xpo_1_G
1xpr_1_H
1xpr_1_G
6gc5_1_F
6gc5_1_H
6gc5_1_G
4v7e_1_AA
4v7e_1_AC
1n1h_1_B
4ohz_1_B
6t83_1_6B
4gv6_1_C
4gv6_1_B
4gv3_1_C
4gv3_1_B
4gv9_1_E
6i7o_1_L
2a8v_1_D
6qx3_1_G
2xnr_1_C
4gkj_1_W
4v5z_1_BC
4v5z_1_BB
4v5z_1_BH
3j0o_1_F
3j0l_1_F
3j0p_1_F
3j0q_1_F
3j0o_1_B
3j0l_1_B
3j0o_1_C
3j0l_1_C
3j0q_1_C
3j0p_1_C
3j0o_1_A
3j0l_1_A
3j0q_1_A
3j0p_1_A
1cwp_1_D
4v5z_1_BJ
5sze_1_C
6wre_1_D
6i0u_1_B
5zsa_1_C
5zsa_1_D
1n34_1_Z
3pf5_1_S
6ppn_1_A
6ppn_1_I
6qdw_1_V
5hk0_1_F
4qm6_1_D
4qm6_1_C
4jzu_1_C
4jzv_1_C
5ytv_1_B
4k4z_1_P
4k4z_1_D
4k4x_1_L
4k4z_1_L
4k4x_1_D
4k4z_1_H
4k4x_1_H
4k4x_1_P
1t1m_1_A
1t1m_1_B
4a3b_1_P
4a3m_1_P
6u6y_1_E
6u6y_1_G
6u6y_1_F
6u6y_1_H
6qik_1_X
6rzz_1_X
6ri5_1_X
6qt0_1_X
6qtz_1_X
6s05_1_X
6t83_1_BB
6t83_1_4B
5fl8_1_Z
5jcs_1_Z
5mrc_1_BB
5mre_1_BB
5mrf_1_BB
6gz4_1_BW
3j46_1_P
3jcr_1_M
4e6b_1_A
4e6b_1_B
6a6l_1_D
4v5z_1_BS
4v8t_1_1
1uvi_1_D
1uvi_1_F
1uvi_1_E
4m7d_1_P
4k4u_1_D
4k4u_1_H
6rt7_1_E
6rt7_1_A
2voo_1_C
2voo_1_D
5k78_1_X
5k78_1_Y
4ylo_1_9
4kzy_1_I
4kzz_1_I
4kzx_1_I
5vyc_1_I2
5vyc_1_I3
5vyc_1_I5
5vyc_1_I1
5vyc_1_I6
5vyc_1_I4
6ip8_1_2M
6ip5_1_2M
6ip6_1_2M
6qcs_1_M
486d_1_G
2r1g_1_C
486d_1_F
4v5z_1_B0
4nia_1_O
4nia_1_J
4nia_1_K
4nia_1_L
4nia_1_F
4oq9_1_K
4oq9_1_O
4oq9_1_J
4oq9_1_F
4oq9_1_L
5tbw_1_SR
6hhq_1_SR
6zvi_1_H
6sv4_1_2B
6sv4_1_2C
6t83_1_2B
6t83_1_A
6i7o_1_2B
6r9q_1_B
6v3a_1_SN1
6v3b_1_SN1
6v39_1_SN1
6v3e_1_SN1
1pn7_1_C
1mj1_1_Q
1mj1_1_R
4dr6_1_V
6kql_1_I
4eya_1_M
4eya_1_N
4eya_1_A
4eya_1_B
2wj8_1_D
2wj8_1_I
2wj8_1_L
2wj8_1_F
2wj8_1_C
2wj8_1_Q
2wj8_1_J
2wj8_1_P
2wj8_1_K
2wj8_1_E
2wj8_1_T
2wj8_1_B
2wj8_1_O
2wj8_1_N
2wj8_1_A
2wj8_1_H
2wj8_1_R
2wj8_1_M
2wj8_1_S
2wj8_1_G
4e6b_1_E
4e6b_1_F
6p71_1_I
3pdm_1_R
5det_1_P
5els_1_I
4n2s_1_B
4yoe_1_E
3j0o_1_H
3j0l_1_H
3j0p_1_H
3j0q_1_H
5gxi_1_B
3iy8_1_A
6tnu_1_M
5mc6_1_M
5mc6_1_N
4eya_1_O
4eya_1_P
4eya_1_C
4eya_1_D
6htq_1_V
6htq_1_W
6htq_1_U
6uu6_1_333
6v3a_1_V
6v39_1_V
5a0v_1_F
3avt_1_T
6d1v_1_C
4s2x_1_B
4s2y_1_B
5wnu_1_B
1zc8_1_F
1vtm_1_R
4v5z_1_BA
4v5z_1_BE
4v5z_1_BD
4v5z_1_BG
4v5z_1_BI
4v5z_1_BK
4v5z_1_BM
4v5z_1_BL
4v5z_1_BV
4v5z_1_BO
4v5z_1_BN
4v5z_1_BQ
4v5z_1_BR
4v5z_1_BT
4v5z_1_BU
4v5z_1_BW
4v5z_1_BY
4v5z_1_BX
4v5z_1_BZ
6u9x_1_H
6u9x_1_K
5elk_1_R
6okk_1_G
4cxg_1_A
4cxh_1_A
6bk8_1_I
4cxg_1_B
4cxh_1_B
4v5z_1_B1
5z4d_1_B
6o78_1_E
6ha8_1_X
1m8w_1_E
1m8w_1_F
5udi_1_B
5udl_1_B
5udk_1_B
5udj_1_B
5w5i_1_B
5w5i_1_D
5w5h_1_B
5w5h_1_D
4eya_1_K
4eya_1_L
4eya_1_I
4eya_1_J
4g9z_1_E
4g9z_1_F
3nma_1_B
3nma_1_C
6een_1_G
6een_1_I
6een_1_H
4wti_1_T
4wti_1_P
5l3p_1_Y
4hor_1_X
3rzo_1_R
2f4v_1_Z
1qln_1_R
2xs7_1_B
6zvi_1_E
6sv4_1_MC
6sv4_1_MB
6i7o_1_MB
6ogy_1_M
6ogy_1_N
6uej_1_B
1x18_1_A
5ytx_1_B
6o8w_1_U
4g0a_1_H
6r9p_1_B
3koa_1_C
4n48_1_D
4n48_1_G
6kug_1_B
6ktc_1_V
6ole_1_U
6om0_1_U
6olg_1_BV
6oli_1_U
6om7_1_U
6w6l_1_U
6olz_1_BV
6olf_1_U
5lzd_1_X
6m7k_1_B
3cd6_1_4
3cma_1_5
6n9e_1_2W
1vqo_1_4
1qvg_1_3
3cme_1_5
5lzd_1_W
5lze_1_W
5lzc_1_W
5lzb_1_W
3wzi_1_C
1mvr_1_E
1mvr_1_B
1mvr_1_A
4adx_1_0
4adx_1_8
1n33_1_Z
6dti_1_W
3d2s_1_F
3d2s_1_H
5mrc_1_AA
5mre_1_AA
5mrf_1_AA
5fl8_1_Y
5jcs_1_Y
2r1g_1_A
2r1g_1_D
2r1g_1_F
3eq4_1_Y
4wkr_1_C
4v99_1_EC
4v99_1_AC
4v99_1_BH
4v99_1_CH
4v99_1_AM
4v99_1_DC
4v99_1_JW
4v99_1_EH
4v99_1_BW
4v99_1_FW
4v99_1_AW
4v99_1_BC
4v99_1_BM
4v99_1_IC
4v99_1_EM
4v99_1_ER
4v99_1_IW
4v99_1_JH
4v99_1_JR
4v99_1_AH
4v99_1_GR
4v99_1_IR
4v99_1_BR
4v99_1_CW
4v99_1_HR
4v99_1_FH
4v99_1_HC
4v99_1_DW
4v99_1_GC
4v99_1_JC
4v99_1_DM
4v99_1_EW
4v99_1_AR
4v99_1_CR
4v99_1_JM
4v99_1_CC
4v99_1_IH
4v99_1_FR
4v99_1_CM
4v99_1_IM
4v99_1_FM
4v99_1_FC
4v99_1_GH
4v99_1_HM
4v99_1_HH
4v99_1_DR
4v99_1_HW
4v99_1_GW
4v99_1_DH
4v99_1_GM
6rt4_1_D
6rt4_1_C
6zvh_1_X
4dwa_1_D
6n6c_1_D
6n6j_1_C
6n6j_1_D
6p7q_1_E
6p7q_1_F
6p7q_1_D
6rcl_1_C
5jju_1_C
4ejt_1_G
5ceu_1_C
5ceu_1_D
6lkq_1_W
3qsu_1_P
3qsu_1_R
1n38_1_B
4qvc_1_G
6q1h_1_D
6q1h_1_H
6p7p_1_F
6p7p_1_E
6p7p_1_D
6vm6_1_J
6vm6_1_G
6wan_1_K
6wan_1_H
6wan_1_G
6wan_1_L
6wan_1_I
6ywo_1_F
6wan_1_J
4oau_1_A
6ywo_1_E
6ywo_1_K
6vm6_1_I
6vm6_1_H
6ywo_1_I
2a1r_1_C
2a1r_1_D
3gpq_1_E
3gpq_1_F
6o79_1_C
6vm6_1_K
6hyu_1_D
1laj_1_R
6ybv_1_K
6mpf_1_W
6spc_1_A
6spe_1_A
6fti_1_V
6ftj_1_V
6ftg_1_V
4g0a_1_G
4g0a_1_F
4g0a_1_E
2b2d_1_S
5hkc_1_C
1rmv_1_B
4qu7_1_X
4qu7_1_V
4qu7_1_U
4v5z_1_AH
4v5z_1_AA
4v5z_1_AB
4v5z_1_AC
4v5z_1_AD
4v5z_1_AE
4v5z_1_AF
4v5z_1_AG
6pmi_1_3
6pmj_1_3
5hjz_1_C
......
......@@ -11,7 +11,7 @@
# - Use a specialised database (SILVA): better alignments (we guess?), but two kinds of jobs
# - Use cmalign --small everywhere (homogeneity)
# Moreover, --small requires --nonbanded --cyk, which means the output alignment is the optimally scored one.
# To date, we trust Infernal as the best tool to realign RNA. Is it ?
# To date, we trust Infernal as the best tool to realign ncRNA. Is it ?
# Contact: louis.becquey@univ-evry.fr (PhD student), fariza.tahi@univ-evry.fr (PI)
......@@ -28,7 +28,7 @@ pd.set_option('display.max_rows', None)
LSU_set = ["RF00002", "RF02540", "RF02541", "RF02543", "RF02546"] # From Rfam CLAN 00112
SSU_set = ["RF00177", "RF02542", "RF02545", "RF01959", "RF01960"] # From Rfam CLAN 00111
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(os.getcwd()+"/results/RNANet.db") as conn:
df = pd.read_sql("SELECT rfam_acc, max_len, nb_total_homol, comput_time, comput_peak_mem FROM family;", conn)
to_remove = [ f for f in df.rfam_acc if f in LSU_set+SSU_set ]
......@@ -74,7 +74,7 @@ ax.set_ylabel("Maximum length of sequences ")
ax.set_zlabel("Computation time (s)")
plt.subplots_adjust(wspace=0.4)
plt.savefig("results/cmalign_jobs_performance.png")
plt.savefig(os.getcwd()+"/results/cmalign_jobs_performance.png")
# # ========================================================
# # Linear Regression of max_mem as function of max_length
......
......@@ -3,7 +3,6 @@
# This file computes additional statistics over the produced dataset.
# Run this file if you want the base counts, pair-type counts, identity percents, etc
# in the database.
# This should be run from the folder where the file is (to access the database with path "results/RNANet.db")
import getopt, os, pickle, sqlite3, shlex, subprocess, sys
import numpy as np
......@@ -22,34 +21,35 @@ from multiprocessing import Pool, Manager
from os import path
from tqdm import tqdm
from collections import Counter
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
from setproctitle import setproctitle
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions
path_to_3D_data = "tobedefinedbyoptions"
path_to_seq_data = "tobedefinedbyoptions"
runDir = os.getcwd()
res_thr = 20.0 # default: all structures
LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112
SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
@trace_unhandled_exceptions
def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
"""
Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph.
See Wadley & Pyle (2007)
See Wadley & Pyle (2007).
Only unique unmapped chains with resolution < res argument are considered.
Arguments:
show: True or False, call plt.show() at this end or not
filter_helical: None, "form", "zone", or "both"
None: do not remove helical nucleotide
"form": remove nucleotides if they belong to a A, B or Z form stem
"zone": remove nucleotides falling in an arbitrary zone (see zone argument)
"both": remove nucleotides fulfilling one or both of the above conditions
carbon: 1 or 4, use C4' (eta and theta) or C1' (eta_prime and theta_prime)
show: True or False, call plt.show() at the end or not
sd_range: tuple, set values below avg + sd_range[0] * stdev to 0,
and values above avg + sd_range[1] * stdev to avg + sd_range[1] * stdev.
This removes noise and cuts too high peaks, to clearly see the clusters.
res: Resolution threshold (in Angströms); only nucleotides belonging to
structures with resolution <= res are considered.
"""
os.makedirs("results/figures/wadley_plots/", exist_ok=True)
os.makedirs(runDir + "/results/figures/wadley_plots/", exist_ok=True)
if carbon == 4:
angle = "eta"
......@@ -63,30 +63,32 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
exit("You overestimate my capabilities !")
if not path.isfile(f"data/wadley_kernel_{angle}_{res}A.npz"):
if not path.isfile(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz"):
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} reproduce_wadley_results(carbon={carbon})")
pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False)
# Extract the angle values of c2'-endo and c3'-endo nucleotides
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
df = pd.read_sql(f"""SELECT {angle}, th{angle}
FROM nucleotide JOIN (
SELECT chain_id FROM chain JOIN structure
WHERE structure.resolution <= {res}
) AS c
FROM (
SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE chain.rfam_acc = 'unmappd' AND structure.resolution <= {res} AND issue = 0
) AS c NATURAL JOIN nucleotide
WHERE puckering="C2'-endo"
AND {angle} IS NOT NULL
AND th{angle} IS NOT NULL;""", conn)
c2_endo_etas = df[angle].values.tolist()
c2_endo_thetas = df["th"+angle].values.tolist()
df = pd.read_sql(f"""SELECT {angle}, th{angle}
FROM nucleotide JOIN (
SELECT chain_id FROM chain JOIN structure
WHERE structure.resolution <= {res}
) AS c
FROM (
SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE chain.rfam_acc = 'unmappd' AND structure.resolution <= {res} AND issue = 0
) AS c NATURAL JOIN nucleotide
WHERE form = '.'
AND puckering="C3'-endo"
AND {angle} IS NOT NULL
......@@ -111,14 +113,16 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
pbar.update(1)
# Save the data to an archive for later use without the need to recompute
np.savez(f"data/wadley_kernel_{angle}_{res}A.npz",
np.savez(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz",
c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
kernel_c3=f_c3, kernel_c2=f_c2)
pbar.close()
idxQueue.put(thr_idx)
else:
f = np.load(f"data/wadley_kernel_{angle}_{res}A.npz")
setproctitle(f"RNANet statistics.py reproduce_wadley_results(carbon={carbon})")
f = np.load(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz")
c2_endo_etas = f["c2_endo_e"]
c3_endo_etas = f["c3_endo_e"]
c2_endo_thetas = f["c2_endo_t"]
......@@ -148,7 +152,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
f_low_thr = f.mean() + sd_range[0]*f.std()
f_cut = np.where(f > f_sup_thr, f_sup_thr, f)
f_cut = np.where(f_cut < f_low_thr, 0, f_cut)
levels = [f.mean()+f.std(), f.mean()+2*f.std(), f.mean()+4*f.std()]
levels = [ f.mean()+f.std(), f.mean()+2*f.std(), f.mean()+4*f.std()]
# histogram:
fig = plt.figure()
......@@ -157,7 +161,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max")
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png")
fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png")
if show:
fig.show()
plt.close()
......@@ -168,7 +172,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True)
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png")
fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png")
if show:
fig.show()
plt.close()
......@@ -177,10 +181,10 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
fig = plt.figure(figsize=(5,5))
ax = fig.gca()
ax.scatter(x, y, s=1, alpha=0.1)
ax.contourf(xx, yy, f_cut, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max")
ax.contourf(xx, yy, f, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max")
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png")
fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png")
if show:
fig.show()
plt.close()
......@@ -188,10 +192,13 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
def stats_len():
"""Plots statistics on chain lengths in RNA families.
Uses all chains mapped to a family including copies, inferred or not.
REQUIRES tables chain, nucleotide up to date.
"""
setproctitle(f"RNANet statistics.py stats_len({res_thr})")
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
......@@ -214,7 +221,7 @@ def stats_len():
cols = []
lengths = []
for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False)):
for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False):
# Define a color for that family in the plot
if f in LSU_set:
......@@ -229,7 +236,7 @@ def stats_len():
cols.append("grey")
# Get the lengths of chains
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain)
FROM (
SELECT chain_id
......@@ -239,8 +246,6 @@ def stats_len():
GROUP BY chain_id;""", warn_every=0) ]
lengths.append(l) # list of chain lengths from the family
# notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
# Plot the figure
fig = plt.figure(figsize=(10,3))
ax = fig.gca()
......@@ -267,7 +272,7 @@ def stats_len():
ncol=1, fontsize='small', bbox_to_anchor=(1.3, 0.5))
# Save the figure
fig.savefig(f"results/figures/lengths_{res_thr}A.png")
fig.savefig(runDir + f"/results/figures/lengths_{res_thr}A.png")
idxQueue.put(thr_idx) # replace the thread index in the queue
# notify("Computed sequence length statistics and saved the figure.")
......@@ -285,6 +290,7 @@ def format_percentage(tot, x):
def stats_freq():
"""Computes base frequencies in all RNA families.
Uses all chains mapped to a family including copies, inferred or not.
Outputs results/frequencies.csv
REQUIRES tables chain, nucleotide up to date."""
......@@ -293,17 +299,18 @@ def stats_freq():
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} stats_freq()")
# Initialize a Counter object for each family
freqs = {}
for f in fam_list:
freqs[f] = Counter()
# List all nt_names happening within a RNA family and store the counts in the Counter
for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False)):
with sqlite3.connect("results/RNANet.db") as conn:
for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False):
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0))
freqs[f].update(counts)
# notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.")
# Create a pandas DataFrame, and save it to CSV.
df = pd.DataFrame()
......@@ -311,7 +318,7 @@ def stats_freq():
tot = sum(freqs[f].values())
df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
df = df.fillna(0)
df.to_csv("results/frequencies.csv")
df.to_csv(runDir + "/results/frequencies.csv")
idxQueue.put(thr_idx) # replace the thread index in the queue
# notify("Saved nucleotide frequencies to CSV file.")
......@@ -327,11 +334,13 @@ def parallel_stats_pairs(f):
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} p_stats_pairs({f})")
chain_id_list = mappings_list[f]
data = []
sqldata = []
for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False):
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
# Get comma separated lists of basepairs per nucleotide
interactions = pd.DataFrame(
sql_ask_database(conn,
......@@ -398,7 +407,7 @@ def parallel_stats_pairs(f):
data.append(expanded_list)
# Update the database
with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn:
conn.execute('pragma journal_mode=wal') # Allow other readers to query the database while this write is executing
sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
......@@ -416,8 +425,8 @@ def parallel_stats_pairs(f):
# Create an output DataFrame
f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f])
f_df.to_csv(f"data/{f}_counts.csv")
expanded_list.to_csv(f"data/{f}_pairs.csv")
f_df.to_csv(runDir + f"/data/{f}_counts.csv")
expanded_list.to_csv(runDir + f"/data/{f}_pairs.csv")
idxQueue.put(thr_idx) # replace the thread index in the queue
......@@ -430,28 +439,34 @@ def to_dist_matrix(f):
global idxQueue
thr_idx = idxQueue.get()
# notify(f"Computing {f} distance matrix from alignment...")
command = f"esl-alipid --rna --noheader --informat stockholm {f}_3d_only.stk"
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_dist_matrix({f})")
# Prepare a file
with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file:
al = AlignIO.read(al_file, "fasta")
names = [ x.id for x in al if '[' in x.id ]
al = al[-len(names):]
with open(f + "_3d_only.stk", "w") as only_3d:
with open(path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk", "w") as only_3d:
try:
only_3d.write(al.format("stockholm"))
except ValueError as e:
warn(e)
del al
subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"])
# Prepare the job
process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"),
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
id_matrix = np.zeros((len(names), len(names)))
pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False)
while process.poll() is None:
output = process.stdout.readline()
cnt = 0
while not cnt or process.poll() is None:
output = process.stdout.read()
if output:
lines = output.strip().split(b'\n')
for l in lines:
cnt += 1
line = l.split()
s1 = line[0].decode('utf-8')
s2 = line[1].decode('utf-8')
......@@ -460,9 +475,14 @@ def to_dist_matrix(f):
id2 = names.index(s2)
id_matrix[id1, id2] = float(score)
pbar.update(1)
if cnt != len(names)*(len(names)-1)*0.5:
warn(f"{f} got {cnt} updates on {len(names)*(len(names)-1)*0.5}")
if process.poll() != 0:
l = process.stderr.read().strip().split(b'\n')
warn("\n".join([ line.decode('utf-8') for line in l ]))
pbar.close()
subprocess.run(["rm", "-f", f + "_3d_only.stk"])
subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"])
np.save("data/"+f+".npy", id_matrix)
idxQueue.put(thr_idx) # replace the thread index in the queue
return 0
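# Note: each esl-alipid output line is assumed to start with the two sequence names
# followed by their percent identity (the only columns consumed above); the resulting
# lower-triangle matrix is saved to data/<family>.npy and reused by seq_idty().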
......@@ -471,21 +491,26 @@ def seq_idty():
"""Computes identity matrices for each of the RNA families.
REQUIRES temporary results files in data/*.npy
REQUIRES tables chain, family un to date."""
REQUIRES tables chain, family up to date."""
# load distance matrices
fams_to_plot = [ f for f in famlist if f not in ignored ]
fam_arrays = []
for f in famlist:
for f in fams_to_plot:
if path.isfile("data/"+f+".npy"):
fam_arrays.append(np.load("data/"+f+".npy"))
fam_arrays.append(np.load("data/"+f+".npy") / 100.0) # normalize percentages in [0,1]
else:
fam_arrays.append([])
warn("data/"+f+".npy not found !")
fam_arrays.append(np.array([]))
# Update database with identity percentages
conn = sqlite3.connect("results/RNANet.db")
for f, D in zip(famlist, fam_arrays):
conn = sqlite3.connect(runDir + "/results/RNANet.db")
for f, D in zip(fams_to_plot, fam_arrays):
if not len(D): continue
a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
if D.shape[0] > 1:
a = np.sum(D) * 2 / D.shape[0] / (D.shape[0] - 1) # SUM(D) / (n(n-1)/2)
else:
a = D[0][0]
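# Here a is the family's average pairwise sequence identity, normalized to [0,1]
# (the matrices were divided by 100 above); it is stored, rounded to 2 decimals, by the UPDATE below.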
conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
conn.commit()
conn.close()
......@@ -495,10 +520,11 @@ def seq_idty():
axs = axs.ravel()
[axi.set_axis_off() for axi in axs]
im = "" # Just to declare the variable, it will be set in the loop
for f, D, ax in zip(famlist, fam_arrays, axs):
if not len(D): continue
if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize
for f, D, ax in zip(fams_to_plot, fam_arrays, axs):
D = D + D.T # Copy the lower triangle to the upper one, to get a symmetrical matrix
if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize
D = 1.0 - D
np.fill_diagonal(D, 0.0)
condensedD = squareform(D)
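# squareform() converts the square symmetric distance matrix (zero diagonal) into the condensed
# 1-D form expected by scipy's hierarchical clustering, presumably used just below to build the
# Ward-linkage dendrogram whose leaf order is used to reorder rows and columns.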
# Compute basic dendrogram by Ward's method
......@@ -507,15 +533,20 @@ def seq_idty():
# Reorganize rows and cols
idx1 = Z['leaves']
D = D[idx1,:]
D = D[idx1[::-1],:]
D = D[:,idx1[::-1]]
im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
D = 1.0 - D
elif D.shape[0] == 2:
np.fill_diagonal(D, 1.0) # the diagonal has been ignored until now
ax.text(np.floor(D.shape[0]/2.0)-(0.5 if not D.shape[0]%2 else 0), -0.5, f + "\n(" + str(D.shape[0]) + " chains)",
fontsize=9, horizontalalignment = 'center', verticalalignment='bottom')
im = ax.matshow(D, vmin=0, vmax=1)
fig.tight_layout()
fig.subplots_adjust(wspace=0.1, hspace=0.3)
fig.colorbar(im, ax=axs[-1], shrink=0.8)
fig.savefig(f"results/figures/distances.png")
notify("Computed all identity matrices and saved the figure.")
fig.subplots_adjust(hspace=0.3, wspace=0.1)
fig.colorbar(im, ax=axs[-4], shrink=0.8)
fig.savefig(runDir + f"/results/figures/distances.png")
print("> Computed all identity matrices and saved the figure.", flush=True)
def stats_pairs():
"""Counts occurrences of intra-chain base-pair types in RNA families
......@@ -523,6 +554,8 @@ def stats_pairs():
Creates a temporary results file in data/pair_counts.csv, and a results file in results/pairings.csv.
REQUIRES tables chain, nucleotide up-to-date."""
setproctitle(f"RNANet statistics.py stats_pairs()")
def line_format(family_data):
return family_data.apply(partial(format_percentage, sum(family_data)))
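# line_format() turns a family's raw pair counts into percentages of that family's total,
# delegating the formatting to format_percentage(), assumed to be defined earlier in this script.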
......@@ -530,12 +563,12 @@ def stats_pairs():
results = []
allpairs = []
for f in fam_list:
newpairs = pd.read_csv(f"data/{f}_pairs.csv", index_col=0)
fam_df = pd.read_csv(f"data/{f}_counts.csv", index_col=0)
newpairs = pd.read_csv(runDir + f"/data/{f}_pairs.csv", index_col=0)
fam_df = pd.read_csv(runDir + f"/data/{f}_counts.csv", index_col=0)
results.append(fam_df)
allpairs.append(newpairs)
subprocess.run(["rm", "-f", f"data/{f}_pairs.csv"])
subprocess.run(["rm", "-f", f"data/{f}_counts.csv"])
subprocess.run(["rm", "-f", runDir + f"/data/{f}_pairs.csv"])
subprocess.run(["rm", "-f", runDir + f"/data/{f}_counts.csv"])
all_pairs = pd.concat(allpairs)
df = pd.concat(results).fillna(0)
df.to_csv("data/pair_counts.csv")
......@@ -573,14 +606,14 @@ def stats_pairs():
crosstab = crosstab[["AU", "GC", "Wobble", "Other"]]
# Save to CSV
df.to_csv("results/pair_types.csv")
df.to_csv(runDir + "/results/pair_types.csv")
# Plot barplot of overall types
ax = crosstab.plot(figsize=(8,5), kind='bar', stacked=True, log=False, fontsize=13)
ax.set_ylabel("Number of observations (millions)", fontsize=13)
ax.set_xlabel(None)
plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
plt.savefig("results/figures/pairings.png")
plt.savefig(runDir + "/results/figures/pairings.png")
notify("Computed nucleotide statistics and saved CSV and PNG file.")
......@@ -589,7 +622,9 @@ def per_chain_stats():
REQUIRES tables chain, nucleotide up to date. """
with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
setproctitle(f"RNANet statistics.py per_chain_stats()")
with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn:
# Compute per-chain nucleotide frequencies
df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn)
df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64)
......@@ -600,25 +635,36 @@ def per_chain_stats():
conn.execute('pragma journal_mode=wal')
sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;",
many=True, data=list(df.to_records(index=False)), warn_every=10)
notify("Updated the database with per-chain base frequencies")
print("> Updated the database with per-chain base frequencies", flush=True)
def general_stats():
"""
Number of structures as a function of the resolution threshold
Number of Rfam families as a function of the resolution threshold
Also prints summary counts of mapped, inferred and unmapped chains.
"""
with sqlite3.connect("results/RNANet.db") as conn:
df_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
setproctitle(f"RNANet statistics.py general_stats()")
reqs = [
# unique unmapped chains with no issues
""" SELECT distinct pdb_id, chain_name, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc = 'unmappd' AND ISSUE=0;""", conn)
df_mapped_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
WHERE rfam_acc = 'unmappd' AND ISSUE=0;""",
# unique mapped chains with no issues
""" SELECT distinct pdb_id, chain_name, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
df_mapped_copies = pd.read_sql(f"""SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""",
# mapped chains with no issues
""" SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
df_inferred_only_unique = pd.read_sql(f"""SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
FROM (SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""",
# mapped chains with no issues that are all inferred
""" SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
FROM (
SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
FROM chain
JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy
FROM chain
......@@ -627,8 +673,105 @@ def general_stats():
) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name
WHERE r.redundancy=r.inf_redundancy AND rfam_acc != 'unmappd' and issue=0
) AS c
JOIN structure ON c.structure_id=structure.pdb_id;""", conn)
print("> found", len(df_inferred_only_unique.index), "chains which are mapped only by inference using BGSU NR Lists.")
JOIN structure ON c.structure_id=structure.pdb_id;""",
# Number of mapped chains (not inferred)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0);""",
# Number of unique mapped chains (not inferred)
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0);""",
# Number of mapped chains (inferred)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1);""",
# Number of unique mapped chains (inferred)
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1);""",
# Number of mapped chains inferred once
"""SELECT count(*) FROM (
SELECT structure_id, chain_name, COUNT(DISTINCT rfam_acc) as c
FROM chain where rfam_acc!='unmappd' and inferred=1
GROUP BY structure_id, chain_name
) WHERE c=1;""",
# Number of mapped chains inferred twice
"""select count(*) from (
select structure_id, chain_name, count(distinct rfam_acc) as c
from chain where rfam_acc!='unmappd' and inferred=1
group by structure_id, chain_name
) where c=2;""",
# Number of mapped chains inferred 3 times or more
"""select count(*) from (
select structure_id, chain_name, count(distinct rfam_acc) as c
from chain where rfam_acc!='unmappd' and inferred=1
group by structure_id, chain_name
) where c>2;""",
# Number of chains mapped both with and without inference
""" SELECT COUNT(*) FROM (
SELECT structure_id, chain_name, sum(inferred) AS s, COUNT(rfam_acc) AS c
FROM chain
WHERE rfam_acc!='unmappd'
GROUP BY structure_id, chain_name
)
WHERE s < c AND s > 0;""",
# Number of mapped chains (total)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd');""",
# Number of unique mapped chains
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd');""",
# Number of unmapped chains
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc = 'unmappd');""",
# Number of mapped chains without issues (not inferred)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0 AND issue = 0);""",
# Number of unique mapped chains without issues (not inferred)
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0 AND issue = 0);""",
# Number of mapped chains without issues (inferred)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1 AND issue=0);""",
# Number of unique mapped chains without issues (inferred)
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1 AND issue=0);""",
# Number of mapped chains without issues (total)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND issue=0);""",
# Number of unique mapped chains without issues
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND issue=0);""",
# Number of unmapped chains without issues
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc = 'unmappd' AND issue=0);"""
]
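# The queries are executed in the same order as listed in reqs, so the indices used on
# answers below (answers[4], answers[5], ...) refer to the corresponding entries of reqs.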
answers = []
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
for r in reqs:
answers.append(pd.read_sql(r, conn))
df_unique = answers[0]
df_mapped_unique = answers[1]
df_mapped_copies = answers[2]
df_inferred_only_unique = answers[3]
print()
print("> found", answers[4].iloc[0][0], f"chains ({answers[5].iloc[0][0]} unique chains) that are mapped thanks to Rfam. Removing chains with issues, only {answers[15].iloc[0][0]} ({answers[16].iloc[0][0]} unique)")
if answers[4].iloc[0][0] != answers[5].iloc[0][0]:
print("\t> This happens because different parts of the same chain can be mapped to different families.")
print("> found", answers[6].iloc[0][0], f"chains ({answers[7].iloc[0][0]} unique chains) that are mapped by inferrence. Removing chains with issues, only {answers[17].iloc[0][0]} ({answers[18].iloc[0][0]} unique).")
print("\t> ", answers[8].iloc[0][0], "chains are mapped only once,")
print("\t> ", answers[9].iloc[0][0], "are mapped to 2 families,")
print("\t> ", answers[10].iloc[0][0], "are mapped to 3 or more.")
print("> Among them,", answers[11].iloc[0][0], "chains are mapped both with families found on Rfam and by inferrence.")
if answers[11].iloc[0][0]:
print("\t> this is normal if you used option -f (--full-inference). Otherwise, there might be a problem.")
print("> TOTAL:", answers[12].iloc[0][0], f"chains ({answers[13].iloc[0][0]} unique chains) mapped to a family. Removing chains with issues, only {answers[19].iloc[0][0]} ({answers[20].iloc[0][0]} unique).")
print("> TOTAL:", answers[14].iloc[0][0], f"unmapped chains. Removing chains with issues, {answers[21].iloc[0][0]}.")
if answers[14].iloc[0][0]:
print("\t> this is normal if you used option --no-homology. Otherwise, there might be a problem.")
print()
##########################################
# plot N = f(resolution, exp_method)
......@@ -642,7 +785,7 @@ def general_stats():
df_inferred_only_unique.sort_values('resolution', inplace=True, ignore_index=True)
df_mapped_copies.sort_values('resolution', inplace=True, ignore_index=True)
max_res = max(df_unique.resolution)
max_structs = len(df_mapped_copies.index.tolist())
max_structs = max(len(df_mapped_copies.index), len(df_unique.index))
colors = np.linspace(0,1,1+len(methods))
plt.xticks( np.arange(0, max_res+2, 2.0).tolist(), np.arange(0, max_res+2, 2.0).tolist() )
......@@ -654,7 +797,7 @@ def general_stats():
axs[0][0].set_ylabel("ALL", fontsize=14)
axs[0][0].set_title("Number of unique RNA chains", fontsize=14)
axs[0][0].set_ylim((0, max_structs * 1.05))
axs[0][0].legend(loc="best", fontsize=14)
axs[0][0].legend(loc="lower right", fontsize=14)
axs[0][1].grid(axis='y', ls='dotted', lw=1)
axs[0][1].set_yticklabels([])
......@@ -663,9 +806,9 @@ def general_stats():
axs[0][1].hist(df_inferred_only_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='only by inference')
axs[0][1].text(0.95*max_res, 0.95*len(df_mapped_unique.resolution), "%d " % len(df_mapped_unique.resolution),
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[0][1].set_title("Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14)
axs[0][1].set_title(r"Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14)
axs[0][1].set_ylim((0, max_structs * 1.05))
axs[0][1].legend(loc="best", fontsize=14)
axs[0][1].legend(loc="upper left", fontsize=14)
axs[0][2].grid(axis='y', ls='dotted', lw=1)
axs[0][2].set_yticklabels([])
......@@ -675,7 +818,7 @@ def general_stats():
axs[0][2].text(0.95*max_res, 0.95*len(df_mapped_copies.resolution), "%d " % len(df_mapped_copies.resolution),
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[0][2].set_title("Number of RNA chains mapped to a\nfamily (with copies)", fontsize=14)
axs[0][2].legend(loc="right", fontsize=14)
axs[0][2].legend(loc="upper left", fontsize=14)
axs[0][2].set_ylim((0, max_structs * 1.05))
for i,m in enumerate(methods):
......@@ -683,7 +826,7 @@ def general_stats():
df_mapped_unique_m = df_mapped_unique[df_mapped_unique.exp_method == m]
df_inferred_only_unique_m = df_inferred_only_unique[df_inferred_only_unique.exp_method == m]
df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m]
max_structs = len(df_mapped_copies_m.resolution.tolist())
max_structs = max(len(df_mapped_copies_m.index), len(df_unique_m.index))
print("> found", max_structs, "structures with method", m, flush=True)
axs[1+i][0].grid(axis='y', ls='dotted', lw=1)
......@@ -693,7 +836,7 @@ def general_stats():
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[1+i][0].set_ylim((0, max_structs * 1.05))
axs[1+i][0].set_ylabel(m, fontsize=14)
axs[1+i][0].legend(loc="best", fontsize=14)
axs[1+i][0].legend(loc="lower right", fontsize=14)
axs[1+i][1].grid(axis='y', ls='dotted', lw=1)
axs[1+i][1].set_yticklabels([])
......@@ -703,7 +846,7 @@ def general_stats():
axs[1+i][1].text(0.95*max_res, 0.95*len(df_mapped_unique_m.resolution), "%d " % len(df_mapped_unique_m.resolution),
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[1+i][1].set_ylim((0, max_structs * 1.05))
axs[1+i][1].legend(loc="best", fontsize=14)
axs[1+i][1].legend(loc="upper left", fontsize=14)
axs[1+i][2].grid(axis='y', ls='dotted', lw=1)
axs[1+i][2].set_yticklabels([])
......@@ -713,7 +856,7 @@ def general_stats():
axs[1+i][2].text(0.95*max_res, 0.95*len(df_mapped_copies_m.resolution), "%d " % len(df_mapped_copies_m.resolution),
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[1+i][2].set_ylim((0, max_structs * 1.05))
axs[1+i][2].legend(loc="right", fontsize=14)
axs[1+i][2].legend(loc="upper left", fontsize=14)
axs[-1][0].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
axs[-1][1].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
......@@ -722,7 +865,7 @@ def general_stats():
fig.suptitle("Number of RNA chains by experimental method and resolution", fontsize=16)
fig.subplots_adjust(left=0.07, right=0.98, wspace=0.05,
hspace=0.05, bottom=0.05, top=0.92)
fig.savefig("results/figures/resolutions.png")
fig.savefig(runDir + "/results/figures/resolutions.png")
plt.close()
##########################################
......@@ -765,7 +908,7 @@ def general_stats():
fig.suptitle("Number of RNA families used by experimental method and resolution", fontsize=16)
fig.subplots_adjust(left=0.05, right=0.98, wspace=0.05,
hspace=0.05, bottom=0.12, top=0.84)
fig.savefig("results/figures/Nfamilies.png")
fig.savefig(runDir + "/results/figures/Nfamilies.png")
plt.close()
def log_to_pbar(pbar):
......@@ -776,8 +919,10 @@ def log_to_pbar(pbar):
if __name__ == "__main__":
# parse options
DELETE_OLD_DATA = False
DO_WADLEY_ANALYSIS = False
try:
opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ])
opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "resolution=", "3d-folder=", "seq-folder=" ])
except getopt.GetoptError as err:
print(err)
sys.exit(2)
......@@ -795,6 +940,7 @@ if __name__ == "__main__":
"\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.")
print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:"
"\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything")
sys.exit()
elif opt == '--version':
print("RNANet statistics 1.1 beta")
......@@ -810,25 +956,37 @@ if __name__ == "__main__":
path_to_seq_data = path.abspath(arg)
if path_to_seq_data[-1] != '/':
path_to_seq_data += '/'
elif opt=='--from-scratch':
DELETE_OLD_DATA = True
DO_WADLEY_ANALYSIS = True
subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
elif opt=='--wadley':
DO_WADLEY_ANALYSIS = True
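# Example invocation (hypothetical paths), assuming RNAnet.py has already populated results/RNANet.db:
# python3 statistics.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences/ -r 4.0 --wadley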
# Load mappings
print("Loading mappings list...")
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
mappings_list = {}
for k in fam_list:
mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;") ]
mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain JOIN structure ON chain.structure_id=structure.pdb_id WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};") ]
# List the families for which we will compute sequence identity matrices
with sqlite3.connect("results/RNANet.db") as conn:
famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains < 3 ORDER BY rfam_acc ASC;") ]
n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0]
if len(ignored):
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
if DELETE_OLD_DATA:
for f in fam_list:
subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"])
# Prepare the multiprocessing execution environment
nworkers = max(read_cpu_number()-1, 32)
nworkers = min(read_cpu_number()-1, 32)
thr_idx_mgr = Manager()
idxQueue = thr_idx_mgr.Queue()
for i in range(nworkers):
......@@ -836,14 +994,15 @@ if __name__ == "__main__":
# Define the tasks
joblist = []
# joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0))) # res threshold is 4.0 Angstroms by default
# joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0))) #
if n_unmapped_chains and DO_WADLEY_ANALYSIS:
joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 20.0))) # res threshold is 4.0 Angstroms by default
joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 20.0))) #
joblist.append(Job(function=stats_len)) # Computes figures
# joblist.append(Job(function=stats_freq)) # updates the database
# for f in famlist:
# joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
# if f not in ignored:
# joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database
joblist.append(Job(function=stats_freq)) # updates the database
for f in famlist:
joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
if f not in ignored:
joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)
......@@ -867,7 +1026,8 @@ if __name__ == "__main__":
print()
# finish the work after the parallel portions
# per_chain_stats()
# seq_idty()
# stats_pairs()
per_chain_stats()
seq_idty()
stats_pairs()
if n_unmapped_chains:
general_stats()
......