Rethink how inferred mappings work

Louis BECQUEY
Commit b484b3f66c993ba102bd9161e161b61fcc35d18b b484b3f6 1 parent 3293b90c
Showing 4 changed files with 363 additions and 84 deletions
RNAnet.py
kill_rnanet.sh
recompute_some_chains.py
statistics.py
--- a/RNAnet.py
View file @b484b3f
+++ b/RNAnet.py
View file @b484b3f
@@ -18,7 +18,7 @@ from os import path, makedirs
 from multiprocessing import Pool, Manager, set_start_method
 from time import sleep
 from tqdm import tqdm
- from tqdm.contrib.concurrent import process_map
+ from setproctitle import setproctitle
 
 def trace_unhandled_exceptions(func):
     @wraps(func)
@@ -169,7 +169,8 @@ class Chain:
     def extract(self, df, khetatm):
         """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file.
         """
-         
+         setproctitle(f"RNANet.py {self.chain_label} extract()")
+ 
         if self.mapping is not None:
             status = f"Extract {self.mapping.nt_start}-{self.mapping.nt_end} atoms from {self.pdb_id}-{self.pdb_chain_id}"
             self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif"
@@ -213,6 +214,8 @@ class Chain:
     def extract_3D_data(self, save_logs=True):
         """ Maps DSSR annotations to the chain. """
 
+         setproctitle(f"RNANet.py {self.chain_label} extract_3D_data()")
+         
         ############################################
         # Load the mmCIF annotations from file
         ############################################
@@ -311,6 +314,10 @@ class Chain:
                     # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
                     # Solution : we shift the numbering of 17A (to 18) and the following residues.
                     df.iloc[i:, 1] += 1
+                 elif duplicates.iloc[0,0] == 1 and df.iloc[i,0] == 3:
+                     # 4wzo_1_1J case, there is a residue numbered -1 and read as 1 before the number 0.
+                     df.iloc[1:, 1] += 1
+                     df.iloc[0, 1] = 0
                 else:
                     # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
                     # Here the 163B is read 163 by DSSR, but there already is a residue 163.
@@ -323,7 +330,6 @@ class Chain:
                 self.error_messages = f"Error with parsing of duplicate residues numbers."
                 return None
 
- 
         # Search for ligands at the end of the selection
         # Drop ligands detected as residues by DSSR, by detecting several markers
         while ( len(df.index_chain) and df.iloc[-1,2] not in ["A", "C", "G", "U"] and (
@@ -338,7 +344,6 @@ class Chain:
                 self.mapping.log(df.tail(1))
             df = df.head(-1) 
 
- 
         # Duplicates in index_chain : drop, they are ligands
         # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601)
         duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ]
@@ -384,7 +389,7 @@ class Chain:
                     df.iloc[i+1:, 1] += 1
                 else:
                     warn(f"Missing index_chain {i} in {self.chain_label} !")
- 
+         
         # Assert some nucleotides still exist
         try:
             l = df.iloc[-1,1] - df.iloc[0,1] + 1    # update length of chain from nt_resnum point of view
@@ -522,6 +527,8 @@ class Chain:
         """Saves the extracted 3D data to the database.
         """
 
+         setproctitle(f"RNANet.py {self.chain_label} register_chain()")
+ 
         with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
             # Register the chain in table chain
             if self.mapping is not None:
@@ -575,6 +582,8 @@ class Chain:
         s_seq: the aligned version of self.seq_to_align
         """
 
+         setproctitle(f"RNANet.py {self.chain_label} remap()")
+ 
         alilen = len(s_seq)
         re_mappings = []
 
@@ -630,6 +639,8 @@ class Chain:
         REQUIRES align_column and re_mapping up to date
         """
 
+         setproctitle(f"RNANet.py {self.chain_label} replace_gaps()")
+ 
         homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM
                                                     (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={self.db_chain_id})
                                                     NATURAL JOIN re_mapping
@@ -741,6 +752,9 @@ class Downloader:
         """Query the Rfam public MySQL database for mappings between their RNA families and PDB structures.
 
         """
+ 
+         setproctitle(f"RNANet.py download_Rfam_PDB_mappings()")
+ 
         # Download PDB mappings to Rfam family
         print("> Fetching latest PDB mappings from Rfam..." + " " * 29, end='', flush=True)
         try:
@@ -766,6 +780,8 @@ class Downloader:
         Does not download if already there.
         """
 
+         setproctitle(f"RNANet.py download_Rfam_cm()")
+ 
         print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True) 
         if not path.isfile(path_to_seq_data + "Rfam.cm"):
             try:
@@ -785,6 +801,9 @@ class Downloader:
         Family ID, number of sequences identified, maximum length of those sequences.
         SETS family in the database (partially)
         """
+         
+         setproctitle(f"RNANet.py download_Rfam_family_stats()")
+ 
         try: 
             db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam')
             
@@ -829,6 +848,8 @@ class Downloader:
 
         Actually gets a FASTA archive from the public Rfam FTP. Does not download if already there."""
 
+         setproctitle(f"RNANet.py download_Rfam_sequences({rfam_acc})")
+ 
         if not path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"):
             for _ in range(10): # retry 100 times if it fails
                 try:
@@ -849,6 +870,9 @@ class Downloader:
 
         Does not remove structural redundancy.
         """
+ 
+         setproctitle(f"RNANet.py download_BGSU_NR_list({res})")
+ 
         nr_code = min([ i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res ]) 
         print(f"> Fetching latest list of RNA files at {nr_code} A resolution from BGSU website...", end='', flush=True)
         # Download latest BGSU non-redundant list
@@ -875,6 +899,10 @@ class Downloader:
         return full_structures_list # list of ( str (class), str (class_members) )
 
     def download_from_SILVA(self, unit):
+ 
+         setproctitle(f"RNANet.py download_from_SILVA({unit})")
+ 
+ 
         if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
             try:
                 print(f"Downloading {unit} from SILVA...", end='', flush=True)
@@ -989,6 +1017,8 @@ class Pipeline:
         global path_to_3D_data
         global path_to_seq_data
 
+         setproctitle("RNANet.py process_options()")
+ 
         try:
             opts, _ = getopt.getopt( sys.argv[1:], "r:hs", 
                                     [   "help", "resolution=", "keep-hetatm=", "from-scratch",
@@ -1105,13 +1135,16 @@ class Pipeline:
             print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
             print("See RNANet.py --help for more information.")
             exit(1)
- 
+     
+     @trace_unhandled_exceptions
     def list_available_mappings(self):
         """List 3D chains with available Rfam mappings.
 
         Return a list of Chain() objects with the mappings set up.        
         If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains."""
 
+         setproctitle("RNANet.py list_available_mappings()")
+ 
         # List all 3D RNA chains below given resolution
         full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members )
 
@@ -1131,11 +1164,11 @@ class Pipeline:
             # Compute the list of mappable structures using NR-list and Rfam-PDB mappings
             # And get Chain() objects
             print("> Building list of structures...", flush=True)
-             p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores)
+             p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores, maxtasksperchild=1)
             try:
 
-                 pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, bar_format="{percentage:3.0f}%|{bar}|")
-                 for _, newchains in enumerate(p.imap_unordered(partial(work_infer_mappings, not self.REUSE_ALL, allmappings), full_structures_list)): 
+                 pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, desc="Eq. classes", bar_format="{percentage:3.0f}%|{bar}|")
+                 for _, newchains in enumerate(p.imap_unordered(partial(work_infer_mappings, not self.REUSE_ALL, allmappings), full_structures_list, chunksize=1)): 
                     self.update += newchains
                     pbar.update(1) # Everytime the iteration finishes, update the global progress bar
 
@@ -1161,7 +1194,7 @@ class Pipeline:
                     pdb_chain_id = nr[2].upper()
                     chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
                     res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc IS NULL AND issue=0""")
-                     if not len(res): # the chain is NOT yet in the database, or this is a known issue
+                     if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database, or this is a known issue
                         self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class))
             conn.close()
 
@@ -1179,6 +1212,8 @@ class Pipeline:
         REQUIRES the previous definition of self.update, so call list_available_mappings() before.
         SETS table structure"""
 
+         setproctitle(f"RNANet.py dl_and_annotate(retry={retry})")
+ 
         # Prepare the results folders
         if not path.isdir(path_to_3D_data + "RNAcifs"):
             os.makedirs(path_to_3D_data + "RNAcifs")        # for the whole structures
@@ -1186,15 +1221,15 @@ class Pipeline:
             os.makedirs(path_to_3D_data + "annotations")    # for DSSR analysis of the whole structures
         
         # Download and annotate
-         print("> Downloading and annotating structures...", flush=True)
+         print("> Downloading and annotating structures (or checking previous results if they exist)...", flush=True)
         if retry:
             mmcif_list = sorted(set([ c.pdb_id for c in self.retry ]))
         else:
-             mmcif_list = sorted(set([ c.pdb_id for c in self.update if not path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json") ]))
+             mmcif_list = sorted(set([ c.pdb_id for c in self.update ]))
         try:
-             p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores))
+             p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores), maxtasksperchild=1)
             pbar = tqdm(mmcif_list, maxinterval=1.0, miniters=1, desc="mmCIF files")
-             for _ in p.imap_unordered(work_mmcif, mmcif_list): 
+             for _ in p.imap_unordered(work_mmcif, mmcif_list, chunksize=1): 
                 pbar.update(1) # Everytime the iteration finishes, update the global progress bar
             pbar.close()
             p.close()
@@ -1213,6 +1248,8 @@ class Pipeline:
         REQUIRES the previous definition of self.update, so call list_available_mappings() before.
         SETS self.loaded_chains"""
 
+         setproctitle(f"RNANet.py build_chains(retry={retry})")
+ 
         # Prepare folders
         if self.EXTRACT_CHAINS:
             if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
@@ -1239,19 +1276,25 @@ class Pipeline:
             exit(1)
 
         # If there were newly discovered problems, add this chain to the known issues
+         issues = 0
+         issues_names = []
         ki = open(runDir + "/known_issues.txt", 'a')
         kir = open(runDir + "/known_issues_reasons.txt", 'a')
         for c in results:
             if c[1].delete_me and c[1].chain_label not in self.known_issues:
                 if retry or "Could not load existing" not in c[1].error_messages:
                     self.known_issues.append(c[1].chain_label)
-                     warn(f"Adding {c[1].chain_label} to known issues.")
+                     issues += 1
+                     issues_names.append(c[1].chain_label)
                     ki.write(c[1].chain_label + '\n')
                     kir.write(c[1].chain_label + '\n' + c[1].error_messages + '\n\n')
                     with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
                         sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,))
         ki.close()
         kir.close()
+         if issues:
+             warn("Added newly discovered issues to known issues:")
+             print("\033[33m"+ " ".join(issues_names) + "\033[0m", flush=True)
     
         # Add successfully built chains to list
         self.loaded_chains += [ c[1] for c in results if not c[1].delete_me ]
@@ -1276,6 +1319,8 @@ class Pipeline:
         REQUIRES that self.loaded_chains is defined.
         SETS family (partially, through call)"""
 
+         setproctitle("RNANet.py prepare_sequences()")
+ 
         # Preparing a results folder
         if not os.access(path_to_seq_data + "realigned/", os.F_OK):
             os.makedirs(path_to_seq_data + "realigned/")
@@ -1308,6 +1353,8 @@ class Pipeline:
         REQUIRES self.fam_list to be defined
         SETS family (partially)"""
 
+         setproctitle("RNANet.py realign()")
+ 
         # Prepare the job list
         joblist = []
         for f in self.fam_list:
@@ -1345,6 +1392,8 @@ class Pipeline:
         
         REQUIRES self.fam_list to be defined"""
 
+         setproctitle("RNANet.py remap()")
+ 
         print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)")
         print("Check your CPU and disk I/O activity before deciding if the job failed.")
         nworkers =max(min(ncores, len(self.fam_list)), 1)
@@ -1357,11 +1406,11 @@ class Pipeline:
 
         # Start a process pool to dispatch the RNA families,
         # over multiple CPUs (one family by CPU)
-         p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers, maxtasksperchild=5)
+         p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers, maxtasksperchild=1)
 
         try:
             fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True) 
-             for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list)): # Apply work_pssm to each RNA family
+             for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)): # Apply work_pssm to each RNA family
                 fam_pbar.update(1) # Everytime the iteration finishes on a family, update the global progress bar over the RNA families
             fam_pbar.close()
             p.close()
@@ -1378,6 +1427,8 @@ class Pipeline:
         
         REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)"""
     
+         setproctitle("RNANet.py output_results()")
+ 
         time_str = time.strftime("%Y%m%d")
 
         #Prepare folders:
@@ -1387,10 +1438,10 @@ class Pipeline:
             os.makedirs(runDir + "/results/archive/")
 
         # Save to by-chain CSV files
-         p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3)
+         p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3, maxtasksperchild=1)
         try:
             pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True) 
-             for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains)):
+             for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains, chunksize=2)):
                 pbar.update(1) 
             pbar.close()
             p.close()
@@ -1439,6 +1490,8 @@ class Pipeline:
     def sanitize_database(self):
         """Searches for issues in the database and correct them"""
 
+         setproctitle("RNANet.py sanitize_database()")
+ 
         conn = sqlite3.connect(runDir + "/results/RNANet.db")
 
         # Assert every structure is used
@@ -1532,7 +1585,7 @@ def sql_define_tables(conn):
                 chain_id        INTEGER PRIMARY KEY NOT NULL,
                 structure_id    CHAR(4) NOT NULL,
                 chain_name      VARCHAR(2) NOT NULL,
-                 eq_class        VARCHAR(10),
+                 eq_class        VARCHAR(16),
                 pdb_start       SMALLINT,
                 pdb_end         SMALLINT,
                 issue           TINYINT,
@@ -1767,9 +1820,9 @@ def execute_joblist(fulljoblist):
 
             print("using", n, "processes:")
             # execute jobs of priority i that should be processed n by n:
-             p = Pool(processes=n, maxtasksperchild=5, initializer=init_worker)
+             p = Pool(processes=n, maxtasksperchild=1, initializer=init_worker)
             try:
-                 raw_results = p.map(partial(execute_job, jobcount=jobcount), bunch)
+                 raw_results = p.map(partial(execute_job, jobcount=jobcount), bunch, chunksize=2)
                 p.close()
                 p.join()
             except KeyboardInterrupt as e:
@@ -1792,6 +1845,9 @@ def work_infer_mappings(update_only, allmappings, codelist):
     build a list of Chain() objects mapped to Rfam families, by expanding available mappings 
     of any element of the list to all the list elements.
     """
+ 
+     setproctitle("RNAnet.py work_infer_mappings()")
+ 
     newchains = []
     known_mappings = pd.DataFrame()
 
@@ -1812,11 +1868,12 @@ def work_infer_mappings(update_only, allmappings, codelist):
     
     # Now infer mappings for chains that are not explicitely listed in Rfam-PDB mappings:
     if len(known_mappings):
+         
         families = set(known_mappings['rfam_acc'])
 
         # generalize
         inferred_mappings = known_mappings.drop(['pdb_id','chain'], axis=1).drop_duplicates()
-         
+ 
         # check for approximative redundancy:
         if len(inferred_mappings) != len(inferred_mappings.drop_duplicates(subset="rfam_acc")):
             # Then, there exists some mapping variants onto the same Rfam family CM,
@@ -1831,9 +1888,25 @@ def work_infer_mappings(update_only, allmappings, codelist):
                         len(inferred_mappings[thisfam_5_3]) !=  len(inferred_mappings[ inferred_mappings['rfam_acc'] == rfam ])
                     and len(inferred_mappings[thisfam_5_3]) > 0
                 ):
-                     warn(f"There are mappings for {rfam} in both directions:", error=True)
-                     print(inferred_mappings)
-                     exit(1)
+                     # there are mappings in both directions... wtf Rfam ?!
+                     if (len(inferred_mappings[thisfam_5_3]) == len(inferred_mappings[thisfam_3_5]) == 1
+                         and int(inferred_mappings[thisfam_5_3].pdb_start) == int(inferred_mappings[thisfam_3_5].pdb_end)
+                         and int(inferred_mappings[thisfam_5_3].pdb_end) == int(inferred_mappings[thisfam_3_5].pdb_start)
+                     ):
+                         # The two mappings are on the same nucleotide interval, but in each sense.
+                         # e.g. RF00254 6v5b and 6v5c... maybe a bug on their side ? 
+                         # How can a chain match a CM in both senses ?
+                         # We keep only the 5->3 sense.
+                         inferred_mappings = inferred_mappings.drop(index=inferred_mappings.index[thisfam_3_5])
+                         sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end'])
+                         thisfam_5_3 =  (inferred_mappings['rfam_acc'] == rfam ) & sel_5_to_3
+                         thisfam_3_5 =  (inferred_mappings['rfam_acc'] == rfam ) & (sel_5_to_3 == False)
+                         print()
+                         warn(f"Found mappings to {rfam} in both directions on the same interval, keeping only the 5'->3' one.")
+                     else:
+                         warn(f"There are mappings for {rfam} in both directions:", error=True)
+                         print(inferred_mappings)
+                         exit(1)
 
                 # Compute consensus for chains in 5' -> 3' sense
                 if len(inferred_mappings[thisfam_5_3]):
@@ -1887,7 +1960,7 @@ def work_infer_mappings(update_only, allmappings, codelist):
                     pdb_start = int(m.pdb_start.min())
                     pdb_end = int(m.pdb_end.max())
                     inferred = False
-                 else: # otherwise, use the inferred mapping
+                 elif not(pdb_id in known_mappings.pdb_id and pdb_chain_id in known_mappings.chain): # if no known mapping on another family, use the inferred mapping
                     pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start)
                     pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end)
                     inferred = True
@@ -1911,6 +1984,8 @@ def work_mmcif(pdb_id):
     SETS table structure
     """
 
+     setproctitle(f"RNAnet.py work_mmcif({pdb_id})")
+ 
     final_filepath = path_to_3D_data+"RNAcifs/"+pdb_id+".cif"
 
     # Attempt to download it if not present
@@ -1921,43 +1996,52 @@ def work_mmcif(pdb_id):
         warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True)
         return
 
-     # Load the MMCIF file with Biopython 
-     mmCif_info = MMCIF2Dict(final_filepath)
- 
-     # Get info about that structure
-     exp_meth = mmCif_info["_exptl.method"][0]
-     date = mmCif_info["_pdbx_database_status.recvd_initial_deposition_date"][0]
-     if "_refine.ls_d_res_high" in mmCif_info.keys() and mmCif_info["_refine.ls_d_res_high"][0] not in ['.', '?']:
-         reso = float(mmCif_info["_refine.ls_d_res_high"][0])
-     elif "_refine.ls_d_res_low" in mmCif_info.keys() and mmCif_info["_refine.ls_d_res_low"][0] not in ['.', '?']:
-         reso = float(mmCif_info["_refine.ls_d_res_low"][0])
-     elif "_em_3d_reconstruction.resolution" in mmCif_info.keys() and mmCif_info["_em_3d_reconstruction.resolution"][0] not in ['.', '?']:
-         reso = float(mmCif_info["_em_3d_reconstruction.resolution"][0])
-     else:
-         warn(f"Wtf, structure {pdb_id} has no resolution ?")
-         warn(f"Check https://files.rcsb.org/header/{pdb_id}.cif to figure it out.")
-         reso = 0.0
-     
-     # Save into the database
+     # check if it exists in database
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
-         sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution)
-                              VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso))
- 
-     # run DSSR (you need to have it in your $PATH, follow x3dna installation instructions)
-     output = subprocess.run(["x3dna-dssr", f"-i={final_filepath}", "--json", "--auxfile=no"], 
-                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-     stdout = output.stdout.decode('utf-8')
-     stderr = output.stderr.decode('utf-8')
- 
-     if "exception" in stderr:
-         # DSSR is unable to parse the chain.
-         warn(f"Exception while running DSSR, ignoring {pdb_id}.", error=True)
-         return 1
+         r = sql_ask_database(conn, f"""SELECT * from structure where pdb_id = '{pdb_id}';""")
+ 
+     # if not, read the CIF header and register the structure
+     if not len(r):
+         # Load the MMCIF file with Biopython 
+         mmCif_info = MMCIF2Dict(final_filepath)
+ 
+         # Get info about that structure
+         exp_meth = mmCif_info["_exptl.method"][0]
+         date = mmCif_info["_pdbx_database_status.recvd_initial_deposition_date"][0]
+         if "_refine.ls_d_res_high" in mmCif_info.keys() and mmCif_info["_refine.ls_d_res_high"][0] not in ['.', '?']:
+             reso = float(mmCif_info["_refine.ls_d_res_high"][0])
+         elif "_refine.ls_d_res_low" in mmCif_info.keys() and mmCif_info["_refine.ls_d_res_low"][0] not in ['.', '?']:
+             reso = float(mmCif_info["_refine.ls_d_res_low"][0])
+         elif "_em_3d_reconstruction.resolution" in mmCif_info.keys() and mmCif_info["_em_3d_reconstruction.resolution"][0] not in ['.', '?']:
+             reso = float(mmCif_info["_em_3d_reconstruction.resolution"][0])
+         else:
+             warn(f"Wtf, structure {pdb_id} has no resolution ?")
+             warn(f"Check https://files.rcsb.org/header/{pdb_id}.cif to figure it out.")
+             reso = 0.0
+         
+         # Save into the database
+         with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+             sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution)
+                                 VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso))
+ 
+     if not path.isfile(path_to_3D_data + "annotations/" + pdb_id + ".json"):
+ 
+         # run DSSR (you need to have it in your $PATH, follow x3dna installation instructions)
+         output = subprocess.run(["x3dna-dssr", f"-i={final_filepath}", "--json", "--auxfile=no"], 
+                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+         stdout = output.stdout.decode('utf-8')
+         stderr = output.stderr.decode('utf-8')
+ 
+         if "exception" in stderr:
+             # DSSR is unable to parse the chain.
+             warn(f"Exception while running DSSR, ignoring {pdb_id}.", error=True)
+             return 1
+ 
+         # save the analysis to file only if we can load it :/
+         json_file = open(path_to_3D_data + "annotations/" + pdb_id + ".json", "w")
+         json_file.write(stdout)
+         json_file.close()
 
-     # save the analysis to file only if we can load it :/
-     json_file = open(path_to_3D_data + "annotations/" + pdb_id + ".json", "w")
-     json_file.write(stdout)
-     json_file.close()
     return 0
 
 @trace_unhandled_exceptions
@@ -1966,6 +2050,9 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
     If asked, also extracts the 3D chains from their original structure files.
 
     """
+ 
+     setproctitle(f"RNAnet.py work_build_chain({c.chain_label})")
+ 
     if not path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"):
         warn(f"Could not find annotations for {c.chain_label}, ignoring it.", error=True)
         c.delete_me = True
@@ -1997,6 +2084,8 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
 def work_prepare_sequences(dl, rfam_acc, chains):
     """Prepares FASTA files of homologous sequences to realign with cmalign or SINA."""
 
+     setproctitle("RNAnet.py work_prepare_sequences()")
+ 
     if rfam_acc in LSU_set | SSU_set: # rRNA
         if path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
             # Detect doublons and remove them
@@ -2087,6 +2176,8 @@ def work_realign(rfam_acc):
     cmalign requires too much RAM for them, so we use SINA, a specifically designed tool for rRNAs.
     """
 
+     setproctitle(f"RNAnet.py work_realign({rfam_acc})")
+ 
     if rfam_acc in LSU_set | SSU_set: 
         # Ribosomal subunits deserve a special treatment.
         # They require too much RAM to be aligned with Infernal.
@@ -2210,6 +2301,7 @@ def work_pssm(f, fill_gaps):
     Uses only 1 core, so this function can be called in parallel.
     
     """
+     setproctitle(f"RNAnet.py work_pssm({f})")
 
     # Get a worker number to position the progress bar
     global idxQueue
@@ -2223,6 +2315,7 @@ def work_pssm(f, fill_gaps):
     try:
         align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
     except:
+         warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True)
         with open(runDir + "/errors.txt", "a") as errf:
             errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n")
         return 1
@@ -2313,6 +2406,9 @@ def work_pssm(f, fill_gaps):
 
 @trace_unhandled_exceptions
 def work_save(c, homology=True):
+ 
+     setproctitle(f"RNAnet.py work_save({c.chain_label})")
+ 
     conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0)
     if homology:
         df = pd.read_sql_query(f"""
--- a/kill_rnanet.sh
View file @b484b3f
+++ b/kill_rnanet.sh
View file @b484b3f
@@ -12,7 +12,6 @@ done
 
 PROCESS_TO_KILL="statistics.py"
 PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi statistics.py)' | awk ' { print $1;}'`
- KILLED=
 for KILLPID in $PROCESS_LIST; do
   if [ ! -z $KILLPID ];then
     kill -9 $KILLPID
--- a/recompute_some_chains.py
View file @b484b3f
+++ b/recompute_some_chains.py
View file @b484b3f
@@ -3,8 +3,8 @@ import subprocess, os, sys
 
 # Put a list of problematic chains here, they will be properly deleted and recomputed
 problems = [
- "4v9n_1_DA_1-2879",
- "4v9n_1_DA_148-2875"
+     "1k73_1_A",
+     "1k73_1_B"
 ]
 
 path_to_3D_data = sys.argv[1]
@@ -15,6 +15,7 @@ for p in problems:
     print()
     print()
     print()
+     homology = ('-' in p)
 
     # Remove the datapoints files and 3D files
     subprocess.run(["rm", '-f', path_to_3D_data + f"/rna_mapped_to_Rfam/{p}.cif"])
@@ -25,16 +26,25 @@ for p in problems:
     # Find more information
     structure = p.split('_')[0]
     chain = p.split('_')[2]
-     families = [ f.split('.')[1] for f in files ] # The RFAM families this chain has been mapped onto
- 
-     # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys
-     for fam in families:
-         command = ["sqlite3", "results/RNANet.db", f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc=\"{fam}\";"]
+     if homology:
+         families = [ f.split('.')[1] for f in files ] # The RFAM families this chain has been mapped onto
+ 
+         # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys
+         for fam in families:
+             command = ["sqlite3", "results/RNANet.db", f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc=\"{fam}\";"]
+             print(' '.join(command))
+             subprocess.run(command)
+ 
+         command = ["python3.8", "RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--extract", "--only", p]
+     else:
+         # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys
+         command = ["sqlite3", "results/RNANet.db", f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc is null;"]
         print(' '.join(command))
         subprocess.run(command)
 
+         command = ["python3.8", "RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--no-homology", "--extract", "--only", p]
+ 
     # Re-run RNANet
-     command = ["python3.8", "RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--extract", "--only", p]
     print('\n',' '.join(command),'\n')
     subprocess.run(command)
 
--- a/statistics.py
View file @b484b3f
+++ b/statistics.py
View file @b484b3f
@@ -31,7 +31,7 @@ res_thr = 20.0 # default: all structures
 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546")   # From Rfam CLAN 00112
 SSU_set = ("RF00177", "RF02542",  "RF02545", "RF01959", "RF01960")  # From Rfam CLAN 00111
 
- def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
+ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
     """
     Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph.
     See Wadley & Pyle (2007)
@@ -63,7 +63,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
         exit("You overestimate my capabilities !")
 
     
-     if not path.isfile(f"data/wadley_kernel_{angle}_{res_thr}A.npz"):
+     if not path.isfile(f"data/wadley_kernel_{angle}_{res}A.npz"):
 
         # Get a worker number to position the progress bar
         global idxQueue
@@ -75,7 +75,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
             df = pd.read_sql(f"""SELECT {angle}, th{angle} 
                                  FROM nucleotide JOIN (
                                     SELECT chain_id FROM chain JOIN structure
-                                     WHERE structure.resolution <= {res_thr}
+                                     WHERE structure.resolution <= {res}
                                  ) AS c
                                  WHERE puckering="C2'-endo" 
                                     AND {angle} IS NOT NULL 
@@ -85,7 +85,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
             df = pd.read_sql(f"""SELECT {angle}, th{angle} 
                                  FROM nucleotide JOIN (
                                     SELECT chain_id FROM chain JOIN structure
-                                     WHERE structure.resolution <= {res_thr}
+                                     WHERE structure.resolution <= {res}
                                  ) AS c
                                  WHERE form = '.' 
                                     AND puckering="C3'-endo" 
@@ -111,14 +111,14 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
         pbar.update(1)
 
         # Save the data to an archive for later use without the need to recompute
-         np.savez(f"data/wadley_kernel_{angle}.npz",
+         np.savez(f"data/wadley_kernel_{angle}_{res}A.npz",
                   c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
                   c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
                   kernel_c3=f_c3, kernel_c2=f_c2)
         pbar.close()
         idxQueue.put(thr_idx)
     else:
-         f = np.load(f"data/wadley_kernel_{angle}.npz")
+         f = np.load(f"data/wadley_kernel_{angle}_{res}A.npz")
         c2_endo_etas = f["c2_endo_e"]
         c3_endo_etas = f["c3_endo_e"]
         c2_endo_thetas = f["c2_endo_t"]
@@ -157,7 +157,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
         ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max")
         ax.set_xlabel(xlabel)
         ax.set_ylabel(ylabel)
-         fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res_thr}A.png")
+         fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png")
         if show:
             fig.show()
         plt.close()
@@ -168,7 +168,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
         ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True)
         ax.set_xlabel(xlabel)
         ax.set_ylabel(ylabel)
-         fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res_thr}A.png")
+         fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png")
         if show:
             fig.show()
         plt.close()
@@ -178,10 +178,9 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
         ax = fig.gca()
         ax.scatter(x, y, s=1, alpha=0.1)
         ax.contourf(xx, yy, f_cut, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max")
- 
         ax.set_xlabel(xlabel)
         ax.set_ylabel(ylabel)
-         fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}_{res_thr}A.png")
+         fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png")
         if show:
             fig.show()
         plt.close()
@@ -231,7 +230,13 @@ def stats_len():
 
         # Get the lengths of chains
         with sqlite3.connect("results/RNANet.db") as conn:
-             l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id WHERE rfam_acc='{f}' AND resolution <= {res_thr}) NATURAL JOIN nucleotide GROUP BY chain_id;", warn_every=0) ]
+             l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain) 
+                                                             FROM (
+                                                                 SELECT chain_id 
+                                                                 FROM chain JOIN structure ON chain.structure_id = structure.pdb_id 
+                                                                 WHERE rfam_acc='{f}' AND resolution <= {res_thr}
+                                                             ) NATURAL JOIN nucleotide 
+                                                             GROUP BY chain_id;""", warn_every=0) ]
         lengths.append(l) # list of chain lengths from the family
 
         # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
@@ -597,6 +602,172 @@ def per_chain_stats():
                           many=True, data=list(df.to_records(index=False)), warn_every=10)
     notify("Updated the database with per-chain base frequencies")
 
+ def general_stats():
+     """
+     Number of structures as function of the resolution threshold
+     Number of Rfam families as function of the resolution threshold
+     """
+     with sqlite3.connect("results/RNANet.db") as conn:
+         df_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
+                                         FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
+                                         WHERE rfam_acc IS NULL AND ISSUE=0;""", conn)
+         df_mapped_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
+                                             FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
+                                             WHERE rfam_acc IS NOT NULL AND ISSUE=0;""", conn)
+         df_mapped_copies = pd.read_sql(f"""SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
+                                             FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
+                                             WHERE rfam_acc IS NOT NULL AND ISSUE=0;""", conn)
+         df_inferred_only_unique = pd.read_sql(f"""SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
+                                                     FROM (SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
+                                                             FROM chain 
+                                                             JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy 
+                                                                     FROM chain 
+                                                                     WHERE rfam_acc IS NOT NULL AND issue=0 
+                                                                     GROUP BY structure_id, chain_name
+                                                             ) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name 
+                                                             WHERE r.redundancy=r.inf_redundancy AND rfam_acc IS NOT NULL and issue=0
+                                                     ) AS c
+                                                     JOIN structure ON c.structure_id=structure.pdb_id;""", conn)
+     print("> found", len(df_inferred_only_unique.index), "chains which are mapped only by inference using BGSU NR Lists.")
+ 
+     ##########################################
+     # plot N = f(resolution, exp_method)
+     ##########################################
+ 
+     methods = df_unique.exp_method.unique()
+ 
+     fig, axs = plt.subplots(1+len(methods), 3, figsize=(15,5*(1+len(methods))), sharex=True)
+     df_unique.sort_values('resolution', inplace=True, ignore_index=True)
+     df_mapped_unique.sort_values('resolution', inplace=True, ignore_index=True)
+     df_inferred_only_unique.sort_values('resolution', inplace=True, ignore_index=True)
+     df_mapped_copies.sort_values('resolution', inplace=True, ignore_index=True)
+     max_res = max(df_unique.resolution)
+     max_structs = len(df_mapped_copies.index.tolist())
+     colors = np.linspace(0,1,1+len(methods))
+     plt.xticks( np.arange(0, max_res+2, 2.0).tolist(),  np.arange(0, max_res+2, 2.0).tolist() )
+ 
+     axs[0][0].grid(axis='y', ls='dotted', lw=1)
+     axs[0][0].hist(df_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[0], 1), label='distribution')
+     axs[0][0].hist(df_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[0], 0.5), cumulative=True, label='cumulative')
+     axs[0][0].text(0.95*max_res, 0.95*len(df_unique.resolution), "%d " %  len(df_unique.resolution), 
+                          horizontalalignment='right', verticalalignment='top', fontsize=14)
+     axs[0][0].set_ylabel("ALL", fontsize=14)
+     axs[0][0].set_title("Number of unique RNA chains", fontsize=14)
+     axs[0][0].set_ylim((0, max_structs * 1.05))
+     axs[0][0].legend(loc="best", fontsize=14)
+ 
+     axs[0][1].grid(axis='y', ls='dotted', lw=1)
+     axs[0][1].set_yticklabels([])
+     axs[0][1].hist(df_mapped_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[0], 1), label='distribution')
+     axs[0][1].hist(df_mapped_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[0], 0.5), cumulative=True, label='cumulative')
+     axs[0][1].hist(df_inferred_only_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='only by inference')
+     axs[0][1].text(0.95*max_res, 0.95*len(df_mapped_unique.resolution), "%d " %  len(df_mapped_unique.resolution), 
+                          horizontalalignment='right', verticalalignment='top', fontsize=14)
+     axs[0][1].set_title("Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14)
+     axs[0][1].set_ylim((0, max_structs * 1.05))
+     axs[0][1].legend(loc="best", fontsize=14)
+ 
+     axs[0][2].grid(axis='y', ls='dotted', lw=1)
+     axs[0][2].set_yticklabels([])
+     axs[0][2].hist(df_mapped_copies.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[0], 1), label='distribution')
+     axs[0][2].hist(df_mapped_copies.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[0], 0.5), cumulative=True, label='cumulative')
+     axs[0][2].hist(df_mapped_copies[df_mapped_copies.inferred == 1].resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='inferred')
+     axs[0][2].text(0.95*max_res, 0.95*len(df_mapped_copies.resolution), "%d " %  len(df_mapped_copies.resolution), 
+                          horizontalalignment='right', verticalalignment='top', fontsize=14)
+     axs[0][2].set_title("Number of RNA chains mapped to a\nfamily (with copies)", fontsize=14)
+     axs[0][2].legend(loc="right", fontsize=14)
+     axs[0][2].set_ylim((0, max_structs * 1.05))
+ 
+     for i,m in enumerate(methods):
+         df_unique_m = df_unique[df_unique.exp_method == m]
+         df_mapped_unique_m = df_mapped_unique[df_mapped_unique.exp_method == m]
+         df_inferred_only_unique_m = df_inferred_only_unique[df_inferred_only_unique.exp_method == m]
+         df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m]
+         max_structs = len(df_mapped_copies_m.resolution.tolist())
+         print("> found", max_structs, "structures with method", m, flush=True)
+ 
+         axs[1+i][0].grid(axis='y', ls='dotted', lw=1)
+         axs[1+i][0].hist(df_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[1+i], 1), label='distribution')
+         axs[1+i][0].hist(df_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[1+i], 0.5), cumulative=True, label='cumulative')
+         axs[1+i][0].text(0.95*max_res, 0.95*len(df_unique_m.resolution), "%d " %  len(df_unique_m.resolution), 
+                          horizontalalignment='right', verticalalignment='top', fontsize=14)
+         axs[1+i][0].set_ylim((0, max_structs * 1.05))
+         axs[1+i][0].set_ylabel(m, fontsize=14)
+         axs[1+i][0].legend(loc="best", fontsize=14)
+ 
+         axs[1+i][1].grid(axis='y', ls='dotted', lw=1)
+         axs[1+i][1].set_yticklabels([])
+         axs[1+i][1].hist(df_mapped_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[1+i], 1), label='distribution')
+         axs[1+i][1].hist(df_mapped_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[1+i], 0.5), cumulative=True, label='cumulative')
+         axs[1+i][1].hist(df_inferred_only_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[1+i], 0.5), cumulative=True, label='only by inference')
+         axs[1+i][1].text(0.95*max_res, 0.95*len(df_mapped_unique_m.resolution), "%d " %  len(df_mapped_unique_m.resolution), 
+                          horizontalalignment='right', verticalalignment='top', fontsize=14)
+         axs[1+i][1].set_ylim((0, max_structs * 1.05))
+         axs[1+i][1].legend(loc="best", fontsize=14)
+         
+         axs[1+i][2].grid(axis='y', ls='dotted', lw=1)
+         axs[1+i][2].set_yticklabels([])
+         axs[1+i][2].hist(df_mapped_copies_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[1+i], 1), label='distribution')
+         axs[1+i][2].hist(df_mapped_copies_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[1+i], 0.5), cumulative=True, label='cumulative')
+         axs[1+i][2].hist(df_mapped_copies_m[df_mapped_copies_m.inferred == 1].resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[1+i], 0.5), cumulative=True, label='inferred')
+         axs[1+i][2].text(0.95*max_res, 0.95*len(df_mapped_copies_m.resolution), "%d " %  len(df_mapped_copies_m.resolution), 
+                          horizontalalignment='right', verticalalignment='top', fontsize=14)
+         axs[1+i][2].set_ylim((0, max_structs * 1.05))
+         axs[1+i][2].legend(loc="right", fontsize=14)
+     
+     axs[-1][0].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
+     axs[-1][1].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
+     axs[-1][2].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
+ 
+     fig.suptitle("Number of RNA chains by experimental method and resolution", fontsize=16)
+     fig.subplots_adjust(left=0.07, right=0.98, wspace=0.05, 
+                         hspace=0.05, bottom=0.05, top=0.92)
+     fig.savefig("results/figures/resolutions.png")
+     plt.close()
+ 
+     ##########################################
+     # plot Nfam = f(resolution, exp_method)
+     ##########################################
+ 
+     df_mapped_copies['n_fam'] = [ len(df_mapped_copies.rfam_acc[:i+1].unique()) for i in range(len(df_mapped_copies.index)) ]
+ 
+     fig, axs = plt.subplots(1, 1+len(methods), figsize=(5*(1+len(methods)), 5))
+     max_res = max(df_mapped_copies.resolution)
+     max_fams = max(df_mapped_copies.n_fam)
+     colors = np.linspace(0,1,1+len(methods))
+     plt.xticks( np.arange(0, max_res+2, 2.0).tolist(),  np.arange(0, max_res+2, 2.0).tolist() )
+ 
+     axs[0].grid(axis='y', ls='dotted', lw=1)
+     axs[0].plot(df_mapped_copies.resolution, df_mapped_copies.n_fam)
+     axs[0].text(0.95*max_res, 0.95*df_mapped_copies.n_fam.iloc[-1], "%d " %  df_mapped_copies.n_fam.iloc[-1], 
+                          horizontalalignment='right', verticalalignment='top', fontsize=14)
+     axs[0].set_title("ALL", fontsize=14)
+     axs[0].set_xlabel("Structure resolution (Angströms)", fontsize=14)
+     axs[0].set_ylabel("Number of Rfam families", fontsize=14)
+     axs[0].set_ylim((0, max_res * 1.05))
+     axs[0].set_ylim((0, max_fams * 1.05))
+     
+     for i,m in enumerate(methods):
+         df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m].drop("n_fam", axis=1).copy()
+         df_mapped_copies_m['n_fam'] = [ len(df_mapped_copies_m.rfam_acc[:i+1].unique()) for i in range(len(df_mapped_copies_m.index)) ]
+         print(">", df_mapped_copies_m.n_fam.iloc[-1], "different RNA families have a 3D structure solved by", m)
+ 
+         axs[1+i].grid(axis='y', ls='dotted', lw=1)
+         axs[1+i].plot(df_mapped_copies_m.resolution, df_mapped_copies_m.n_fam, )
+         axs[1+i].text(0.95*max(df_mapped_copies_m.resolution), 0.95*df_mapped_copies_m.n_fam.iloc[-1], "%d " %  df_mapped_copies_m.n_fam.iloc[-1], 
+                          horizontalalignment='right', verticalalignment='top', fontsize=14)
+         axs[1+i].set_xlim((0, max_res * 1.05))
+         axs[1+i].set_ylim((0, max_fams * 1.05))
+         axs[1+i].set_xlabel("Structure resolution (Angströms)", fontsize=14)
+         axs[1+i].set_title(m, fontsize=14)
+         axs[1+i].set_yticklabels([])
+     
+     fig.suptitle("Number of RNA families used by experimental method and resolution", fontsize=16)
+     fig.subplots_adjust(left=0.05, right=0.98, wspace=0.05, 
+                         hspace=0.05, bottom=0.12, top=0.84)
+     fig.savefig("results/figures/Nfamilies.png")
+     plt.close()
+ 
 def log_to_pbar(pbar):
     def update(r):
         pbar.update(1)
@@ -604,6 +775,9 @@ def log_to_pbar(pbar):
 
 if __name__ == "__main__":
 
+     general_stats()
+     exit()
+ 
     # parse options
     try:
         opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ])
@@ -665,8 +839,8 @@ if __name__ == "__main__":
 
     # Define the tasks
     joblist = []
-     # joblist.append(Job(function=reproduce_wadley_results, args=(1,)))
-     # joblist.append(Job(function=reproduce_wadley_results, args=(4,)))
+     joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0)))   # res threshold is 4.0 Angstroms by default
+     joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0)))   #
     joblist.append(Job(function=stats_len)) # Computes figures
     # joblist.append(Job(function=stats_freq)) # updates the database
     # for f in famlist: