better warnings

Louis BECQUEY
Commit 6e48f628899ad4b075bdf596243e750fff778c1e 6e48f628 1 parent b910de9e
Showing 1 changed file with 34 additions and 20 deletions
RNAnet.py
--- a/RNAnet.py
View file @6e48f62
+++ b/RNAnet.py
View file @6e48f62
 #!/usr/bin/python3.8
 import numpy as np
 import pandas as pd
-import concurrent.futures, Bio.PDB.StructureBuilder, getopt, gzip, io, json, os, pickle, psutil, re, requests, signal, sqlalchemy, sqlite3, subprocess, sys, time, traceback, warnings
+import concurrent.futures, getopt, gzip, io, json, os, pickle, psutil, re, requests, signal, sqlalchemy, sqlite3, subprocess, sys, time, traceback, warnings
 from Bio import AlignIO, SeqIO
 from Bio.PDB import MMCIFParser
 from Bio.PDB.mmcifio import MMCIFIO
@@ -218,9 +218,9 @@ class Chain:
         # Print eventual warnings given by DSSR, and abort if there are some
         if "warning" in json_object.keys():
-            warn(f"Ignoring {self.chain_label} ({json_object['warning']})")
+            warn(f"found DSSR warning in annotation {self.pdb_id}.json: {json_object['warning']}. Ignoring {self.chain_label}.")
             self.delete_me = True
-            self.error_messages = f"DSSR warning for {self.chain_label}: {json_object['warning']}"
+            self.error_messages = f"DSSR warning {self.pdb_id}.json: {json_object['warning']}. Ignoring {self.chain_label}."
             return 1
         try:
@@ -228,6 +228,11 @@ class Chain:
             nts = json_object["nts"]                         # sub-json-object
             df = pd.DataFrame(nts)                           # conversion to dataframe
             df = df[ df.chain_name == self.pdb_chain_id ]    # keeping only this chain's nucleotides
+            if df.empty:
+                warn(f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.", error=True)
+                self.delete_me = True
+                self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry."
+                return 1
             # remove low pertinence or undocumented descriptors
             cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta",
@@ -242,7 +247,7 @@ class Chain:
             df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4',
                         'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] %= (2.0*np.pi)
         except KeyError as e:
-            warn(f"Error while parsing DSSR's {self.chain_label} json output:{e}", error=True)
+            warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True)
             self.delete_me = True
             self.error_messages = f"Error while parsing DSSR's json output:\n{e}"
             return 1
@@ -265,9 +270,9 @@ class Chain:
         try:
             l = df.iloc[-1,1] - df.iloc[0,1] + 1    # length of chain from nt_resnum point of view
         except IndexError:
-            warn(f"Error while parsing DSSR's annotation: No nucleotides are part of {self.chain_label}!", error=True)
+            warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.", error=True)
             self.delete_me = True
-            self.error_messages = f"Error while parsing DSSR's json output: No nucleotides from {self.chain_label}. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry."
+            self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry."
             return 1
         # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one
@@ -846,7 +851,7 @@ class Pipeline:
                         "\n\t\t\t\tAllows to yield more 3D data (consider chains without a Rfam mapping).")
                 print()
                 print("--ignore-issues\t\t\tDo not ignore already known issues and attempt to compute them")
-                print("--update-homologous\t\tRe-download Rfam sequences and SILVA arb databases, and realign all families")
+                print("--update-homologous\t\tRe-download Rfam and SILVA databases, realign all families, and recompute all CSV files")
                 print("--from-scratch\t\t\tDelete database, local 3D and sequence files, and known issues, and recompute.")
                 print()
                 print("Typical usage:")
@@ -893,6 +898,9 @@ class Pipeline:
                                 runDir + "/known_issues_reasons.txt", 
                                 runDir + "/results/RNANet.db"])
             elif opt == "--update-homologous":
+                if "tobedefinedbyoptions" == path_to_seq_data:
+                    warn("Please provide --seq-folder before --update-homologous in the list of options.", error=True)
+                    exit(1)
                 warn("Deleting previous sequence files and recomputing alignments.")
                 subprocess.run(["rm", "-rf", 
                                 path_to_seq_data + "realigned",
@@ -1136,7 +1144,7 @@ class Pipeline:
                                  WHERE rfam_acc = ?;""", many=True, data=data)
     def remap(self):
-        """Compute nucleotide frequencies of the previous alignments and save them in the database
+        """Compute nucleotide frequencies of some alignments and save them in the database
         REQUIRES self.fam_list to be defined"""
@@ -1548,6 +1556,9 @@ def execute_joblist(fulljoblist):
     # Sort jobs in a tree structure, first by priority, then by CPU numbers
     jobs = {}
     jobcount = len(fulljoblist)
+    if not jobcount:
+        warn("nothing to do !")
+        return []
     for job in fulljoblist:
         if job.priority_ not in jobs.keys():
             jobs[job.priority_] = {}
@@ -1720,10 +1731,11 @@ def work_mmcif(pdb_id):
     url = 'http://files.rcsb.org/download/%s.cif' % (pdb_id)
     final_filepath = path_to_3D_data+"RNAcifs/"+pdb_id+".cif"
-    # Attempt to download it
+    # Attempt to download it if not present
     try:
-        _urlcleanup()
+        if not path.isfile(final_filepath):
-        _urlretrieve(url, final_filepath)
+            _urlcleanup()
+            _urlretrieve(url, final_filepath)
     except:
         warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True)
         return
@@ -1741,11 +1753,12 @@ def work_mmcif(pdb_id):
     elif "_em_3d_reconstruction.resolution" in mmCif_info.keys() and mmCif_info["_em_3d_reconstruction.resolution"][0] not in ['.', '?']:
         reso = float(mmCif_info["_em_3d_reconstruction.resolution"][0])
     else:
-        warn(f"Wtf, structure {pdb_id} has no resolution ? Check https://files.rcsb.org/header/{pdb_id}.cif to figure it out.")
+        warn(f"Wtf, structure {pdb_id} has no resolution ?")
+        warn(f"Check https://files.rcsb.org/header/{pdb_id}.cif to figure it out.")
         reso = 0.0
     # Save into the database
-    with sqlite3.connect(runDir + "/results/RNANet.db")as conn:
+    with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
         sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution)
                              VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso))
@@ -2157,15 +2170,16 @@ if __name__ == "__main__":
     print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences:")
     pp.fam_list = sorted(rfam_acc_to_download.keys())
-    pp.prepare_sequences()
+    if len(pp.fam_list):
-    pp.realign()
+        pp.prepare_sequences()
+        pp.realign()
-    # At this point, the family table is up to date    
+        # At this point, the family table is up to date    
-    thr_idx_mgr = Manager()
+        thr_idx_mgr = Manager()
-    idxQueue = thr_idx_mgr.Queue()
+        idxQueue = thr_idx_mgr.Queue()
-    pp.remap()
+        pp.remap()
     # At this point, the align_column and re_mapping tables are up-to-date.
@@ -2179,4 +2193,4 @@ if __name__ == "__main__":
     print("Completed.")  # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove
     # # so i can sleep for the end of the night
-    # subprocess.run(["shutdown","now"])
+    # subprocess.run(["poweroff"])