Louis BECQUEY

better warnings

Showing 1 changed file with 34 additions and 20 deletions
1 #!/usr/bin/python3.8 1 #!/usr/bin/python3.8
2 import numpy as np 2 import numpy as np
3 import pandas as pd 3 import pandas as pd
4 -import concurrent.futures, Bio.PDB.StructureBuilder, getopt, gzip, io, json, os, pickle, psutil, re, requests, signal, sqlalchemy, sqlite3, subprocess, sys, time, traceback, warnings 4 +import concurrent.futures, getopt, gzip, io, json, os, pickle, psutil, re, requests, signal, sqlalchemy, sqlite3, subprocess, sys, time, traceback, warnings
5 from Bio import AlignIO, SeqIO 5 from Bio import AlignIO, SeqIO
6 from Bio.PDB import MMCIFParser 6 from Bio.PDB import MMCIFParser
7 from Bio.PDB.mmcifio import MMCIFIO 7 from Bio.PDB.mmcifio import MMCIFIO
...@@ -218,9 +218,9 @@ class Chain: ...@@ -218,9 +218,9 @@ class Chain:
218 218
219 # Print eventual warnings given by DSSR, and abort if there are some 219 # Print eventual warnings given by DSSR, and abort if there are some
220 if "warning" in json_object.keys(): 220 if "warning" in json_object.keys():
221 - warn(f"Ignoring {self.chain_label} ({json_object['warning']})") 221 + warn(f"found DSSR warning in annotation {self.pdb_id}.json: {json_object['warning']}. Ignoring {self.chain_label}.")
222 self.delete_me = True 222 self.delete_me = True
223 - self.error_messages = f"DSSR warning for {self.chain_label}: {json_object['warning']}" 223 + self.error_messages = f"DSSR warning {self.pdb_id}.json: {json_object['warning']}. Ignoring {self.chain_label}."
224 return 1 224 return 1
225 225
226 try: 226 try:
...@@ -228,6 +228,11 @@ class Chain: ...@@ -228,6 +228,11 @@ class Chain:
228 nts = json_object["nts"] # sub-json-object 228 nts = json_object["nts"] # sub-json-object
229 df = pd.DataFrame(nts) # conversion to dataframe 229 df = pd.DataFrame(nts) # conversion to dataframe
230 df = df[ df.chain_name == self.pdb_chain_id ] # keeping only this chain's nucleotides 230 df = df[ df.chain_name == self.pdb_chain_id ] # keeping only this chain's nucleotides
231 + if df.empty:
232 + warn(f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.", error=True)
233 + self.delete_me = True
234 + self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry."
235 + return 1
231 236
232 # remove low pertinence or undocumented descriptors 237 # remove low pertinence or undocumented descriptors
233 cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", 238 cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta",
...@@ -242,7 +247,7 @@ class Chain: ...@@ -242,7 +247,7 @@ class Chain:
242 df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4', 247 df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4',
243 'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] %= (2.0*np.pi) 248 'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] %= (2.0*np.pi)
244 except KeyError as e: 249 except KeyError as e:
245 - warn(f"Error while parsing DSSR's {self.chain_label} json output:{e}", error=True) 250 + warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True)
246 self.delete_me = True 251 self.delete_me = True
247 self.error_messages = f"Error while parsing DSSR's json output:\n{e}" 252 self.error_messages = f"Error while parsing DSSR's json output:\n{e}"
248 return 1 253 return 1
...@@ -265,9 +270,9 @@ class Chain: ...@@ -265,9 +270,9 @@ class Chain:
265 try: 270 try:
266 l = df.iloc[-1,1] - df.iloc[0,1] + 1 # length of chain from nt_resnum point of view 271 l = df.iloc[-1,1] - df.iloc[0,1] + 1 # length of chain from nt_resnum point of view
267 except IndexError: 272 except IndexError:
268 - warn(f"Error while parsing DSSR's annotation: No nucleotides are part of {self.chain_label}!", error=True) 273 + warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.", error=True)
269 self.delete_me = True 274 self.delete_me = True
270 - self.error_messages = f"Error while parsing DSSR's json output: No nucleotides from {self.chain_label}. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." 275 + self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry."
271 return 1 276 return 1
272 277
273 # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one 278 # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one
...@@ -846,7 +851,7 @@ class Pipeline: ...@@ -846,7 +851,7 @@ class Pipeline:
846 "\n\t\t\t\tAllows to yield more 3D data (consider chains without a Rfam mapping).") 851 "\n\t\t\t\tAllows to yield more 3D data (consider chains without a Rfam mapping).")
847 print() 852 print()
848 print("--ignore-issues\t\t\tDo not ignore already known issues and attempt to compute them") 853 print("--ignore-issues\t\t\tDo not ignore already known issues and attempt to compute them")
849 - print("--update-homologous\t\tRe-download Rfam sequences and SILVA arb databases, and realign all families") 854 + print("--update-homologous\t\tRe-download Rfam and SILVA databases, realign all families, and recompute all CSV files")
850 print("--from-scratch\t\t\tDelete database, local 3D and sequence files, and known issues, and recompute.") 855 print("--from-scratch\t\t\tDelete database, local 3D and sequence files, and known issues, and recompute.")
851 print() 856 print()
852 print("Typical usage:") 857 print("Typical usage:")
...@@ -893,6 +898,9 @@ class Pipeline: ...@@ -893,6 +898,9 @@ class Pipeline:
893 runDir + "/known_issues_reasons.txt", 898 runDir + "/known_issues_reasons.txt",
894 runDir + "/results/RNANet.db"]) 899 runDir + "/results/RNANet.db"])
895 elif opt == "--update-homologous": 900 elif opt == "--update-homologous":
901 + if "tobedefinedbyoptions" == path_to_seq_data:
902 + warn("Please provide --seq-folder before --update-homologous in the list of options.", error=True)
903 + exit(1)
896 warn("Deleting previous sequence files and recomputing alignments.") 904 warn("Deleting previous sequence files and recomputing alignments.")
897 subprocess.run(["rm", "-rf", 905 subprocess.run(["rm", "-rf",
898 path_to_seq_data + "realigned", 906 path_to_seq_data + "realigned",
...@@ -1136,7 +1144,7 @@ class Pipeline: ...@@ -1136,7 +1144,7 @@ class Pipeline:
1136 WHERE rfam_acc = ?;""", many=True, data=data) 1144 WHERE rfam_acc = ?;""", many=True, data=data)
1137 1145
1138 def remap(self): 1146 def remap(self):
1139 - """Compute nucleotide frequencies of the previous alignments and save them in the database 1147 + """Compute nucleotide frequencies of some alignments and save them in the database
1140 1148
1141 REQUIRES self.fam_list to be defined""" 1149 REQUIRES self.fam_list to be defined"""
1142 1150
...@@ -1548,6 +1556,9 @@ def execute_joblist(fulljoblist): ...@@ -1548,6 +1556,9 @@ def execute_joblist(fulljoblist):
1548 # Sort jobs in a tree structure, first by priority, then by CPU numbers 1556 # Sort jobs in a tree structure, first by priority, then by CPU numbers
1549 jobs = {} 1557 jobs = {}
1550 jobcount = len(fulljoblist) 1558 jobcount = len(fulljoblist)
1559 + if not jobcount:
1560 + warn("nothing to do !")
1561 + return []
1551 for job in fulljoblist: 1562 for job in fulljoblist:
1552 if job.priority_ not in jobs.keys(): 1563 if job.priority_ not in jobs.keys():
1553 jobs[job.priority_] = {} 1564 jobs[job.priority_] = {}
...@@ -1720,10 +1731,11 @@ def work_mmcif(pdb_id): ...@@ -1720,10 +1731,11 @@ def work_mmcif(pdb_id):
1720 url = 'http://files.rcsb.org/download/%s.cif' % (pdb_id) 1731 url = 'http://files.rcsb.org/download/%s.cif' % (pdb_id)
1721 final_filepath = path_to_3D_data+"RNAcifs/"+pdb_id+".cif" 1732 final_filepath = path_to_3D_data+"RNAcifs/"+pdb_id+".cif"
1722 1733
1723 - # Attempt to download it 1734 + # Attempt to download it if not present
1724 try: 1735 try:
1725 - _urlcleanup() 1736 + if not path.isfile(final_filepath):
1726 - _urlretrieve(url, final_filepath) 1737 + _urlcleanup()
1738 + _urlretrieve(url, final_filepath)
1727 except: 1739 except:
1728 warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True) 1740 warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True)
1729 return 1741 return
...@@ -1741,11 +1753,12 @@ def work_mmcif(pdb_id): ...@@ -1741,11 +1753,12 @@ def work_mmcif(pdb_id):
1741 elif "_em_3d_reconstruction.resolution" in mmCif_info.keys() and mmCif_info["_em_3d_reconstruction.resolution"][0] not in ['.', '?']: 1753 elif "_em_3d_reconstruction.resolution" in mmCif_info.keys() and mmCif_info["_em_3d_reconstruction.resolution"][0] not in ['.', '?']:
1742 reso = float(mmCif_info["_em_3d_reconstruction.resolution"][0]) 1754 reso = float(mmCif_info["_em_3d_reconstruction.resolution"][0])
1743 else: 1755 else:
1744 - warn(f"Wtf, structure {pdb_id} has no resolution ? Check https://files.rcsb.org/header/{pdb_id}.cif to figure it out.") 1756 + warn(f"Wtf, structure {pdb_id} has no resolution ?")
1757 + warn(f"Check https://files.rcsb.org/header/{pdb_id}.cif to figure it out.")
1745 reso = 0.0 1758 reso = 0.0
1746 1759
1747 # Save into the database 1760 # Save into the database
1748 - with sqlite3.connect(runDir + "/results/RNANet.db")as conn: 1761 + with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
1749 sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution) 1762 sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution)
1750 VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso)) 1763 VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso))
1751 1764
...@@ -2157,15 +2170,16 @@ if __name__ == "__main__": ...@@ -2157,15 +2170,16 @@ if __name__ == "__main__":
2157 print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences:") 2170 print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences:")
2158 pp.fam_list = sorted(rfam_acc_to_download.keys()) 2171 pp.fam_list = sorted(rfam_acc_to_download.keys())
2159 2172
2160 - pp.prepare_sequences() 2173 + if len(pp.fam_list):
2161 - pp.realign() 2174 + pp.prepare_sequences()
2175 + pp.realign()
2162 2176
2163 - # At this point, the family table is up to date 2177 + # At this point, the family table is up to date
2164 2178
2165 - thr_idx_mgr = Manager() 2179 + thr_idx_mgr = Manager()
2166 - idxQueue = thr_idx_mgr.Queue() 2180 + idxQueue = thr_idx_mgr.Queue()
2167 2181
2168 - pp.remap() 2182 + pp.remap()
2169 2183
2170 # At this point, the align_column and re_mapping tables are up-to-date. 2184 # At this point, the align_column and re_mapping tables are up-to-date.
2171 2185
...@@ -2179,4 +2193,4 @@ if __name__ == "__main__": ...@@ -2179,4 +2193,4 @@ if __name__ == "__main__":
2179 print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove 2193 print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove
2180 2194
2181 # # so i can sleep for the end of the night 2195 # # so i can sleep for the end of the night
2182 - # subprocess.run(["shutdown","now"]) 2196 + # subprocess.run(["poweroff"])
......