Showing
1 changed file
with
34 additions
and
20 deletions
1 | #!/usr/bin/python3.8 | 1 | #!/usr/bin/python3.8 |
2 | import numpy as np | 2 | import numpy as np |
3 | import pandas as pd | 3 | import pandas as pd |
4 | -import concurrent.futures, Bio.PDB.StructureBuilder, getopt, gzip, io, json, os, pickle, psutil, re, requests, signal, sqlalchemy, sqlite3, subprocess, sys, time, traceback, warnings | 4 | +import concurrent.futures, getopt, gzip, io, json, os, pickle, psutil, re, requests, signal, sqlalchemy, sqlite3, subprocess, sys, time, traceback, warnings |
5 | from Bio import AlignIO, SeqIO | 5 | from Bio import AlignIO, SeqIO |
6 | from Bio.PDB import MMCIFParser | 6 | from Bio.PDB import MMCIFParser |
7 | from Bio.PDB.mmcifio import MMCIFIO | 7 | from Bio.PDB.mmcifio import MMCIFIO |
... | @@ -218,9 +218,9 @@ class Chain: | ... | @@ -218,9 +218,9 @@ class Chain: |
218 | 218 | ||
219 | # Print eventual warnings given by DSSR, and abort if there are some | 219 | # Print eventual warnings given by DSSR, and abort if there are some |
220 | if "warning" in json_object.keys(): | 220 | if "warning" in json_object.keys(): |
221 | - warn(f"Ignoring {self.chain_label} ({json_object['warning']})") | 221 | + warn(f"found DSSR warning in annotation {self.pdb_id}.json: {json_object['warning']}. Ignoring {self.chain_label}.") |
222 | self.delete_me = True | 222 | self.delete_me = True |
223 | - self.error_messages = f"DSSR warning for {self.chain_label}: {json_object['warning']}" | 223 | + self.error_messages = f"DSSR warning {self.pdb_id}.json: {json_object['warning']}. Ignoring {self.chain_label}." |
224 | return 1 | 224 | return 1 |
225 | 225 | ||
226 | try: | 226 | try: |
... | @@ -228,6 +228,11 @@ class Chain: | ... | @@ -228,6 +228,11 @@ class Chain: |
228 | nts = json_object["nts"] # sub-json-object | 228 | nts = json_object["nts"] # sub-json-object |
229 | df = pd.DataFrame(nts) # conversion to dataframe | 229 | df = pd.DataFrame(nts) # conversion to dataframe |
230 | df = df[ df.chain_name == self.pdb_chain_id ] # keeping only this chain's nucleotides | 230 | df = df[ df.chain_name == self.pdb_chain_id ] # keeping only this chain's nucleotides |
231 | + if df.empty: | ||
232 | + warn(f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.", error=True) | ||
233 | + self.delete_me = True | ||
234 | + self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." | ||
235 | + return 1 | ||
231 | 236 | ||
232 | # remove low pertinence or undocumented descriptors | 237 | # remove low pertinence or undocumented descriptors |
233 | cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", | 238 | cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", |
... | @@ -242,7 +247,7 @@ class Chain: | ... | @@ -242,7 +247,7 @@ class Chain: |
242 | df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4', | 247 | df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4', |
243 | 'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] %= (2.0*np.pi) | 248 | 'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] %= (2.0*np.pi) |
244 | except KeyError as e: | 249 | except KeyError as e: |
245 | - warn(f"Error while parsing DSSR's {self.chain_label} json output:{e}", error=True) | 250 | + warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True) |
246 | self.delete_me = True | 251 | self.delete_me = True |
247 | self.error_messages = f"Error while parsing DSSR's json output:\n{e}" | 252 | self.error_messages = f"Error while parsing DSSR's json output:\n{e}" |
248 | return 1 | 253 | return 1 |
... | @@ -265,9 +270,9 @@ class Chain: | ... | @@ -265,9 +270,9 @@ class Chain: |
265 | try: | 270 | try: |
266 | l = df.iloc[-1,1] - df.iloc[0,1] + 1 # length of chain from nt_resnum point of view | 271 | l = df.iloc[-1,1] - df.iloc[0,1] + 1 # length of chain from nt_resnum point of view |
267 | except IndexError: | 272 | except IndexError: |
268 | - warn(f"Error while parsing DSSR's annotation: No nucleotides are part of {self.chain_label}!", error=True) | 273 | + warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.", error=True) |
269 | self.delete_me = True | 274 | self.delete_me = True |
270 | - self.error_messages = f"Error while parsing DSSR's json output: No nucleotides from {self.chain_label}. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." | 275 | + self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." |
271 | return 1 | 276 | return 1 |
272 | 277 | ||
273 | # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one | 278 | # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one |
... | @@ -846,7 +851,7 @@ class Pipeline: | ... | @@ -846,7 +851,7 @@ class Pipeline: |
846 | "\n\t\t\t\tAllows to yield more 3D data (consider chains without a Rfam mapping).") | 851 | "\n\t\t\t\tAllows to yield more 3D data (consider chains without a Rfam mapping).") |
847 | print() | 852 | print() |
848 | print("--ignore-issues\t\t\tDo not ignore already known issues and attempt to compute them") | 853 | print("--ignore-issues\t\t\tDo not ignore already known issues and attempt to compute them") |
849 | - print("--update-homologous\t\tRe-download Rfam sequences and SILVA arb databases, and realign all families") | 854 | + print("--update-homologous\t\tRe-download Rfam and SILVA databases, realign all families, and recompute all CSV files") |
850 | print("--from-scratch\t\t\tDelete database, local 3D and sequence files, and known issues, and recompute.") | 855 | print("--from-scratch\t\t\tDelete database, local 3D and sequence files, and known issues, and recompute.") |
851 | print() | 856 | print() |
852 | print("Typical usage:") | 857 | print("Typical usage:") |
... | @@ -893,6 +898,9 @@ class Pipeline: | ... | @@ -893,6 +898,9 @@ class Pipeline: |
893 | runDir + "/known_issues_reasons.txt", | 898 | runDir + "/known_issues_reasons.txt", |
894 | runDir + "/results/RNANet.db"]) | 899 | runDir + "/results/RNANet.db"]) |
895 | elif opt == "--update-homologous": | 900 | elif opt == "--update-homologous": |
901 | + if "tobedefinedbyoptions" == path_to_seq_data: | ||
902 | + warn("Please provide --seq-folder before --update-homologous in the list of options.", error=True) | ||
903 | + exit(1) | ||
896 | warn("Deleting previous sequence files and recomputing alignments.") | 904 | warn("Deleting previous sequence files and recomputing alignments.") |
897 | subprocess.run(["rm", "-rf", | 905 | subprocess.run(["rm", "-rf", |
898 | path_to_seq_data + "realigned", | 906 | path_to_seq_data + "realigned", |
... | @@ -1136,7 +1144,7 @@ class Pipeline: | ... | @@ -1136,7 +1144,7 @@ class Pipeline: |
1136 | WHERE rfam_acc = ?;""", many=True, data=data) | 1144 | WHERE rfam_acc = ?;""", many=True, data=data) |
1137 | 1145 | ||
1138 | def remap(self): | 1146 | def remap(self): |
1139 | - """Compute nucleotide frequencies of the previous alignments and save them in the database | 1147 | + """Compute nucleotide frequencies of some alignments and save them in the database |
1140 | 1148 | ||
1141 | REQUIRES self.fam_list to be defined""" | 1149 | REQUIRES self.fam_list to be defined""" |
1142 | 1150 | ||
... | @@ -1548,6 +1556,9 @@ def execute_joblist(fulljoblist): | ... | @@ -1548,6 +1556,9 @@ def execute_joblist(fulljoblist): |
1548 | # Sort jobs in a tree structure, first by priority, then by CPU numbers | 1556 | # Sort jobs in a tree structure, first by priority, then by CPU numbers |
1549 | jobs = {} | 1557 | jobs = {} |
1550 | jobcount = len(fulljoblist) | 1558 | jobcount = len(fulljoblist) |
1559 | + if not jobcount: | ||
1560 | + warn("nothing to do !") | ||
1561 | + return [] | ||
1551 | for job in fulljoblist: | 1562 | for job in fulljoblist: |
1552 | if job.priority_ not in jobs.keys(): | 1563 | if job.priority_ not in jobs.keys(): |
1553 | jobs[job.priority_] = {} | 1564 | jobs[job.priority_] = {} |
... | @@ -1720,10 +1731,11 @@ def work_mmcif(pdb_id): | ... | @@ -1720,10 +1731,11 @@ def work_mmcif(pdb_id): |
1720 | url = 'http://files.rcsb.org/download/%s.cif' % (pdb_id) | 1731 | url = 'http://files.rcsb.org/download/%s.cif' % (pdb_id) |
1721 | final_filepath = path_to_3D_data+"RNAcifs/"+pdb_id+".cif" | 1732 | final_filepath = path_to_3D_data+"RNAcifs/"+pdb_id+".cif" |
1722 | 1733 | ||
1723 | - # Attempt to download it | 1734 | + # Attempt to download it if not present |
1724 | try: | 1735 | try: |
1725 | - _urlcleanup() | 1736 | + if not path.isfile(final_filepath): |
1726 | - _urlretrieve(url, final_filepath) | 1737 | + _urlcleanup() |
1738 | + _urlretrieve(url, final_filepath) | ||
1727 | except: | 1739 | except: |
1728 | warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True) | 1740 | warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True) |
1729 | return | 1741 | return |
... | @@ -1741,11 +1753,12 @@ def work_mmcif(pdb_id): | ... | @@ -1741,11 +1753,12 @@ def work_mmcif(pdb_id): |
1741 | elif "_em_3d_reconstruction.resolution" in mmCif_info.keys() and mmCif_info["_em_3d_reconstruction.resolution"][0] not in ['.', '?']: | 1753 | elif "_em_3d_reconstruction.resolution" in mmCif_info.keys() and mmCif_info["_em_3d_reconstruction.resolution"][0] not in ['.', '?']: |
1742 | reso = float(mmCif_info["_em_3d_reconstruction.resolution"][0]) | 1754 | reso = float(mmCif_info["_em_3d_reconstruction.resolution"][0]) |
1743 | else: | 1755 | else: |
1744 | - warn(f"Wtf, structure {pdb_id} has no resolution ? Check https://files.rcsb.org/header/{pdb_id}.cif to figure it out.") | 1756 | + warn(f"Wtf, structure {pdb_id} has no resolution ?") |
1757 | + warn(f"Check https://files.rcsb.org/header/{pdb_id}.cif to figure it out.") | ||
1745 | reso = 0.0 | 1758 | reso = 0.0 |
1746 | 1759 | ||
1747 | # Save into the database | 1760 | # Save into the database |
1748 | - with sqlite3.connect(runDir + "/results/RNANet.db")as conn: | 1761 | + with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
1749 | sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution) | 1762 | sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution) |
1750 | VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso)) | 1763 | VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso)) |
1751 | 1764 | ||
... | @@ -2157,15 +2170,16 @@ if __name__ == "__main__": | ... | @@ -2157,15 +2170,16 @@ if __name__ == "__main__": |
2157 | print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences:") | 2170 | print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences:") |
2158 | pp.fam_list = sorted(rfam_acc_to_download.keys()) | 2171 | pp.fam_list = sorted(rfam_acc_to_download.keys()) |
2159 | 2172 | ||
2160 | - pp.prepare_sequences() | 2173 | + if len(pp.fam_list): |
2161 | - pp.realign() | 2174 | + pp.prepare_sequences() |
2175 | + pp.realign() | ||
2162 | 2176 | ||
2163 | - # At this point, the family table is up to date | 2177 | + # At this point, the family table is up to date |
2164 | 2178 | ||
2165 | - thr_idx_mgr = Manager() | 2179 | + thr_idx_mgr = Manager() |
2166 | - idxQueue = thr_idx_mgr.Queue() | 2180 | + idxQueue = thr_idx_mgr.Queue() |
2167 | 2181 | ||
2168 | - pp.remap() | 2182 | + pp.remap() |
2169 | 2183 | ||
2170 | # At this point, the align_column and re_mapping tables are up-to-date. | 2184 | # At this point, the align_column and re_mapping tables are up-to-date. |
2171 | 2185 | ||
... | @@ -2179,4 +2193,4 @@ if __name__ == "__main__": | ... | @@ -2179,4 +2193,4 @@ if __name__ == "__main__": |
2179 | print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove | 2193 | print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove |
2180 | 2194 | ||
2181 | # # so i can sleep for the end of the night | 2195 | # # so i can sleep for the end of the night |
2182 | - # subprocess.run(["shutdown","now"]) | 2196 | + # subprocess.run(["poweroff"]) | ... | ... |
-
Please register or login to post a comment