Louis BECQUEY

more resolution-specific statistics

1 #!/usr/bin/python3.8 1 #!/usr/bin/python3.8
2 import Bio 2 import Bio
3 +import Bio.PDB as pdb
3 import concurrent.futures 4 import concurrent.futures
4 import getopt 5 import getopt
5 import gzip 6 import gzip
...@@ -25,7 +26,8 @@ from multiprocessing import Pool, Manager ...@@ -25,7 +26,8 @@ from multiprocessing import Pool, Manager
25 from time import sleep 26 from time import sleep
26 from tqdm import tqdm 27 from tqdm import tqdm
27 from setproctitle import setproctitle 28 from setproctitle import setproctitle
28 - 29 +from Bio import AlignIO, SeqIO
30 +from Bio.Align import AlignInfo
29 31
30 def trace_unhandled_exceptions(func): 32 def trace_unhandled_exceptions(func):
31 @wraps(func) 33 @wraps(func)
...@@ -112,7 +114,7 @@ class SelectivePortionSelector(object): ...@@ -112,7 +114,7 @@ class SelectivePortionSelector(object):
112 return 1 114 return 1
113 115
114 116
115 -class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo): 117 +class BufferingSummaryInfo(AlignInfo.SummaryInfo):
116 118
117 def get_pssm(self, family, index): 119 def get_pssm(self, family, index):
118 """Create a position specific score matrix object for the alignment. 120 """Create a position specific score matrix object for the alignment.
...@@ -139,7 +141,7 @@ class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo): ...@@ -139,7 +141,7 @@ class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo):
139 score_dict[this_residue] = 1.0 141 score_dict[this_residue] = 1.0
140 pssm_info.append(('*', score_dict)) 142 pssm_info.append(('*', score_dict))
141 143
142 - return Bio.Align.AlignInfo.PSSM(pssm_info) 144 + return AlignInfo.PSSM(pssm_info)
143 145
144 146
145 class Chain: 147 class Chain:
...@@ -198,11 +200,11 @@ class Chain: ...@@ -198,11 +200,11 @@ class Chain:
198 200
199 with warnings.catch_warnings(): 201 with warnings.catch_warnings():
200 # Ignore the PDB problems. This mostly warns that some chain is discontinuous. 202 # Ignore the PDB problems. This mostly warns that some chain is discontinuous.
201 - warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning) 203 + warnings.simplefilter('ignore', pdb.PDBExceptions.PDBConstructionWarning)
202 - warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning) 204 + warnings.simplefilter('ignore', pdb.PDBExceptions.BiopythonWarning)
203 205
204 # Load the whole mmCIF into a Biopython structure object: 206 # Load the whole mmCIF into a Biopython structure object:
205 - mmcif_parser = Bio.PDB.MMCIFParser() 207 + mmcif_parser = pdb.MMCIFParser()
206 try: 208 try:
207 s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif") 209 s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif")
208 except ValueError as e: 210 except ValueError as e:
...@@ -223,7 +225,7 @@ class Chain: ...@@ -223,7 +225,7 @@ class Chain:
223 sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm) 225 sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm)
224 226
225 # Save that selection on the mmCIF object s to file 227 # Save that selection on the mmCIF object s to file
226 - ioobj = Bio.PDB.mmcifio.MMCIFIO() 228 + ioobj = pdb.MMCIFIO()
227 ioobj.set_structure(s) 229 ioobj.set_structure(s)
228 ioobj.save(self.file, sel) 230 ioobj.save(self.file, sel)
229 231
...@@ -1115,7 +1117,7 @@ class Pipeline: ...@@ -1115,7 +1117,7 @@ class Pipeline:
1115 print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &") 1117 print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &")
1116 sys.exit() 1118 sys.exit()
1117 elif opt == '--version': 1119 elif opt == '--version':
1118 - print("RNANet 1.1 beta") 1120 + print("RNANet 1.2, parallelized, Dockerized")
1119 sys.exit() 1121 sys.exit()
1120 elif opt == "-r" or opt == "--resolution": 1122 elif opt == "-r" or opt == "--resolution":
1121 assert float(arg) > 0.0 and float(arg) <= 20.0 1123 assert float(arg) > 0.0 and float(arg) <= 20.0
...@@ -1445,7 +1447,7 @@ class Pipeline: ...@@ -1445,7 +1447,7 @@ class Pipeline:
1445 # Update the database 1447 # Update the database
1446 data = [] 1448 data = []
1447 for r in results: 1449 for r in results:
1448 - align = Bio.AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta") 1450 + align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
1449 nb_3d_chains = len([1 for r in align if '[' in r.id]) 1451 nb_3d_chains = len([1 for r in align if '[' in r.id])
1450 if r[0] in SSU_set: # SSU v138 is used 1452 if r[0] in SSU_set: # SSU v138 is used
1451 nb_homologs = 2225272 # source: https://www.arb-silva.de/documentation/release-138/ 1453 nb_homologs = 2225272 # source: https://www.arb-silva.de/documentation/release-138/
...@@ -1535,9 +1537,9 @@ class Pipeline: ...@@ -1535,9 +1537,9 @@ class Pipeline:
1535 # Run statistics 1537 # Run statistics
1536 if self.RUN_STATS: 1538 if self.RUN_STATS:
1537 # Remove previous precomputed data 1539 # Remove previous precomputed data
1538 - subprocess.run(["rm", "-f", runDir + "/data/wadley_kernel_eta.npz", 1540 + subprocess.run(["rm", "-f", runDir + f"/data/wadley_kernel_eta_{self.CRYSTAL_RES}.npz",
1539 - runDir + "/data/wadley_kernel_eta_prime.npz", 1541 + runDir + f"/data/wadley_kernel_eta_prime_{self.CRYSTAL_RES}.npz",
1540 - runDir + "/data/pair_counts.csv"]) 1542 + runDir + f"/data/pair_counts_{self.CRYSTAL_RES}.csv"])
1541 for f in self.fam_list: 1543 for f in self.fam_list:
1542 subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy", 1544 subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy",
1543 runDir + f"/data/{f}_pairs.csv", 1545 runDir + f"/data/{f}_pairs.csv",
...@@ -2124,7 +2126,7 @@ def work_mmcif(pdb_id): ...@@ -2124,7 +2126,7 @@ def work_mmcif(pdb_id):
2124 # if not, read the CIF header and register the structure 2126 # if not, read the CIF header and register the structure
2125 if not len(r): 2127 if not len(r):
2126 # Load the MMCIF file with Biopython 2128 # Load the MMCIF file with Biopython
2127 - mmCif_info = Bio.PDB.MMCIF2Dict.MMCIF2Dict(final_filepath) 2129 + mmCif_info = pdb.MMCIF2Dict.MMCIF2Dict(final_filepath)
2128 2130
2129 # Get info about that structure 2131 # Get info about that structure
2130 try: 2132 try:
...@@ -2218,7 +2220,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -2218,7 +2220,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
2218 if rfam_acc in LSU_set | SSU_set: # rRNA 2220 if rfam_acc in LSU_set | SSU_set: # rRNA
2219 if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): 2221 if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
2220 # Detect doublons and remove them 2222 # Detect doublons and remove them
2221 - existing_afa = Bio.AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") 2223 + existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
2222 existing_ids = [r.id for r in existing_afa] 2224 existing_ids = [r.id for r in existing_afa]
2223 del existing_afa 2225 del existing_afa
2224 new_ids = [str(c) for c in chains] 2226 new_ids = [str(c) for c in chains]
...@@ -2227,7 +2229,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -2227,7 +2229,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
2227 if len(doublons): 2229 if len(doublons):
2228 warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version") 2230 warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version")
2229 fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa" 2231 fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
2230 - seqfile = Bio.SeqIO.parse(fasta, "fasta") 2232 + seqfile = SeqIO.parse(fasta, "fasta")
2231 # remove it and rewrite it with its own content filtered 2233 # remove it and rewrite it with its own content filtered
2232 os.remove(fasta) 2234 os.remove(fasta)
2233 with open(fasta, 'w') as f: 2235 with open(fasta, 'w') as f:
...@@ -2268,7 +2270,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -2268,7 +2270,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
2268 with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus: 2270 with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus:
2269 ids = set() 2271 ids = set()
2270 # Remove doublons from the Rfam hits 2272 # Remove doublons from the Rfam hits
2271 - for r in Bio.SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"): 2273 + for r in SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
2272 if r.id not in ids: 2274 if r.id not in ids:
2273 ids.add(r.id) 2275 ids.add(r.id)
2274 plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n') 2276 plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n')
...@@ -2343,10 +2345,10 @@ def work_realign(rfam_acc): ...@@ -2343,10 +2345,10 @@ def work_realign(rfam_acc):
2343 notify("Aligned new sequences together") 2345 notify("Aligned new sequences together")
2344 2346
2345 # Detect doublons and remove them 2347 # Detect doublons and remove them
2346 - existing_stk = Bio.AlignIO.read(existing_ali_path, "stockholm") 2348 + existing_stk = AlignIO.read(existing_ali_path, "stockholm")
2347 existing_ids = [r.id for r in existing_stk] 2349 existing_ids = [r.id for r in existing_stk]
2348 del existing_stk 2350 del existing_stk
2349 - new_stk = Bio.AlignIO.read(new_ali_path, "stockholm") 2351 + new_stk = AlignIO.read(new_ali_path, "stockholm")
2350 new_ids = [r.id for r in new_stk] 2352 new_ids = [r.id for r in new_stk]
2351 del new_stk 2353 del new_stk
2352 doublons = [i for i in existing_ids if i in new_ids] 2354 doublons = [i for i in existing_ids if i in new_ids]
...@@ -2447,7 +2449,7 @@ def work_pssm(f, fill_gaps): ...@@ -2447,7 +2449,7 @@ def work_pssm(f, fill_gaps):
2447 2449
2448 # Open the alignment 2450 # Open the alignment
2449 try: 2451 try:
2450 - align = Bio.AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") 2452 + align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
2451 except: 2453 except:
2452 warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True) 2454 warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True)
2453 with open(runDir + "/errors.txt", "a") as errf: 2455 with open(runDir + "/errors.txt", "a") as errf:
......
This diff is collapsed. Click to expand it.