Showing 2 changed files with 21 additions and 19 deletions
1 | #!/usr/bin/python3.8 | 1 | #!/usr/bin/python3.8 |
2 | import Bio | 2 | import Bio |
3 | +import Bio.PDB as pdb | ||
3 | import concurrent.futures | 4 | import concurrent.futures |
4 | import getopt | 5 | import getopt |
5 | import gzip | 6 | import gzip |
... | @@ -25,7 +26,8 @@ from multiprocessing import Pool, Manager | ... | @@ -25,7 +26,8 @@ from multiprocessing import Pool, Manager |
25 | from time import sleep | 26 | from time import sleep |
26 | from tqdm import tqdm | 27 | from tqdm import tqdm |
27 | from setproctitle import setproctitle | 28 | from setproctitle import setproctitle |
28 | - | 29 | +from Bio import AlignIO, SeqIO |
30 | +from Bio.Align import AlignInfo | ||
29 | 31 | ||
30 | def trace_unhandled_exceptions(func): | 32 | def trace_unhandled_exceptions(func): |
31 | @wraps(func) | 33 | @wraps(func) |
... | @@ -112,7 +114,7 @@ class SelectivePortionSelector(object): | ... | @@ -112,7 +114,7 @@ class SelectivePortionSelector(object): |
112 | return 1 | 114 | return 1 |
113 | 115 | ||
114 | 116 | ||
115 | -class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo): | 117 | +class BufferingSummaryInfo(AlignInfo.SummaryInfo): |
116 | 118 | ||
117 | def get_pssm(self, family, index): | 119 | def get_pssm(self, family, index): |
118 | """Create a position specific score matrix object for the alignment. | 120 | """Create a position specific score matrix object for the alignment. |
... | @@ -139,7 +141,7 @@ class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo): | ... | @@ -139,7 +141,7 @@ class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo): |
139 | score_dict[this_residue] = 1.0 | 141 | score_dict[this_residue] = 1.0 |
140 | pssm_info.append(('*', score_dict)) | 142 | pssm_info.append(('*', score_dict)) |
141 | 143 | ||
142 | - return Bio.Align.AlignInfo.PSSM(pssm_info) | 144 | + return AlignInfo.PSSM(pssm_info) |
143 | 145 | ||
144 | 146 | ||
145 | class Chain: | 147 | class Chain: |
... | @@ -198,11 +200,11 @@ class Chain: | ... | @@ -198,11 +200,11 @@ class Chain: |
198 | 200 | ||
199 | with warnings.catch_warnings(): | 201 | with warnings.catch_warnings(): |
200 | # Ignore the PDB problems. This mostly warns that some chain is discontinuous. | 202 | # Ignore the PDB problems. This mostly warns that some chain is discontinuous. |
201 | - warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning) | 203 | + warnings.simplefilter('ignore', pdb.PDBExceptions.PDBConstructionWarning) |
202 | - warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning) | 204 | + warnings.simplefilter('ignore', pdb.PDBExceptions.BiopythonWarning) |
203 | 205 | ||
204 | # Load the whole mmCIF into a Biopython structure object: | 206 | # Load the whole mmCIF into a Biopython structure object: |
205 | - mmcif_parser = Bio.PDB.MMCIFParser() | 207 | + mmcif_parser = pdb.MMCIFParser() |
206 | try: | 208 | try: |
207 | s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif") | 209 | s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif") |
208 | except ValueError as e: | 210 | except ValueError as e: |
... | @@ -223,7 +225,7 @@ class Chain: | ... | @@ -223,7 +225,7 @@ class Chain: |
223 | sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm) | 225 | sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm) |
224 | 226 | ||
225 | # Save that selection on the mmCIF object s to file | 227 | # Save that selection on the mmCIF object s to file |
226 | - ioobj = Bio.PDB.mmcifio.MMCIFIO() | 228 | + ioobj = pdb.MMCIFIO() |
227 | ioobj.set_structure(s) | 229 | ioobj.set_structure(s) |
228 | ioobj.save(self.file, sel) | 230 | ioobj.save(self.file, sel) |
229 | 231 | ||
... | @@ -1115,7 +1117,7 @@ class Pipeline: | ... | @@ -1115,7 +1117,7 @@ class Pipeline: |
1115 | print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &") | 1117 | print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &") |
1116 | sys.exit() | 1118 | sys.exit() |
1117 | elif opt == '--version': | 1119 | elif opt == '--version': |
1118 | - print("RNANet 1.1 beta") | 1120 | + print("RNANet 1.2, parallelized, Dockerized") |
1119 | sys.exit() | 1121 | sys.exit() |
1120 | elif opt == "-r" or opt == "--resolution": | 1122 | elif opt == "-r" or opt == "--resolution": |
1121 | assert float(arg) > 0.0 and float(arg) <= 20.0 | 1123 | assert float(arg) > 0.0 and float(arg) <= 20.0 |
... | @@ -1445,7 +1447,7 @@ class Pipeline: | ... | @@ -1445,7 +1447,7 @@ class Pipeline: |
1445 | # Update the database | 1447 | # Update the database |
1446 | data = [] | 1448 | data = [] |
1447 | for r in results: | 1449 | for r in results: |
1448 | - align = Bio.AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta") | 1450 | + align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta") |
1449 | nb_3d_chains = len([1 for r in align if '[' in r.id]) | 1451 | nb_3d_chains = len([1 for r in align if '[' in r.id]) |
1450 | if r[0] in SSU_set: # SSU v138 is used | 1452 | if r[0] in SSU_set: # SSU v138 is used |
1451 | nb_homologs = 2225272 # source: https://www.arb-silva.de/documentation/release-138/ | 1453 | nb_homologs = 2225272 # source: https://www.arb-silva.de/documentation/release-138/ |
... | @@ -1535,9 +1537,9 @@ class Pipeline: | ... | @@ -1535,9 +1537,9 @@ class Pipeline: |
1535 | # Run statistics | 1537 | # Run statistics |
1536 | if self.RUN_STATS: | 1538 | if self.RUN_STATS: |
1537 | # Remove previous precomputed data | 1539 | # Remove previous precomputed data |
1538 | - subprocess.run(["rm", "-f", runDir + "/data/wadley_kernel_eta.npz", | 1540 | + subprocess.run(["rm", "-f", runDir + f"/data/wadley_kernel_eta_{self.CRYSTAL_RES}.npz", |
1539 | - runDir + "/data/wadley_kernel_eta_prime.npz", | 1541 | + runDir + f"/data/wadley_kernel_eta_prime_{self.CRYSTAL_RES}.npz", |
1540 | - runDir + "/data/pair_counts.csv"]) | 1542 | + runDir + f"/data/pair_counts_{self.CRYSTAL_RES}.csv"]) |
1541 | for f in self.fam_list: | 1543 | for f in self.fam_list: |
1542 | subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy", | 1544 | subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy", |
1543 | runDir + f"/data/{f}_pairs.csv", | 1545 | runDir + f"/data/{f}_pairs.csv", |
... | @@ -2124,7 +2126,7 @@ def work_mmcif(pdb_id): | ... | @@ -2124,7 +2126,7 @@ def work_mmcif(pdb_id): |
2124 | # if not, read the CIF header and register the structure | 2126 | # if not, read the CIF header and register the structure |
2125 | if not len(r): | 2127 | if not len(r): |
2126 | # Load the MMCIF file with Biopython | 2128 | # Load the MMCIF file with Biopython |
2127 | - mmCif_info = Bio.PDB.MMCIF2Dict.MMCIF2Dict(final_filepath) | 2129 | + mmCif_info = pdb.MMCIF2Dict.MMCIF2Dict(final_filepath) |
2128 | 2130 | ||
2129 | # Get info about that structure | 2131 | # Get info about that structure |
2130 | try: | 2132 | try: |
... | @@ -2218,7 +2220,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2218,7 +2220,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2218 | if rfam_acc in LSU_set | SSU_set: # rRNA | 2220 | if rfam_acc in LSU_set | SSU_set: # rRNA |
2219 | if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): | 2221 | if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): |
2220 | # Detect doublons and remove them | 2222 | # Detect doublons and remove them |
2221 | - existing_afa = Bio.AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") | 2223 | + existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") |
2222 | existing_ids = [r.id for r in existing_afa] | 2224 | existing_ids = [r.id for r in existing_afa] |
2223 | del existing_afa | 2225 | del existing_afa |
2224 | new_ids = [str(c) for c in chains] | 2226 | new_ids = [str(c) for c in chains] |
... | @@ -2227,7 +2229,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2227,7 +2229,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2227 | if len(doublons): | 2229 | if len(doublons): |
2228 | warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version") | 2230 | warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version") |
2229 | fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa" | 2231 | fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa" |
2230 | - seqfile = Bio.SeqIO.parse(fasta, "fasta") | 2232 | + seqfile = SeqIO.parse(fasta, "fasta") |
2231 | # remove it and rewrite it with its own content filtered | 2233 | # remove it and rewrite it with its own content filtered |
2232 | os.remove(fasta) | 2234 | os.remove(fasta) |
2233 | with open(fasta, 'w') as f: | 2235 | with open(fasta, 'w') as f: |
... | @@ -2268,7 +2270,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2268,7 +2270,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2268 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus: | 2270 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus: |
2269 | ids = set() | 2271 | ids = set() |
2270 | # Remove doublons from the Rfam hits | 2272 | # Remove doublons from the Rfam hits |
2271 | - for r in Bio.SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"): | 2273 | + for r in SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"): |
2272 | if r.id not in ids: | 2274 | if r.id not in ids: |
2273 | ids.add(r.id) | 2275 | ids.add(r.id) |
2274 | plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n') | 2276 | plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n') |
... | @@ -2343,10 +2345,10 @@ def work_realign(rfam_acc): | ... | @@ -2343,10 +2345,10 @@ def work_realign(rfam_acc): |
2343 | notify("Aligned new sequences together") | 2345 | notify("Aligned new sequences together") |
2344 | 2346 | ||
2345 | # Detect doublons and remove them | 2347 | # Detect doublons and remove them |
2346 | - existing_stk = Bio.AlignIO.read(existing_ali_path, "stockholm") | 2348 | + existing_stk = AlignIO.read(existing_ali_path, "stockholm") |
2347 | existing_ids = [r.id for r in existing_stk] | 2349 | existing_ids = [r.id for r in existing_stk] |
2348 | del existing_stk | 2350 | del existing_stk |
2349 | - new_stk = Bio.AlignIO.read(new_ali_path, "stockholm") | 2351 | + new_stk = AlignIO.read(new_ali_path, "stockholm") |
2350 | new_ids = [r.id for r in new_stk] | 2352 | new_ids = [r.id for r in new_stk] |
2351 | del new_stk | 2353 | del new_stk |
2352 | doublons = [i for i in existing_ids if i in new_ids] | 2354 | doublons = [i for i in existing_ids if i in new_ids] |
... | @@ -2447,7 +2449,7 @@ def work_pssm(f, fill_gaps): | ... | @@ -2447,7 +2449,7 @@ def work_pssm(f, fill_gaps): |
2447 | 2449 | ||
2448 | # Open the alignment | 2450 | # Open the alignment |
2449 | try: | 2451 | try: |
2450 | - align = Bio.AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") | 2452 | + align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") |
2451 | except: | 2453 | except: |
2452 | warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True) | 2454 | warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True) |
2453 | with open(runDir + "/errors.txt", "a") as errf: | 2455 | with open(runDir + "/errors.txt", "a") as errf: | ... | ... |
This diff is collapsed. Click to expand it.
-
Please register or log in to post a comment