Showing 2 changed files with 91 additions and 75 deletions

RNAnet.py
1 | #!/usr/bin/python3.8 | 1 | #!/usr/bin/python3.8 |
2 | import Bio | 2 | import Bio |
3 | +import Bio.PDB as pdb | ||
3 | import concurrent.futures | 4 | import concurrent.futures |
4 | import getopt | 5 | import getopt |
5 | import gzip | 6 | import gzip |
... | @@ -25,7 +26,8 @@ from multiprocessing import Pool, Manager | ... | @@ -25,7 +26,8 @@ from multiprocessing import Pool, Manager |
25 | from time import sleep | 26 | from time import sleep |
26 | from tqdm import tqdm | 27 | from tqdm import tqdm |
27 | from setproctitle import setproctitle | 28 | from setproctitle import setproctitle |
28 | - | 29 | +from Bio import AlignIO, SeqIO |
30 | +from Bio.Align import AlignInfo | ||
29 | 31 | ||
30 | def trace_unhandled_exceptions(func): | 32 | def trace_unhandled_exceptions(func): |
31 | @wraps(func) | 33 | @wraps(func) |
... | @@ -112,7 +114,7 @@ class SelectivePortionSelector(object): | ... | @@ -112,7 +114,7 @@ class SelectivePortionSelector(object): |
112 | return 1 | 114 | return 1 |
113 | 115 | ||
114 | 116 | ||
115 | -class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo): | 117 | +class BufferingSummaryInfo(AlignInfo.SummaryInfo): |
116 | 118 | ||
117 | def get_pssm(self, family, index): | 119 | def get_pssm(self, family, index): |
118 | """Create a position specific score matrix object for the alignment. | 120 | """Create a position specific score matrix object for the alignment. |
... | @@ -139,7 +141,7 @@ class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo): | ... | @@ -139,7 +141,7 @@ class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo): |
139 | score_dict[this_residue] = 1.0 | 141 | score_dict[this_residue] = 1.0 |
140 | pssm_info.append(('*', score_dict)) | 142 | pssm_info.append(('*', score_dict)) |
141 | 143 | ||
142 | - return Bio.Align.AlignInfo.PSSM(pssm_info) | 144 | + return AlignInfo.PSSM(pssm_info) |
143 | 145 | ||
144 | 146 | ||
145 | class Chain: | 147 | class Chain: |
... | @@ -198,11 +200,11 @@ class Chain: | ... | @@ -198,11 +200,11 @@ class Chain: |
198 | 200 | ||
199 | with warnings.catch_warnings(): | 201 | with warnings.catch_warnings(): |
200 | # Ignore the PDB problems. This mostly warns that some chain is discontinuous. | 202 | # Ignore the PDB problems. This mostly warns that some chain is discontinuous. |
201 | - warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning) | 203 | + warnings.simplefilter('ignore', pdb.PDBExceptions.PDBConstructionWarning) |
202 | - warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning) | 204 | + warnings.simplefilter('ignore', pdb.PDBExceptions.BiopythonWarning) |
203 | 205 | ||
204 | # Load the whole mmCIF into a Biopython structure object: | 206 | # Load the whole mmCIF into a Biopython structure object: |
205 | - mmcif_parser = Bio.PDB.MMCIFParser() | 207 | + mmcif_parser = pdb.MMCIFParser() |
206 | try: | 208 | try: |
207 | s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif") | 209 | s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif") |
208 | except ValueError as e: | 210 | except ValueError as e: |
... | @@ -223,7 +225,7 @@ class Chain: | ... | @@ -223,7 +225,7 @@ class Chain: |
223 | sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm) | 225 | sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm) |
224 | 226 | ||
225 | # Save that selection on the mmCIF object s to file | 227 | # Save that selection on the mmCIF object s to file |
226 | - ioobj = Bio.PDB.mmcifio.MMCIFIO() | 228 | + ioobj = pdb.MMCIFIO() |
227 | ioobj.set_structure(s) | 229 | ioobj.set_structure(s) |
228 | ioobj.save(self.file, sel) | 230 | ioobj.save(self.file, sel) |
229 | 231 | ||
... | @@ -1115,7 +1117,7 @@ class Pipeline: | ... | @@ -1115,7 +1117,7 @@ class Pipeline: |
1115 | print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &") | 1117 | print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &") |
1116 | sys.exit() | 1118 | sys.exit() |
1117 | elif opt == '--version': | 1119 | elif opt == '--version': |
1118 | - print("RNANet 1.1 beta") | 1120 | + print("RNANet 1.2, parallelized, Dockerized") |
1119 | sys.exit() | 1121 | sys.exit() |
1120 | elif opt == "-r" or opt == "--resolution": | 1122 | elif opt == "-r" or opt == "--resolution": |
1121 | assert float(arg) > 0.0 and float(arg) <= 20.0 | 1123 | assert float(arg) > 0.0 and float(arg) <= 20.0 |
... | @@ -1445,7 +1447,7 @@ class Pipeline: | ... | @@ -1445,7 +1447,7 @@ class Pipeline: |
1445 | # Update the database | 1447 | # Update the database |
1446 | data = [] | 1448 | data = [] |
1447 | for r in results: | 1449 | for r in results: |
1448 | - align = Bio.AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta") | 1450 | + align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta") |
1449 | nb_3d_chains = len([1 for r in align if '[' in r.id]) | 1451 | nb_3d_chains = len([1 for r in align if '[' in r.id]) |
1450 | if r[0] in SSU_set: # SSU v138 is used | 1452 | if r[0] in SSU_set: # SSU v138 is used |
1451 | nb_homologs = 2225272 # source: https://www.arb-silva.de/documentation/release-138/ | 1453 | nb_homologs = 2225272 # source: https://www.arb-silva.de/documentation/release-138/ |
... | @@ -1535,9 +1537,9 @@ class Pipeline: | ... | @@ -1535,9 +1537,9 @@ class Pipeline: |
1535 | # Run statistics | 1537 | # Run statistics |
1536 | if self.RUN_STATS: | 1538 | if self.RUN_STATS: |
1537 | # Remove previous precomputed data | 1539 | # Remove previous precomputed data |
1538 | - subprocess.run(["rm", "-f", runDir + "/data/wadley_kernel_eta.npz", | 1540 | + subprocess.run(["rm", "-f", runDir + f"/data/wadley_kernel_eta_{self.CRYSTAL_RES}.npz", |
1539 | - runDir + "/data/wadley_kernel_eta_prime.npz", | 1541 | + runDir + f"/data/wadley_kernel_eta_prime_{self.CRYSTAL_RES}.npz", |
1540 | - runDir + "/data/pair_counts.csv"]) | 1542 | + runDir + f"/data/pair_counts_{self.CRYSTAL_RES}.csv"]) |
1541 | for f in self.fam_list: | 1543 | for f in self.fam_list: |
1542 | subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy", | 1544 | subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy", |
1543 | runDir + f"/data/{f}_pairs.csv", | 1545 | runDir + f"/data/{f}_pairs.csv", |
... | @@ -2124,7 +2126,7 @@ def work_mmcif(pdb_id): | ... | @@ -2124,7 +2126,7 @@ def work_mmcif(pdb_id): |
2124 | # if not, read the CIF header and register the structure | 2126 | # if not, read the CIF header and register the structure |
2125 | if not len(r): | 2127 | if not len(r): |
2126 | # Load the MMCIF file with Biopython | 2128 | # Load the MMCIF file with Biopython |
2127 | - mmCif_info = Bio.PDB.MMCIF2Dict.MMCIF2Dict(final_filepath) | 2129 | + mmCif_info = pdb.MMCIF2Dict.MMCIF2Dict(final_filepath) |
2128 | 2130 | ||
2129 | # Get info about that structure | 2131 | # Get info about that structure |
2130 | try: | 2132 | try: |
... | @@ -2218,7 +2220,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2218,7 +2220,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2218 | if rfam_acc in LSU_set | SSU_set: # rRNA | 2220 | if rfam_acc in LSU_set | SSU_set: # rRNA |
2219 | if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): | 2221 | if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): |
2220 | # Detect doublons and remove them | 2222 | # Detect doublons and remove them |
2221 | - existing_afa = Bio.AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") | 2223 | + existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") |
2222 | existing_ids = [r.id for r in existing_afa] | 2224 | existing_ids = [r.id for r in existing_afa] |
2223 | del existing_afa | 2225 | del existing_afa |
2224 | new_ids = [str(c) for c in chains] | 2226 | new_ids = [str(c) for c in chains] |
... | @@ -2227,7 +2229,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2227,7 +2229,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2227 | if len(doublons): | 2229 | if len(doublons): |
2228 | warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version") | 2230 | warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version") |
2229 | fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa" | 2231 | fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa" |
2230 | - seqfile = Bio.SeqIO.parse(fasta, "fasta") | 2232 | + seqfile = SeqIO.parse(fasta, "fasta") |
2231 | # remove it and rewrite it with its own content filtered | 2233 | # remove it and rewrite it with its own content filtered |
2232 | os.remove(fasta) | 2234 | os.remove(fasta) |
2233 | with open(fasta, 'w') as f: | 2235 | with open(fasta, 'w') as f: |
... | @@ -2268,7 +2270,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2268,7 +2270,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2268 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus: | 2270 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus: |
2269 | ids = set() | 2271 | ids = set() |
2270 | # Remove doublons from the Rfam hits | 2272 | # Remove doublons from the Rfam hits |
2271 | - for r in Bio.SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"): | 2273 | + for r in SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"): |
2272 | if r.id not in ids: | 2274 | if r.id not in ids: |
2273 | ids.add(r.id) | 2275 | ids.add(r.id) |
2274 | plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n') | 2276 | plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n') |
... | @@ -2343,10 +2345,10 @@ def work_realign(rfam_acc): | ... | @@ -2343,10 +2345,10 @@ def work_realign(rfam_acc): |
2343 | notify("Aligned new sequences together") | 2345 | notify("Aligned new sequences together") |
2344 | 2346 | ||
2345 | # Detect doublons and remove them | 2347 | # Detect doublons and remove them |
2346 | - existing_stk = Bio.AlignIO.read(existing_ali_path, "stockholm") | 2348 | + existing_stk = AlignIO.read(existing_ali_path, "stockholm") |
2347 | existing_ids = [r.id for r in existing_stk] | 2349 | existing_ids = [r.id for r in existing_stk] |
2348 | del existing_stk | 2350 | del existing_stk |
2349 | - new_stk = Bio.AlignIO.read(new_ali_path, "stockholm") | 2351 | + new_stk = AlignIO.read(new_ali_path, "stockholm") |
2350 | new_ids = [r.id for r in new_stk] | 2352 | new_ids = [r.id for r in new_stk] |
2351 | del new_stk | 2353 | del new_stk |
2352 | doublons = [i for i in existing_ids if i in new_ids] | 2354 | doublons = [i for i in existing_ids if i in new_ids] |
... | @@ -2447,7 +2449,7 @@ def work_pssm(f, fill_gaps): | ... | @@ -2447,7 +2449,7 @@ def work_pssm(f, fill_gaps): |
2447 | 2449 | ||
2448 | # Open the alignment | 2450 | # Open the alignment |
2449 | try: | 2451 | try: |
2450 | - align = Bio.AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") | 2452 | + align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") |
2451 | except: | 2453 | except: |
2452 | warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True) | 2454 | warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True) |
2453 | with open(runDir + "/errors.txt", "a") as errf: | 2455 | with open(runDir + "/errors.txt", "a") as errf: |
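A note on the import style adopted in this file: fully qualified `Bio.*` names are replaced by the aliases `pdb`, `AlignIO` and `SeqIO`. A minimal standalone sketch of those aliased calls on toy in-memory data — the sequence names are made up, and the commented mmCIF paths are placeholders, not files from this repository:

```python
from io import StringIO

import Bio.PDB as pdb
from Bio import AlignIO, SeqIO

# Toy alignment: one "3D chain" (brackets in its id) and one Rfam homolog,
# mimicking the id convention tested with '[' in r.id above.
toy_afa = StringIO(">1abc[1]-A\nAUGC-AUGC\n>rfam_homolog_1\nAUGCAAUGC\n")
alignment = AlignIO.read(toy_afa, "fasta")
nb_3d_chains = len([1 for record in alignment if '[' in record.id])
print(nb_3d_chains, alignment.get_alignment_length())   # 1 9

# Single sequences go through SeqIO the same way.
toy_fa = StringIO(">seq1\nAUGCAUGC\n")
first = next(SeqIO.parse(toy_fa, "fasta"))
print(first.id, len(first.seq))                          # seq1 8

# Structure I/O through the pdb alias (paths are placeholders):
# parser = pdb.MMCIFParser(QUIET=True)
# structure = parser.get_structure("1abc", "1abc.cif")
# io = pdb.MMCIFIO(); io.set_structure(structure); io.save("1abc_A.cif")
```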
statistics.py

... | @@ -70,7 +70,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0): | ... | @@ -70,7 +70,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0): |
70 | thr_idx = idxQueue.get() | 70 | thr_idx = idxQueue.get() |
71 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} reproduce_wadley_results(carbon={carbon})") | 71 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} reproduce_wadley_results(carbon={carbon})") |
72 | 72 | ||
73 | - pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False) | 73 | + pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", unit="kernel", position=thr_idx+1, leave=False) |
74 | 74 | ||
75 | # Extract the angle values of c2'-endo and c3'-endo nucleotides | 75 | # Extract the angle values of c2'-endo and c3'-endo nucleotides |
76 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 76 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
... | @@ -203,25 +203,10 @@ def stats_len(): | ... | @@ -203,25 +203,10 @@ def stats_len(): |
203 | global idxQueue | 203 | global idxQueue |
204 | thr_idx = idxQueue.get() | 204 | thr_idx = idxQueue.get() |
205 | 205 | ||
206 | - # sort the RNA families so that the plot is readable | ||
207 | - def family_order(f): | ||
208 | - if f in LSU_set: | ||
209 | - return 4 | ||
210 | - elif f in SSU_set: | ||
211 | - return 3 | ||
212 | - elif f in ["RF00001"]: # | ||
213 | - return 1 # put tRNAs and 5S rRNAs first, | ||
214 | - elif f in ["RF00005"]: # because of the logarithmic scale, otherwise, they look tiny | ||
215 | - return 0 # | ||
216 | - else: | ||
217 | - return 2 | ||
218 | - | ||
219 | - fam_list.sort(key=family_order) | ||
220 | - | ||
221 | cols = [] | 206 | cols = [] |
222 | lengths = [] | 207 | lengths = [] |
223 | 208 | ||
224 | - for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False): | 209 | + for f in tqdm(famlist, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", unit="family", leave=False): |
225 | 210 | ||
226 | # Define a color for that family in the plot | 211 | # Define a color for that family in the plot |
227 | if f in LSU_set: | 212 | if f in LSU_set: |
... | @@ -249,7 +234,7 @@ def stats_len(): | ... | @@ -249,7 +234,7 @@ def stats_len(): |
249 | # Plot the figure | 234 | # Plot the figure |
250 | fig = plt.figure(figsize=(10,3)) | 235 | fig = plt.figure(figsize=(10,3)) |
251 | ax = fig.gca() | 236 | ax = fig.gca() |
252 | - ax.hist(lengths, bins=100, stacked=True, log=True, color=cols, label=fam_list) | 237 | + ax.hist(lengths, bins=100, stacked=True, log=True, color=cols, label=famlist) |
253 | ax.set_xlabel("Sequence length (nucleotides)", fontsize=8) | 238 | ax.set_xlabel("Sequence length (nucleotides)", fontsize=8) |
254 | ax.set_ylabel("Number of 3D chains", fontsize=8) | 239 | ax.set_ylabel("Number of 3D chains", fontsize=8) |
255 | ax.set_xlim(left=-150) | 240 | ax.set_xlim(left=-150) |
... | @@ -303,18 +288,18 @@ def stats_freq(): | ... | @@ -303,18 +288,18 @@ def stats_freq(): |
303 | 288 | ||
304 | # Initialize a Counter object for each family | 289 | # Initialize a Counter object for each family |
305 | freqs = {} | 290 | freqs = {} |
306 | - for f in fam_list: | 291 | + for f in famlist: |
307 | freqs[f] = Counter() | 292 | freqs[f] = Counter() |
308 | 293 | ||
309 | # List all nt_names happening within a RNA family and store the counts in the Counter | 294 | # List all nt_names happening within a RNA family and store the counts in the Counter |
310 | - for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False): | 295 | + for f in tqdm(famlist, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", unit="family", leave=False): |
311 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 296 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
312 | counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0)) | 297 | counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0)) |
313 | freqs[f].update(counts) | 298 | freqs[f].update(counts) |
314 | 299 | ||
315 | # Create a pandas DataFrame, and save it to CSV. | 300 | # Create a pandas DataFrame, and save it to CSV. |
316 | df = pd.DataFrame() | 301 | df = pd.DataFrame() |
317 | - for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False): | 302 | + for f in tqdm(famlist, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", unit="family", leave=False): |
318 | tot = sum(freqs[f].values()) | 303 | tot = sum(freqs[f].values()) |
319 | df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ]) | 304 | df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ]) |
320 | df = df.fillna(0) | 305 | df = df.fillna(0) |
... | @@ -322,12 +307,13 @@ def stats_freq(): | ... | @@ -322,12 +307,13 @@ def stats_freq(): |
322 | idxQueue.put(thr_idx) # replace the thread index in the queue | 307 | idxQueue.put(thr_idx) # replace the thread index in the queue |
323 | # notify("Saved nucleotide frequencies to CSV file.") | 308 | # notify("Saved nucleotide frequencies to CSV file.") |
324 | 309 | ||
310 | +@trace_unhandled_exceptions | ||
325 | def parallel_stats_pairs(f): | 311 | def parallel_stats_pairs(f): |
326 | """Counts occurrences of intra-chain base-pair types in one RNA family | 312 | """Counts occurrences of intra-chain base-pair types in one RNA family |
327 | 313 | ||
328 | REQUIRES tables chain, nucleotide up-to-date.""" | 314 | REQUIRES tables chain, nucleotide up-to-date.""" |
329 | 315 | ||
330 | - if path.isfile("data/"+f+"_pairs.csv") and path.isfile("data/"+f+"_counts.csv"): | 316 | + if path.isfile(runDir + "/data/"+f+"_pairs.csv") and path.isfile(runDir + "/data/"+f+"_counts.csv"): |
331 | return | 317 | return |
332 | 318 | ||
333 | # Get a worker number to position the progress bar | 319 | # Get a worker number to position the progress bar |
... | @@ -339,7 +325,7 @@ def parallel_stats_pairs(f): | ... | @@ -339,7 +325,7 @@ def parallel_stats_pairs(f): |
339 | chain_id_list = mappings_list[f] | 325 | chain_id_list = mappings_list[f] |
340 | data = [] | 326 | data = [] |
341 | sqldata = [] | 327 | sqldata = [] |
342 | - for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False): | 328 | + for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", unit="chain",leave=False): |
343 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 329 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
344 | # Get comma separated lists of basepairs per nucleotide | 330 | # Get comma separated lists of basepairs per nucleotide |
345 | interactions = pd.DataFrame( | 331 | interactions = pd.DataFrame( |
... | @@ -430,16 +416,19 @@ def parallel_stats_pairs(f): | ... | @@ -430,16 +416,19 @@ def parallel_stats_pairs(f): |
430 | 416 | ||
431 | idxQueue.put(thr_idx) # replace the thread index in the queue | 417 | idxQueue.put(thr_idx) # replace the thread index in the queue |
432 | 418 | ||
433 | -def to_dist_matrix(f): | 419 | +def to_id_matrix(f): |
420 | + """ | ||
421 | + Extracts sequences of 3D chains from the family alignments to a distinct STK file, | ||
422 | + then runs esl-alipid on it to get an identity matrix | ||
423 | + """ | ||
434 | if path.isfile("data/"+f+".npy"): | 424 | if path.isfile("data/"+f+".npy"): |
435 | - # notify(f"Computed {f} distance matrix", "loaded from file") | ||
436 | return 0 | 425 | return 0 |
437 | 426 | ||
438 | # Get a worker number to position the progress bar | 427 | # Get a worker number to position the progress bar |
439 | global idxQueue | 428 | global idxQueue |
440 | thr_idx = idxQueue.get() | 429 | thr_idx = idxQueue.get() |
441 | 430 | ||
442 | - setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_dist_matrix({f})") | 431 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_id_matrix({f})") |
443 | 432 | ||
444 | # Prepare a file | 433 | # Prepare a file |
445 | with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file: | 434 | with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file: |
... | @@ -452,14 +441,16 @@ def to_dist_matrix(f): | ... | @@ -452,14 +441,16 @@ def to_dist_matrix(f): |
452 | except ValueError as e: | 441 | except ValueError as e: |
453 | warn(e) | 442 | warn(e) |
454 | del al | 443 | del al |
455 | - subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"]) | 444 | + subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", # |
445 | + "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", # This run just deletes columns of gaps | ||
446 | + "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"]) # | ||
447 | + subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"]) | ||
456 | 448 | ||
457 | # Prepare the job | 449 | # Prepare the job |
458 | process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"), | 450 | process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"), |
459 | stdout=subprocess.PIPE, stderr=subprocess.PIPE) | 451 | stdout=subprocess.PIPE, stderr=subprocess.PIPE) |
460 | id_matrix = np.zeros((len(names), len(names))) | 452 | id_matrix = np.zeros((len(names), len(names))) |
461 | - | 453 | + pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", unit="comparisons", leave=False) |
462 | - pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False) | ||
463 | cnt = 0 | 454 | cnt = 0 |
464 | while not cnt or process.poll() is None: | 455 | while not cnt or process.poll() is None: |
465 | output = process.stdout.read() | 456 | output = process.stdout.read() |
... | @@ -482,8 +473,8 @@ def to_dist_matrix(f): | ... | @@ -482,8 +473,8 @@ def to_dist_matrix(f): |
482 | warn("\n".join([ line.decode('utf-8') for line in l ])) | 473 | warn("\n".join([ line.decode('utf-8') for line in l ])) |
483 | pbar.close() | 474 | pbar.close() |
484 | 475 | ||
485 | - subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"]) | ||
486 | np.save("data/"+f+".npy", id_matrix) | 476 | np.save("data/"+f+".npy", id_matrix) |
477 | + | ||
487 | idxQueue.put(thr_idx) # replace the thread index in the queue | 478 | idxQueue.put(thr_idx) # replace the thread index in the queue |
488 | return 0 | 479 | return 0 |
489 | 480 | ||
... | @@ -545,7 +536,7 @@ def seq_idty(): | ... | @@ -545,7 +536,7 @@ def seq_idty(): |
545 | fig.tight_layout() | 536 | fig.tight_layout() |
546 | fig.subplots_adjust(hspace=0.3, wspace=0.1) | 537 | fig.subplots_adjust(hspace=0.3, wspace=0.1) |
547 | fig.colorbar(im, ax=axs[-4], shrink=0.8) | 538 | fig.colorbar(im, ax=axs[-4], shrink=0.8) |
548 | - fig.savefig(runDir + f"/results/figures/distances.png") | 539 | + fig.savefig(runDir + f"/results/figures/distances_{res_thr}.png") |
549 | print("> Computed all identity matrices and saved the figure.", flush=True) | 540 | print("> Computed all identity matrices and saved the figure.", flush=True) |
550 | 541 | ||
551 | def stats_pairs(): | 542 | def stats_pairs(): |
... | @@ -559,10 +550,10 @@ def stats_pairs(): | ... | @@ -559,10 +550,10 @@ def stats_pairs(): |
559 | def line_format(family_data): | 550 | def line_format(family_data): |
560 | return family_data.apply(partial(format_percentage, sum(family_data))) | 551 | return family_data.apply(partial(format_percentage, sum(family_data))) |
561 | 552 | ||
562 | - if not path.isfile("data/pair_counts.csv"): | 553 | + if not path.isfile("data/pair_counts_{res_thr}.csv"): |
563 | results = [] | 554 | results = [] |
564 | allpairs = [] | 555 | allpairs = [] |
565 | - for f in fam_list: | 556 | + for f in famlist: |
566 | newpairs = pd.read_csv(runDir + f"/data/{f}_pairs.csv", index_col=0) | 557 | newpairs = pd.read_csv(runDir + f"/data/{f}_pairs.csv", index_col=0) |
567 | fam_df = pd.read_csv(runDir + f"/data/{f}_counts.csv", index_col=0) | 558 | fam_df = pd.read_csv(runDir + f"/data/{f}_counts.csv", index_col=0) |
568 | results.append(fam_df) | 559 | results.append(fam_df) |
... | @@ -571,11 +562,11 @@ def stats_pairs(): | ... | @@ -571,11 +562,11 @@ def stats_pairs(): |
571 | subprocess.run(["rm", "-f", runDir + f"/data/{f}_counts.csv"]) | 562 | subprocess.run(["rm", "-f", runDir + f"/data/{f}_counts.csv"]) |
572 | all_pairs = pd.concat(allpairs) | 563 | all_pairs = pd.concat(allpairs) |
573 | df = pd.concat(results).fillna(0) | 564 | df = pd.concat(results).fillna(0) |
574 | - df.to_csv("data/pair_counts.csv") | 565 | + df.to_csv(runDir + f"/data/pair_counts_{res_thr}.csv") |
575 | - all_pairs.to_csv("data/all_pairs.csv") | 566 | + all_pairs.to_csv(runDir + f"/data/all_pairs_{res_thr}.csv") |
576 | else: | 567 | else: |
577 | - df = pd.read_csv("data/pair_counts.csv", index_col=0) | 568 | + df = pd.read_csv(runDir + f"/data/pair_counts_{res_thr}.csv", index_col=0) |
578 | - all_pairs = pd.read_csv("data/all_pairs.csv", index_col=0) | 569 | + all_pairs = pd.read_csv(runDir + f"/data/all_pairs_{res_thr}.csv", index_col=0) |
579 | 570 | ||
580 | crosstab = pd.crosstab(all_pairs.pair_type_LW, all_pairs.basepair) | 571 | crosstab = pd.crosstab(all_pairs.pair_type_LW, all_pairs.basepair) |
581 | col_list = [ x for x in df.columns if '.' in x ] | 572 | col_list = [ x for x in df.columns if '.' in x ] |
... | @@ -613,7 +604,7 @@ def stats_pairs(): | ... | @@ -613,7 +604,7 @@ def stats_pairs(): |
613 | ax.set_ylabel("Number of observations (millions)", fontsize=13) | 604 | ax.set_ylabel("Number of observations (millions)", fontsize=13) |
614 | ax.set_xlabel(None) | 605 | ax.set_xlabel(None) |
615 | plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99) | 606 | plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99) |
616 | - plt.savefig(runDir + "/results/figures/pairings.png") | 607 | + plt.savefig(runDir + f"/results/figures/pairings_{res_thr}.png") |
617 | 608 | ||
618 | notify("Computed nucleotide statistics and saved CSV and PNG file.") | 609 | notify("Computed nucleotide statistics and saved CSV and PNG file.") |
619 | 610 | ||
... | @@ -916,8 +907,24 @@ def log_to_pbar(pbar): | ... | @@ -916,8 +907,24 @@ def log_to_pbar(pbar): |
916 | pbar.update(1) | 907 | pbar.update(1) |
917 | return update | 908 | return update |
918 | 909 | ||
910 | +def family_order(f): | ||
911 | + # sort the RNA families so that the plots are readable | ||
912 | + | ||
913 | + if f in LSU_set: | ||
914 | + return 4 | ||
915 | + elif f in SSU_set: | ||
916 | + return 3 | ||
917 | + elif f in ["RF00001"]: # | ||
918 | + return 1 # put tRNAs and 5S rRNAs first, | ||
919 | + elif f in ["RF00005"]: # because of the logarithmic scale of the lengths' figure, otherwise, they look tiny | ||
920 | + return 0 # | ||
921 | + else: | ||
922 | + return 2 | ||
923 | + | ||
919 | if __name__ == "__main__": | 924 | if __name__ == "__main__": |
920 | 925 | ||
926 | + os.makedirs(runDir + "/results/figures/", exist_ok=True) | ||
927 | + | ||
921 | # parse options | 928 | # parse options |
922 | DELETE_OLD_DATA = False | 929 | DELETE_OLD_DATA = False |
923 | DO_WADLEY_ANALYSIS = False | 930 | DO_WADLEY_ANALYSIS = False |
... | @@ -943,7 +950,7 @@ if __name__ == "__main__": | ... | @@ -943,7 +950,7 @@ if __name__ == "__main__": |
943 | print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything") | 950 | print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything") |
944 | sys.exit() | 951 | sys.exit() |
945 | elif opt == '--version': | 952 | elif opt == '--version': |
946 | - print("RNANet statistics 1.1 beta") | 953 | + print("RNANet statistics 1.2") |
947 | sys.exit() | 954 | sys.exit() |
948 | elif opt == "-r" or opt == "--resolution": | 955 | elif opt == "-r" or opt == "--resolution": |
949 | assert float(arg) > 0.0 and float(arg) <= 20.0 | 956 | assert float(arg) > 0.0 and float(arg) <= 20.0 |
... | @@ -959,31 +966,38 @@ if __name__ == "__main__": | ... | @@ -959,31 +966,38 @@ if __name__ == "__main__": |
959 | elif opt=='--from-scratch': | 966 | elif opt=='--from-scratch': |
960 | DELETE_OLD_DATA = True | 967 | DELETE_OLD_DATA = True |
961 | DO_WADLEY_ANALYSIS = True | 968 | DO_WADLEY_ANALYSIS = True |
962 | - subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"]) | ||
963 | elif opt=='--wadley': | 969 | elif opt=='--wadley': |
964 | DO_WADLEY_ANALYSIS = True | 970 | DO_WADLEY_ANALYSIS = True |
965 | 971 | ||
966 | 972 | ||
967 | - # Load mappings | 973 | + # Load mappings. famlist will contain only families with structures at this resolution threshold. |
968 | print("Loading mappings list...") | 974 | print("Loading mappings list...") |
969 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 975 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
970 | - fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ] | ||
971 | - mappings_list = {} | ||
972 | - for k in fam_list: | ||
973 | - mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain JOIN structure ON chain.structure_id=structure.pdb_id WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};") ] | ||
974 | - | ||
975 | - # List the families for which we will compute sequence identity matrices | ||
976 | - with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | ||
977 | - famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ] | ||
978 | - ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains < 3 ORDER BY rfam_acc ASC;") ] | ||
979 | n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0] | 976 | n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0] |
977 | + families = pd.read_sql(f"""SELECT rfam_acc, count(*) as n_chains | ||
978 | + FROM chain JOIN structure | ||
979 | + ON chain.structure_id = structure.pdb_id | ||
980 | + WHERE issue = 0 AND resolution <= {res_thr} AND rfam_acc != 'unmappd' | ||
981 | + GROUP BY rfam_acc; | ||
982 | + """, conn) | ||
983 | + families.drop(families[families.n_chains == 0].index, inplace=True) | ||
984 | + mappings_list = {} | ||
985 | + for k in families.rfam_acc: | ||
986 | + mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"""SELECT chain_id | ||
987 | + FROM chain JOIN structure ON chain.structure_id=structure.pdb_id | ||
988 | + WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};""") ] | ||
989 | + famlist = families.rfam_acc.tolist() | ||
990 | + ignored = families[families.n_chains < 3].rfam_acc.tolist() | ||
991 | + famlist.sort(key=family_order) | ||
992 | + print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.") | ||
980 | if len(ignored): | 993 | if len(ignored): |
981 | print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') | 994 | print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') |
982 | 995 | ||
983 | if DELETE_OLD_DATA: | 996 | if DELETE_OLD_DATA: |
984 | - for f in fam_list: | 997 | + for f in famlist: |
985 | subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"]) | 998 | subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"]) |
986 | - | 999 | + if DO_WADLEY_ANALYSIS: |
1000 | + subprocess.run(["rm","-f", runDir + f"/data/wadley_kernel_eta_{res_thr}.npz", runDir + f"/data/wadley_kernel_eta_prime_{res_thr}.npz", runDir + f"/data/pair_counts_{res_thr}.csv"]) | ||
987 | 1001 | ||
988 | # Prepare the multiprocessing execution environment | 1002 | # Prepare the multiprocessing execution environment |
989 | nworkers = min(read_cpu_number()-1, 32) | 1003 | nworkers = min(read_cpu_number()-1, 32) |
... | @@ -995,17 +1009,17 @@ if __name__ == "__main__": | ... | @@ -995,17 +1009,17 @@ if __name__ == "__main__": |
995 | # Define the tasks | 1009 | # Define the tasks |
996 | joblist = [] | 1010 | joblist = [] |
997 | if n_unmapped_chains and DO_WADLEY_ANALYSIS: | 1011 | if n_unmapped_chains and DO_WADLEY_ANALYSIS: |
998 | - joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 20.0))) # res threshold is 4.0 Angstroms by default | 1012 | + joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) |
999 | - joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 20.0))) # | 1013 | + joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) |
1000 | joblist.append(Job(function=stats_len)) # Computes figures | 1014 | joblist.append(Job(function=stats_len)) # Computes figures |
1001 | joblist.append(Job(function=stats_freq)) # updates the database | 1015 | joblist.append(Job(function=stats_freq)) # updates the database |
1002 | for f in famlist: | 1016 | for f in famlist: |
1003 | joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database | 1017 | joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database |
1004 | if f not in ignored: | 1018 | if f not in ignored: |
1005 | - joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database | 1019 | + joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database |
1006 | 1020 | ||
1007 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) | 1021 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) |
1008 | - pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True) | 1022 | + pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True) |
1009 | 1023 | ||
1010 | try: | 1024 | try: |
1011 | for j in joblist: | 1025 | for j in joblist: |
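In statistics.py, the per-resolution family list is now derived from a single chain-count query filtered by the resolution threshold. A self-contained sketch of that pattern on a toy in-memory database — the table and column names mirror the query in the hunk above, while the inserted rows and the 4.0 Å threshold are illustrative:

```python
import sqlite3
import pandas as pd

res_thr = 4.0  # resolution threshold in Angstroms (illustrative value)

with sqlite3.connect(":memory:") as conn:
    conn.executescript("""
        CREATE TABLE structure (pdb_id TEXT PRIMARY KEY, resolution REAL);
        CREATE TABLE chain (chain_id INTEGER PRIMARY KEY, structure_id TEXT,
                            rfam_acc TEXT, issue INTEGER);
        INSERT INTO structure VALUES ('1abc', 2.5), ('2xyz', 6.0);
        INSERT INTO chain VALUES (1, '1abc', 'RF00005', 0),
                                 (2, '2xyz', 'RF00005', 0),
                                 (3, '1abc', 'RF00001', 0),
                                 (4, '1abc', 'unmappd', 0);
    """)
    families = pd.read_sql(f"""SELECT rfam_acc, count(*) AS n_chains
                               FROM chain JOIN structure
                               ON chain.structure_id = structure.pdb_id
                               WHERE issue = 0 AND resolution <= {res_thr}
                                 AND rfam_acc != 'unmappd'
                               GROUP BY rfam_acc;""", conn)

# Only chains from structures at res_thr or better are counted,
# and unmapped chains are excluded, as in the patched __main__ block.
print(families)
```

Sorting the resulting accession list with the module-level `family_order` key is what keeps small families such as tRNAs (RF00005) and 5S rRNAs (RF00001) readable on the log-scaled length histogram.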