Showing 2 changed files with 20 additions and 9 deletions
... | @@ -436,14 +436,14 @@ class Chain: | ... | @@ -436,14 +436,14 @@ class Chain: |
436 | return | 436 | return |
437 | 437 | ||
438 | # Creating a df for easy saving to CSV | 438 | # Creating a df for easy saving to CSV |
439 | - df.to_csv(path_to_3D_data + f"annotations/{self.chain_label}.{self.rfam}.csv") | 439 | + df.to_csv(path_to_3D_data + f"annotations/{self.chain_label}.{self.rfam_fam}.csv") |
440 | del df | 440 | del df |
441 | print("\t> Saved", self.chain_label, f"annotations to CSV.\t\t{validsymb}", flush=True) | 441 | print("\t> Saved", self.chain_label, f"annotations to CSV.\t\t{validsymb}", flush=True) |
442 | else: | 442 | else: |
443 | print("\t> Computing", self.chain_label, f"annotations...\t{validsymb}\t(already done)", flush=True) | 443 | print("\t> Computing", self.chain_label, f"annotations...\t{validsymb}\t(already done)", flush=True) |
444 | 444 | ||
445 | # Now load data from the CSV file | 445 | # Now load data from the CSV file |
446 | - d = pd.read_csv(path_to_3D_data+f"annotations/{self.chain_label}.{self.rfam}.csv", index_col=0) | 446 | + d = pd.read_csv(path_to_3D_data+f"annotations/{self.chain_label}.{self.rfam_fam}.csv", index_col=0) |
447 | self.seq = "".join(d.nt_code.values) | 447 | self.seq = "".join(d.nt_code.values) |
448 | self.aligned_seq = "".join(d.nt_align_code.values) | 448 | self.aligned_seq = "".join(d.nt_align_code.values) |
449 | self.length = len([ x for x in self.aligned_seq if x != "-" ]) | 449 | self.length = len([ x for x in self.aligned_seq if x != "-" ]) |
... | @@ -561,11 +561,9 @@ class Chain: | ... | @@ -561,11 +561,9 @@ class Chain: |
561 | 'alpha','beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi', | 561 | 'alpha','beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi', |
562 | 'bb_type','glyco_bond','form','ssZp','Dp', | 562 | 'bb_type','glyco_bond','form','ssZp','Dp', |
563 | 'eta','theta','eta_prime','theta_prime','eta_base','theta_base', | 563 | 'eta','theta','eta_prime','theta_prime','eta_base','theta_base', |
564 | - 'v0', 'v1', 'v2', 'v3', 'v4', 'amplitude', 'phase_angle', 'puckering', | 564 | + 'v0', 'v1', 'v2', 'v3', 'v4', 'amplitude', 'phase_angle', 'puckering' |
565 | - 'P_x','P_y','P_z','C5prime_x','C5prime_y','C5prime_z' | ||
566 | ] | 565 | ] |
567 | self.data = self.data[cols] | 566 | self.data = self.data[cols] |
568 | - self.save() # save to file | ||
569 | 567 | ||
570 | def save(self, fformat = "csv"): | 568 | def save(self, fformat = "csv"): |
571 | # save to file | 569 | # save to file |
... | @@ -1310,6 +1308,7 @@ def alignment_nt_stats(f): | ... | @@ -1310,6 +1308,7 @@ def alignment_nt_stats(f): |
1310 | # Compute statistics per column | 1308 | # Compute statistics per column |
1311 | pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx) | 1309 | pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx) |
1312 | frequencies = np.array([ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ]).T | 1310 | frequencies = np.array([ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ]).T |
1311 | + del pssm | ||
1313 | 1312 | ||
1314 | # For each sequence, find the right chain and save the PSSMs inside. | 1313 | # For each sequence, find the right chain and save the PSSMs inside. |
1315 | pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} chains", leave=False) | 1314 | pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} chains", leave=False) |
... | @@ -1320,11 +1319,18 @@ def alignment_nt_stats(f): | ... | @@ -1320,11 +1319,18 @@ def alignment_nt_stats(f): |
1320 | 1319 | ||
1321 | # get the right 3D chain: | 1320 | # get the right 3D chain: |
1322 | idx = chains_ids.index(s.id) | 1321 | idx = chains_ids.index(s.id) |
1322 | + | ||
1323 | + # call its method to set its frequencies, and save it | ||
1323 | list_of_chains[idx].set_freqs_from_aln(s.seq, frequencies) | 1324 | list_of_chains[idx].set_freqs_from_aln(s.seq, frequencies) |
1325 | + list_of_chains[idx].save(fformat='csv') | ||
1326 | + | ||
1327 | + del list_of_chains[idx] # saves a bit of memory because of the Chain object sizes | ||
1328 | + del chains_ids[idx] # to keep indexes aligned with list_of_chains | ||
1324 | pbar.update(1) | 1329 | pbar.update(1) |
1325 | - pbar.close() | ||
1326 | 1330 | ||
1331 | + pbar.close() | ||
1327 | 1332 | ||
1333 | + del rfam_acc_to_download[f] # We won't need this family's chain objects anymore, free up | ||
1328 | idxQueue.put(thr_idx) # replace the thread index in the queue | 1334 | idxQueue.put(thr_idx) # replace the thread index in the queue |
1329 | return 0 | 1335 | return 0 |
1330 | 1336 | ||
... | @@ -1551,7 +1557,8 @@ if __name__ == "__main__": | ... | @@ -1551,7 +1557,8 @@ if __name__ == "__main__": |
1551 | pdb_chain_id = nr[2].upper() | 1557 | pdb_chain_id = nr[2].upper() |
1552 | chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}" | 1558 | chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}" |
1553 | all_chains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label)) | 1559 | all_chains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label)) |
1554 | - | 1560 | + |
1561 | + del full_structures_list | ||
1555 | n_chains = len(all_chains) | 1562 | n_chains = len(all_chains) |
1556 | print(">", validsymb, n_chains, "RNA chains of interest.") | 1563 | print(">", validsymb, n_chains, "RNA chains of interest.") |
1557 | 1564 | ||
... | @@ -1586,6 +1593,8 @@ if __name__ == "__main__": | ... | @@ -1586,6 +1593,8 @@ if __name__ == "__main__": |
1586 | 1593 | ||
1587 | print(f"> Loaded {len(loaded_chains)} RNA chains ({len(all_chains) - len(loaded_chains)} errors).") | 1594 | print(f"> Loaded {len(loaded_chains)} RNA chains ({len(all_chains) - len(loaded_chains)} errors).") |
1588 | del all_chains # Here ends its utility, so let's free some memory | 1595 | del all_chains # Here ends its utility, so let's free some memory |
1596 | + del joblist | ||
1597 | + del results | ||
1589 | 1598 | ||
1590 | if not HOMOLOGY: | 1599 | if not HOMOLOGY: |
1591 | # Save chains to file | 1600 | # Save chains to file |
... | @@ -1613,7 +1622,7 @@ if __name__ == "__main__": | ... | @@ -1613,7 +1622,7 @@ if __name__ == "__main__": |
1613 | rfam_acc_to_download[c.rfam_fam].append(c) | 1622 | rfam_acc_to_download[c.rfam_fam].append(c) |
1614 | mappings_list[c.rfam_fam].append(c.chain_label) | 1623 | mappings_list[c.rfam_fam].append(c.chain_label) |
1615 | pd.DataFrame.from_dict(mappings_list, orient='index').transpose().to_csv(path_to_seq_data + "realigned/mappings_list.csv") | 1624 | pd.DataFrame.from_dict(mappings_list, orient='index').transpose().to_csv(path_to_seq_data + "realigned/mappings_list.csv") |
1616 | - exit() | 1625 | + del mappings_list |
1617 | print(f"> Identified {len(rfam_acc_to_download.keys())} families to download and re-align with the crystals' sequences:") | 1626 | print(f"> Identified {len(rfam_acc_to_download.keys())} families to download and re-align with the crystals' sequences:") |
1618 | 1627 | ||
1619 | # Download the covariance models for all families | 1628 | # Download the covariance models for all families |
... | @@ -1632,6 +1641,7 @@ if __name__ == "__main__": | ... | @@ -1632,6 +1641,7 @@ if __name__ == "__main__": |
1632 | for f in fam_list: | 1641 | for f in fam_list: |
1633 | line = fam_stats[fam_stats["rfam_acc"]==f] | 1642 | line = fam_stats[fam_stats["rfam_acc"]==f] |
1634 | print(f"\t> {f}: {line.n_seq.values[0]} Rfam hits + {line.n_pdb_seqs.values[0]} PDB sequences to realign") | 1643 | print(f"\t> {f}: {line.n_seq.values[0]} Rfam hits + {line.n_pdb_seqs.values[0]} PDB sequences to realign") |
1644 | + del fam_stats | ||
1635 | 1645 | ||
1636 | # Download the sequences | 1646 | # Download the sequences |
1637 | for f in fam_list: | 1647 | for f in fam_list: |
... | @@ -1650,6 +1660,7 @@ if __name__ == "__main__": | ... | @@ -1650,6 +1660,7 @@ if __name__ == "__main__": |
1650 | 1660 | ||
1651 | # Execute the jobs | 1661 | # Execute the jobs |
1652 | execute_joblist(fulljoblist, printstats=True) # printstats=True will show a summary of time/memory usage of the jobs | 1662 | execute_joblist(fulljoblist, printstats=True) # printstats=True will show a summary of time/memory usage of the jobs |
1663 | + del fulljoblist | ||
1653 | 1664 | ||
1654 | # ========================================================================================== | 1665 | # ========================================================================================== |
1655 | # Now compute statistics on base variants at each position of every 3D chain | 1666 | # Now compute statistics on base variants at each position of every 3D chain |
... | @@ -1669,7 +1680,7 @@ if __name__ == "__main__": | ... | @@ -1669,7 +1680,7 @@ if __name__ == "__main__": |
1669 | 1680 | ||
1670 | # Start a process pool to dispatch the RNA families, | 1681 | # Start a process pool to dispatch the RNA families, |
1671 | # over multiple CPUs (one family by CPU) | 1682 | # over multiple CPUs (one family by CPU) |
1672 | - p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=ncores) | 1683 | + p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=int(ncores/2)) |
1673 | 1684 | ||
1674 | fam_pbar = tqdm(total=len(fam_list), desc="RNA families", position=0, leave=True) | 1685 | fam_pbar = tqdm(total=len(fam_list), desc="RNA families", position=0, leave=True) |
1675 | for i, _ in enumerate(p.imap_unordered(alignment_nt_stats, fam_list)): # Apply alignment_nt_stats to each RNA family | 1686 | for i, _ in enumerate(p.imap_unordered(alignment_nt_stats, fam_list)): # Apply alignment_nt_stats to each RNA family | ... | ... |
This diff is collapsed. Click to expand it.
-
Please register or log in to post a comment