Louis BECQUEY

Lower RAM usage

@@ -436,14 +436,14 @@ class Chain:
                 return
 
             # Creating a df for easy saving to CSV
-            df.to_csv(path_to_3D_data + f"annotations/{self.chain_label}.{self.rfam}.csv")
+            df.to_csv(path_to_3D_data + f"annotations/{self.chain_label}.{self.rfam_fam}.csv")
             del df
             print("\t> Saved", self.chain_label, f"annotations to CSV.\t\t{validsymb}", flush=True)
         else:
             print("\t> Computing", self.chain_label, f"annotations...\t{validsymb}\t(already done)", flush=True)
 
         # Now load data from the CSV file
-        d = pd.read_csv(path_to_3D_data+f"annotations/{self.chain_label}.{self.rfam}.csv", index_col=0)
+        d = pd.read_csv(path_to_3D_data+f"annotations/{self.chain_label}.{self.rfam_fam}.csv", index_col=0)
         self.seq = "".join(d.nt_code.values)
         self.aligned_seq = "".join(d.nt_align_code.values)
         self.length = len([ x for x in self.aligned_seq if x != "-" ])
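
Note on the hunk above: the per-chain annotations are written once to CSV, the DataFrame is freed, and the data is reloaded from disk only when needed. A minimal sketch of that round trip, with a hypothetical file name and toy columns (not taken from the script):

import pandas as pd

# Toy stand-in for the per-chain annotation table (nt_code / nt_align_code columns)
df = pd.DataFrame({"nt_code": ["A", "C", "G", "U"],
                   "nt_align_code": ["A", "-", "G", "U"]})
df.to_csv("example_chain.csv")   # hypothetical path, plays the role of annotations/<chain>.<family>.csv
del df                           # the in-memory DataFrame is no longer needed once it is on disk

d = pd.read_csv("example_chain.csv", index_col=0)
seq = "".join(d.nt_code.values)
aligned = "".join(d.nt_align_code.values)
length = len([x for x in aligned if x != "-"])   # gap characters are not counted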
@@ -561,11 +561,9 @@ class Chain:
                 'alpha','beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi',
                 'bb_type','glyco_bond','form','ssZp','Dp',
                 'eta','theta','eta_prime','theta_prime','eta_base','theta_base',
-                'v0', 'v1', 'v2', 'v3', 'v4', 'amplitude', 'phase_angle', 'puckering',
-                'P_x','P_y','P_z','C5prime_x','C5prime_y','C5prime_z'
+                'v0', 'v1', 'v2', 'v3', 'v4', 'amplitude', 'phase_angle', 'puckering'
                ]
         self.data = self.data[cols]
-        self.save() # save to file
 
     def save(self, fformat = "csv"):
         # save to file
@@ -1310,6 +1308,7 @@ def alignment_nt_stats(f):
     # Compute statistics per column
     pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx)
     frequencies = np.array([ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ]).T
+    del pssm
 
     # For each sequence, find the right chain and save the PSSMs inside.
     pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} chains", leave=False)
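
The `del pssm` added above is a condense-then-free pattern: once the per-column counts are summarized into a compact numpy array, the larger intermediate can be dropped. A rough sketch of the idea, with a simplified stand-in for summarize_position and for the pssm layout (both hypothetical here):

import numpy as np

# Large intermediate: one dict of base counts per alignment column (toy data)
pssm = [{"A": 3, "C": 1, "G": 0, "U": 1} for _ in range(1000)]

def summarize_position(counts):
    # simplified stand-in: relative frequencies of A, C, G, U at one column
    total = sum(counts.values())
    return [counts[b] / total for b in "ACGU"]

frequencies = np.array([summarize_position(col) for col in pssm]).T   # shape (4, n_columns)
del pssm   # only the compact frequencies array stays resident from here on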
@@ -1320,11 +1319,18 @@ def alignment_nt_stats(f):
 
         # get the right 3D chain:
         idx = chains_ids.index(s.id)
+
+        # call its method to set its frequencies, and save it
         list_of_chains[idx].set_freqs_from_aln(s.seq, frequencies)
+        list_of_chains[idx].save(fformat='csv')
+
+        del list_of_chains[idx] # saves a bit of memory because of the Chain object sizes
+        del chains_ids[idx]     # to keep indexes aligned with list_of_chains
         pbar.update(1)
-    pbar.close()
 
+    pbar.close()
 
+    del rfam_acc_to_download[f] # We won't need this family's chain objects anymore, free up
     idxQueue.put(thr_idx) # replace the thread index in the queue
     return 0
 
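
The paired deletions in this hunk (del list_of_chains[idx] followed by del chains_ids[idx]) rely on the two lists staying index-aligned: each chain is persisted first, then removed from both lists at the same index. A self-contained sketch of that pattern with placeholder names (Big, ids, objs are illustrative, not from the script):

class Big:
    # stand-in for a large Chain-like object
    def __init__(self, name):
        self.name = name
        self.payload = bytearray(10**6)   # pretend this is big
    def save(self):
        pass                              # the real object would write itself to CSV here

ids = ["A", "B", "C"]
objs = [Big(n) for n in ids]

for wanted in ("B", "A"):
    idx = ids.index(wanted)
    objs[idx].save()   # persist before freeing
    del objs[idx]      # drop the large object so it can be garbage-collected
    del ids[idx]       # delete at the same index so both lists stay aligned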
@@ -1551,7 +1557,8 @@ if __name__ == "__main__":
         pdb_chain_id = nr[2].upper()
         chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
         all_chains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label))
-
+
+    del full_structures_list
     n_chains = len(all_chains)
     print(">", validsymb, n_chains, "RNA chains of interest.")
 
@@ -1586,6 +1593,8 @@ if __name__ == "__main__":
 
     print(f"> Loaded {len(loaded_chains)} RNA chains ({len(all_chains) - len(loaded_chains)} errors).")
     del all_chains # Here ends its utility, so let's free some memory
+    del joblist
+    del results
 
     if not HOMOLOGY:
         # Save chains to file
@@ -1613,7 +1622,7 @@ if __name__ == "__main__":
         rfam_acc_to_download[c.rfam_fam].append(c)
         mappings_list[c.rfam_fam].append(c.chain_label)
     pd.DataFrame.from_dict(mappings_list, orient='index').transpose().to_csv(path_to_seq_data + "realigned/mappings_list.csv")
-    exit()
+    del mappings_list
     print(f"> Identified {len(rfam_acc_to_download.keys())} families to download and re-align with the crystals' sequences:")
 
     # Download the covariance models for all families
@@ -1632,6 +1641,7 @@ if __name__ == "__main__":
     for f in fam_list:
         line = fam_stats[fam_stats["rfam_acc"]==f]
         print(f"\t> {f}: {line.n_seq.values[0]} Rfam hits + {line.n_pdb_seqs.values[0]} PDB sequences to realign")
+    del fam_stats
 
     # Download the sequences
     for f in fam_list:
@@ -1650,6 +1660,7 @@ if __name__ == "__main__":
 
     # Execute the jobs
     execute_joblist(fulljoblist, printstats=True) # printstats=True will show a summary of time/memory usage of the jobs
+    del fulljoblist
 
     # ==========================================================================================
     # Now compute statistics on base variants at each position of every 3D chain
@@ -1669,7 +1680,7 @@ if __name__ == "__main__":
 
     # Start a process pool to dispatch the RNA families,
     # over multiple CPUs (one family by CPU)
-    p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=ncores)
+    p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=int(ncores/2))
 
     fam_pbar = tqdm(total=len(fam_list), desc="RNA families", position=0, leave=True)
     for i, _ in enumerate(p.imap_unordered(alignment_nt_stats, fam_list)): # Apply alignment_nt_stats to each RNA family
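
Halving the worker count (processes=int(ncores/2)) trades some parallelism for a lower peak of resident per-family data, since each worker holds a whole family's chains and alignment at once. A minimal, self-contained sketch of this Pool setup; the work function and item count are placeholders, only the initializer/initargs/processes arguments mirror the change above:

from multiprocessing import Pool, cpu_count
from tqdm import tqdm

def work(x):
    return x * x   # placeholder for the per-family job

if __name__ == "__main__":
    ncores = cpu_count()
    # Share tqdm's lock with the workers so their progress bars don't interleave badly,
    # and cap the pool at half the cores to keep peak memory lower.
    p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),),
             processes=max(1, int(ncores / 2)))
    pbar = tqdm(total=10, desc="items", position=0, leave=True)
    for _ in p.imap_unordered(work, range(10)):
        pbar.update(1)
    pbar.close()
    p.close()
    p.join()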
...