Lower RAM usage

Louis BECQUEY
Commit 47f8a2ba6da54f0a107340870c807df481d46d34 47f8a2ba 1 parent c9882ee5
Showing 2 changed files with 174 additions and 54 deletions
RNAnet.py
statistics.py
--- a/RNAnet.py
View file @47f8a2b
+++ b/RNAnet.py
View file @47f8a2b
@@ -436,14 +436,14 @@ class Chain:
                 return
             
             # Creating a df for easy saving to CSV
-             df.to_csv(path_to_3D_data + f"annotations/{self.chain_label}.{self.rfam}.csv")
+             df.to_csv(path_to_3D_data + f"annotations/{self.chain_label}.{self.rfam_fam}.csv")
             del df
             print("\t> Saved", self.chain_label, f"annotations to CSV.\t\t{validsymb}", flush=True)
         else:
             print("\t> Computing", self.chain_label, f"annotations...\t{validsymb}\t(already done)", flush=True)
 
         # Now load data from the CSV file
-         d = pd.read_csv(path_to_3D_data+f"annotations/{self.chain_label}.{self.rfam}.csv", index_col=0)
+         d = pd.read_csv(path_to_3D_data+f"annotations/{self.chain_label}.{self.rfam_fam}.csv", index_col=0)
         self.seq = "".join(d.nt_code.values)
         self.aligned_seq = "".join(d.nt_align_code.values)
         self.length = len([ x for x in self.aligned_seq if x != "-" ])
@@ -561,11 +561,9 @@ class Chain:
                 'alpha','beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi',
                 'bb_type','glyco_bond','form','ssZp','Dp',
                 'eta','theta','eta_prime','theta_prime','eta_base','theta_base',
-                 'v0', 'v1', 'v2', 'v3', 'v4', 'amplitude', 'phase_angle', 'puckering', 
-                 'P_x','P_y','P_z','C5prime_x','C5prime_y','C5prime_z'
+                 'v0', 'v1', 'v2', 'v3', 'v4', 'amplitude', 'phase_angle', 'puckering'
                ]
         self.data = self.data[cols]
-         self.save() # save to file
         
     def save(self, fformat = "csv"):
         # save to file
@@ -1310,6 +1308,7 @@ def alignment_nt_stats(f):
     # Compute statistics per column
     pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx)
     frequencies = np.array([ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ]).T
+     del pssm
 
     # For each sequence, find the right chain and save the PSSMs inside.
     pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} chains", leave=False)
@@ -1320,11 +1319,18 @@ def alignment_nt_stats(f):
 
         # get the right 3D chain:
         idx = chains_ids.index(s.id)
+ 
+         # call its method to set its frequencies, and save it
         list_of_chains[idx].set_freqs_from_aln(s.seq, frequencies)
+         list_of_chains[idx].save(fformat='csv')
+ 
+         del list_of_chains[idx]  # saves a bit of memory because of the Chain object sizes
+         del chains_ids[idx]      # to keep indexes aligned with list_of_chains
         pbar.update(1)
-     pbar.close()
 
+     pbar.close()
 
+     del rfam_acc_to_download[f] # We won't need this family's chain objects anymore, free up
     idxQueue.put(thr_idx) # replace the thread index in the queue
     return 0
 
@@ -1551,7 +1557,8 @@ if __name__ == "__main__":
                 pdb_chain_id = nr[2].upper()
                 chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
                 all_chains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label))
-                         
+ 
+     del full_structures_list
     n_chains = len(all_chains)
     print(">", validsymb, n_chains, "RNA chains of interest.")
 
@@ -1586,6 +1593,8 @@ if __name__ == "__main__":
 
     print(f"> Loaded {len(loaded_chains)} RNA chains ({len(all_chains) - len(loaded_chains)} errors).")
     del all_chains # Here ends its utility, so let's free some memory
+     del joblist
+     del results
 
     if not HOMOLOGY:
         # Save chains to file
@@ -1613,7 +1622,7 @@ if __name__ == "__main__":
             rfam_acc_to_download[c.rfam_fam].append(c)
             mappings_list[c.rfam_fam].append(c.chain_label)
     pd.DataFrame.from_dict(mappings_list, orient='index').transpose().to_csv(path_to_seq_data + "realigned/mappings_list.csv")
-     exit()
+     del mappings_list
     print(f"> Identified {len(rfam_acc_to_download.keys())} families to download and re-align with the crystals' sequences:")
 
     # Download the covariance models for all families
@@ -1632,6 +1641,7 @@ if __name__ == "__main__":
         for f in fam_list:
             line = fam_stats[fam_stats["rfam_acc"]==f]
             print(f"\t> {f}: {line.n_seq.values[0]} Rfam hits + {line.n_pdb_seqs.values[0]} PDB sequences to realign")
+     del fam_stats
 
     # Download the sequences
     for f in fam_list:
@@ -1650,6 +1660,7 @@ if __name__ == "__main__":
     
     # Execute the jobs
     execute_joblist(fulljoblist, printstats=True) # printstats=True will show a summary of time/memory usage of the jobs
+     del fulljoblist
 
     # ==========================================================================================
     # Now compute statistics on base variants at each position of every 3D chain
@@ -1669,7 +1680,7 @@ if __name__ == "__main__":
 
     # Start a process pool to dispatch the RNA families,
     # over multiple CPUs (one family by CPU)
-     p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=ncores)
+     p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=int(ncores/2))
 
     fam_pbar = tqdm(total=len(fam_list), desc="RNA families", position=0, leave=True) 
     for i, _ in enumerate(p.imap_unordered(alignment_nt_stats, fam_list)): # Apply alignment_nt_stats to each RNA family
--- a/statistics.py
View file @47f8a2b
+++ b/statistics.py
View file @47f8a2b
@@ -3,12 +3,16 @@ import os
 import numpy as np
 import pandas as pd
 import threading as th
+ import seaborn as sb
 import scipy.stats as st
 import matplotlib.pyplot as plt
 import matplotlib.patches as ptch
 from mpl_toolkits.mplot3d import axes3d
+ from Bio.Phylo.TreeConstruction import DistanceCalculator
+ from Bio import AlignIO
 from matplotlib import cm 
 from tqdm import tqdm
+ from functools import partial
 from multiprocessing import Pool
 from RNAnet import read_cpu_number
 
@@ -29,35 +33,74 @@ else:
     print("I don't know that machine... I'm shy, maybe you should introduce yourself ?")
     exit(1)
 
+ class DataPoint():
+     def __init__(self, path_to_textfile):
+         self.df = pd.read_csv(path_to_textfile, sep=',', header=0, engine="c", index_col=0)
+         self.family = path_to_textfile.split('.')[-1]
+         self.chain_label = path_to_textfile.split('.')[-2].split('/')[-1]
+ 
 def load_rna_frome_file(path_to_textfile):
-     return pd.read_csv(path_to_textfile, sep=',', header=0, engine="c", index_col=0)
+     return DataPoint(path_to_textfile)
+ 
+ def reproduce_wadley_results(points, show=False, filter_helical=None, carbon=4, zone=(2.7,3.3,3.5,4.5)):
+     """
+     Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph.
+     See Wadley & Pyle (2007)
+     """
+ 
+     if carbon == 4:
+         angle = "eta"
+         xlabel = "$\\eta=C_4'^{i-1}-P^i-C_4'^i-P^{i+1}$"
+         ylabel = "$\\theta=P^i-C_4'^i-P^{i+1}-C_4'^{i+1}$"
+     elif carbon == 1:
+         angle = "eta_prime"
+         xlabel = "$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$"
+         ylabel = "$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$"
+     else:
+         exit("You overestimate my capabilities !")
 
- def reproduce_wadley_results(dfs, show=True):
     all_etas = []
     all_thetas = []
     all_forms = []
     c = 0
-     for df in dfs:
-         all_etas += list(df['eta'].values)
-         all_thetas += list(df['theta'].values)
-         all_forms += list(df['form'].values)
-         if (len([ x for x in df['eta'].values if x < 0 or x > 7]) or 
-             len([ x for x in df['theta'].values if x < 0 or x > 7])):
+     for p in points:
+         all_etas += list(p.df[angle].values)
+         all_thetas += list(p.df['th'+angle].values)
+         all_forms += list(p.df['form'].values)
+         if (len([ x for x in p.df[angle].values if x < 0 or x > 7]) or 
+             len([ x for x in p.df['th'+angle].values if x < 0 or x > 7])):
             c += 1
-     print(c,"points on",len(dfs),"have non-radian angles !")
- 
+     if c:
+         print(c,"points on",len(points),"have non-radian angles !")
 
     print("combining etas and thetas...")
-     # # increase all the angles by 180°
-     # alldata = [ ((e+360)%360-180, (t+360)%360-180) 
-     #             for e, t in zip(all_etas, all_thetas) 
-     #             if ('nan' not in str((e,t))) 
-     #             and not(e<-150 and t<-110) and not (e>160 and t<-110) ]
-     alldata = [ (e, t) 
-                 for e, t, f in zip(all_etas, all_thetas, all_forms) 
-                 if ('nan' not in str((e,t))) 
-                 and f == '.' ]
-     print(len(alldata), "couples of non-helical nts found.")
+     warn = ""
+     if not filter_helical:
+         alldata = [ (e, t) 
+                     for e, t in zip(all_etas, all_thetas) 
+                     if ('nan' not in str((e,t)))  ]
+     elif filter_helical == "form":
+         alldata = [ (e, t) 
+                     for e, t, f in zip(all_etas, all_thetas, all_forms) 
+                     if ('nan' not in str((e,t))) 
+                     and f == '.' ]
+         warn = "(helical nucleotides removed)"
+         print(len(alldata), "couples of non-helical nts found in", len(all_etas))
+     elif filter_helical == "zone":
+         alldata = [ (e, t) 
+                     for e, t in zip(all_etas, all_thetas) 
+                     if ('nan' not in str((e,t))) 
+                     and not (e>zone[0] and e<zone[1] and t>zone[2] and t<zone[3]) ]
+         warn = "(massive peak of helical nucleotides removed in red zone)"
+         print(len(alldata), "couples of non-helical nts found in", len(all_etas))
+     elif filter_helical == "both":
+         alldata = [ (e, t) 
+                     for e, t, f in zip(all_etas, all_thetas, all_forms) 
+                     if ('nan' not in str((e,t))) 
+                     and f == '.'
+                     and not (e>zone[0] and e<zone[1] and t>zone[2] and t<zone[3]) ]
+         warn = "(helical nucleotide and massive peak in the red zone removed)"
+         print(len(alldata), "couples of non-helical nts found in", len(all_etas))
 
     x = np.array([ p[0] for p in alldata ])
     y = np.array([ p[1] for p in alldata ])
@@ -75,38 +118,78 @@ def reproduce_wadley_results(dfs, show=True):
     # histogram : 
     fig, axs = plt.subplots(1,3, figsize=(18, 6))
     ax = fig.add_subplot(131)
+     ax.cla()
     
     plt.axhline(y=0, alpha=0.5, color='black')
     plt.axvline(x=0, alpha=0.5, color='black')
     plt.scatter(x, y, s=1, alpha=0.1)
     plt.contourf(xx, yy, z, cmap=cm.BuPu, alpha=0.5)
-     ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$")
-     ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$")
-     # ax.add_patch(ptch.Rectangle((-20,0),50,70, linewidth=1, edgecolor='r', facecolor='#ff000080'))
+     ax.set_xlabel(xlabel)
+     ax.set_ylabel(ylabel)
+     if filter_helical in ["zone","both"]:
+         ax.add_patch(ptch.Rectangle((zone[0],zone[2]),zone[1]-zone[0],zone[3]-zone[2], linewidth=1, edgecolor='r', facecolor='#ff000080'))
 
     ax = fig.add_subplot(132, projection='3d')
+     ax.cla()
     ax.plot_surface(xx, yy, z_inc, cmap=cm.coolwarm, linewidth=0, antialiased=True)
-     ax.set_title("\"Wadley plot\"\n$\\eta'$, $\\theta'$ pseudotorsions in 3D RNA structures\n(Massive peak removed in the red zone, = double helices)")
-     ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$")
-     ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$")
+     ax.set_title(f"\"Wadley plot\" of {len(alldata)} nucleotides\nJoint distribution of pseudotorsions in 3D RNA structures\n" + warn)
+     ax.set_xlabel(xlabel)
+     ax.set_ylabel(ylabel)
 
     ax = fig.add_subplot(133, projection='3d')
-     hist, xedges, yedges = np.histogram2d(x, y, bins=300, range=[[xmin, xmax], [ymin, ymax]])
+     ax.cla()
+     hist, xedges, yedges = np.histogram2d(x, y, bins=200, range=[[xmin, xmax], [ymin, ymax]])
     xpos, ypos = np.meshgrid(xedges[:-1], yedges[:-1], indexing="ij")
-     ax.bar3d(xpos.ravel(), ypos.ravel(), 0, 0.5, 0.5, hist.ravel(), zsort='average')
-     ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$")
-     ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$")
-     plt.savefig("results/clusters_rot180.png")
+     ax.bar3d(xpos.ravel(), ypos.ravel(), 0, 0.2, 0.2, hist.ravel(), zsort='average')
+     ax.set_xlabel(xlabel)
+     ax.set_ylabel(ylabel)
+     plt.savefig(f"results/wadley_{angle}_{filter_helical}.png")
     if show:
         plt.show()
 
- def stats_len(dfs):
-     lengths = []
-     full_lengths = []
-     for r in dfs:
-         nt_codes = r['nt_code'].values.tolist()
-         lengths.append(len(nt_codes))
-         full_lengths.append(len([ c for c in nt_codes if c != '-']))
+ def stats_len(mappings_list, points):
+ 
+     lengths = {}
+     full_lengths = {}
+     for f in sorted(mappings_list.keys()):
+         lengths[f] = []
+         full_lengths[f] = []
+         for r in points:
+             if r.family != f: continue
+             nt_codes = r.df['nt_code'].values.tolist()
+             lengths[f].append(len(nt_codes))
+             full_lengths[f].append(len([ c for c in nt_codes if c != '-']))
+ 
+     # then for all families
+     lengths["all"] = []
+     full_lengths["all"] = []
+     for r in points:
+         nt_codes = r.df['nt_code'].values.tolist()
+         lengths["all"].append(len(nt_codes))
+         full_lengths["all"].append(len([ c for c in nt_codes if c != '-']))
+     dlengths = pd.DataFrame.from_dict(lengths, orient='index').transpose().drop(["all"], axis='columns').dropna(axis='columns', thresh=2)
+     dfulllengths = pd.DataFrame.from_dict(full_lengths, orient='index').transpose().drop(["all"], axis='columns').dropna(axis='columns', thresh=2)
+     print(dlengths.head())
+ 
+ 
+     axs = dlengths.plot.hist(figsize=(10, 15), bins=range(0,650,50), sharex=True, sharey=True, subplots=True, layout=(12,6), legend=False, log=True)
+     # for ax, f in zip(axs, sorted(mappings_list.keys())):
+     #     ax.text(600,150, str(len([ x for x in lengths[f] if x != np.NaN ])), fontsize=14)
+     plt.savefig("results/length_distribs.png")
+ 
+     axs = dfulllengths.plot.hist(figsize=(10, 15), bins=range(0,650,50), sharex=True, sharey=True, subplots=True, layout=(12,6), legend=False, log=True)
+     # for ax, f in zip(axs, sorted(mappings_list.keys())):
+     #     ax.text(600,150, str(len([ x for x in lengths[f] if x != np.NaN ])), fontsize=14)
+     plt.savefig("results/full_length_distribs.png")
+ 
+ def seq_idty(mappings_list):
+     idty_matrix = {}
+     dm = DistanceCalculator('identity')
+     for f in mappings_list.keys():
+         with open(path_to_seq_data+"realigned/"+f+"++.stk") as al_file:
+             al = AlignIO.read(al_file, "stockholm")
+         idty_matrix[f] = dm.get_distance(al)
+ 
 
 
 
@@ -117,12 +200,18 @@ if __name__ == "__main__":
     #################################################################
     #               LOAD ALL FILES
     #################################################################
+ 
     print("Loading mappings list...")
-     mappings_list = pd.read_csv(path_to_seq_data + "realigned/mappings_list.csv", sep=',', index_col=0).to_dict()
+     mappings_list = pd.read_csv(path_to_seq_data + "realigned/mappings_list.csv", sep=',', index_col=0).to_dict(orient='list')
+     for k in mappings_list.keys():
+         mappings_list[k] = [ x for x in mappings_list[k] if str(x) != 'nan' ]
+     print(mappings_list)
+     exit()
+ 
 
     print("Loading datapoints from file...")
-     filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") if ".log" not in f and ".gz" not in f]
     rna_points = []
+     filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") if ".log" not in f and ".gz" not in f]
     p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=read_cpu_number())
     pbar = tqdm(total=len(filelist), desc="RNA files", position=0, leave=True)
     for i, rna in enumerate(p.imap_unordered(load_rna_frome_file, filelist)):
@@ -133,14 +222,34 @@ if __name__ == "__main__":
     p.join()
     npoints = len(rna_points)
     print(npoints, "RNA files loaded.")
+     exit()
 
     #################################################################
     #               Define threads for the tasks
     #################################################################
-     wadley_thr = th.Thread(target=reproduce_wadley_results, args=[rna_points])
- 
- 
-     wadley_thr.start()
-     wadley_thr.join()
+     # wadley_thr = []
+     # wadley_thr.append(th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 1, 'filter_helical': "zone"}))
+     # wadley_thr.append(th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 1, 'filter_helical': "form"}))
+     # wadley_thr.append(th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 1, 'filter_helical': "both"}))
+     # wadley_thr.append(th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 4, 'filter_helical': "form"}))
+     # wadley_thr.append(th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 4, 'filter_helical': "form"}))
+     # wadley_thr.append(th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 4, 'filter_helical': "both"}))
+     # seq_len_thr = th.Thread(target=partial(stats_len, mappings_list), args=[rna_points])
+     # dist_thr = th.Thread(target=seq_idty, args=[mappings_list])
+ 
+     # for t in wadley_thr:
+     #     t.start()
+     # seq_len_thr.start()
+     # dist_thr.start()
+ 
+     # for t in wadley_thr:
+     #     t.join()
+     # seq_len_thr.join()
+     # dist_thr.join()
+ 
+ 
+     # reproduce_wadley_results(rna_points)
+     seq_idty(mappings_list)
+     # stats_len(mappings_list, rna_points)
 
     
\ No newline at end of file