Louis BECQUEY

Mapping inference from BGSU lists

This diff is collapsed. Click to expand it.

448 KB | W: | H:

399 KB | W: | H:

  • 2-up
  • Swipe
  • Onion skin
...@@ -2,12 +2,15 @@ ...@@ -2,12 +2,15 @@
2 import os 2 import os
3 import numpy as np 3 import numpy as np
4 import pandas as pd 4 import pandas as pd
5 +import threading as th
5 import scipy.stats as st 6 import scipy.stats as st
6 import matplotlib.pyplot as plt 7 import matplotlib.pyplot as plt
7 import matplotlib.patches as ptch 8 import matplotlib.patches as ptch
8 from mpl_toolkits.mplot3d import axes3d 9 from mpl_toolkits.mplot3d import axes3d
9 from matplotlib import cm 10 from matplotlib import cm
10 from tqdm import tqdm 11 from tqdm import tqdm
12 +from multiprocessing import Pool
13 +from RNAnet import read_cpu_number
11 14
12 15
13 if os.path.isdir("/home/ubuntu/"): # this is the IFB-core cloud 16 if os.path.isdir("/home/ubuntu/"): # this is the IFB-core cloud
...@@ -26,27 +29,35 @@ else: ...@@ -26,27 +29,35 @@ else:
26 print("I don't know that machine... I'm shy, maybe you should introduce yourself ?") 29 print("I don't know that machine... I'm shy, maybe you should introduce yourself ?")
27 exit(1) 30 exit(1)
28 31
29 -if __name__ == "__main__": 32 +def load_rna_frome_file(path_to_textfile):
30 - 33 + return pd.read_csv(path_to_textfile, sep=',', header=0, engine="c", index_col=0)
31 - #TODO: compute nt frequencies, chain lengths
32 34
33 - print("loading CSV files...") 35 +def reproduce_wadley_results(dfs, show=True):
34 - rna_points = []
35 all_etas = [] 36 all_etas = []
36 all_thetas = [] 37 all_thetas = []
37 - for csvfile in tqdm(os.listdir(path_to_3D_data + "pseudotorsions")): 38 + all_forms = []
38 - df = pd.read_csv(path_to_3D_data + "pseudotorsions/" + csvfile).drop('Unnamed: 0', axis=1) 39 + c = 0
40 + for df in dfs:
39 all_etas += list(df['eta'].values) 41 all_etas += list(df['eta'].values)
40 all_thetas += list(df['theta'].values) 42 all_thetas += list(df['theta'].values)
41 - rna_points.append(df) 43 + all_forms += list(df['form'].values)
44 + if (len([ x for x in df['eta'].values if x < 0 or x > 7]) or
45 + len([ x for x in df['theta'].values if x < 0 or x > 7])):
46 + c += 1
47 + print(c,"points on",len(dfs),"have non-radian angles !")
48 +
42 49
43 print("combining etas and thetas...") 50 print("combining etas and thetas...")
44 - # increase all the angles by 180° 51 + # # increase all the angles by 180°
45 - alldata = [ ((e+360)%360-180, (t+360)%360-180) 52 + # alldata = [ ((e+360)%360-180, (t+360)%360-180)
46 - for e, t in zip(all_etas, all_thetas) 53 + # for e, t in zip(all_etas, all_thetas)
54 + # if ('nan' not in str((e,t)))
55 + # and not(e<-150 and t<-110) and not (e>160 and t<-110) ]
56 + alldata = [ (e, t)
57 + for e, t, f in zip(all_etas, all_thetas, all_forms)
47 if ('nan' not in str((e,t))) 58 if ('nan' not in str((e,t)))
48 - and not(e<-150 and t<-110) and not (e>160 and t<-110) ] 59 + and f == '.' ]
49 - print(len(alldata), "couples of nts found.") 60 + print(len(alldata), "couples of non-helical nts found.")
50 61
51 x = np.array([ p[0] for p in alldata ]) 62 x = np.array([ p[0] for p in alldata ])
52 y = np.array([ p[1] for p in alldata ]) 63 y = np.array([ p[1] for p in alldata ])
...@@ -71,7 +82,7 @@ if __name__ == "__main__": ...@@ -71,7 +82,7 @@ if __name__ == "__main__":
71 plt.contourf(xx, yy, z, cmap=cm.BuPu, alpha=0.5) 82 plt.contourf(xx, yy, z, cmap=cm.BuPu, alpha=0.5)
72 ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$") 83 ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$")
73 ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$") 84 ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$")
74 - ax.add_patch(ptch.Rectangle((-20,0),50,70, linewidth=1, edgecolor='r', facecolor='#ff000080')) 85 + # ax.add_patch(ptch.Rectangle((-20,0),50,70, linewidth=1, edgecolor='r', facecolor='#ff000080'))
75 86
76 ax = fig.add_subplot(132, projection='3d') 87 ax = fig.add_subplot(132, projection='3d')
77 ax.plot_surface(xx, yy, z_inc, cmap=cm.coolwarm, linewidth=0, antialiased=True) 88 ax.plot_surface(xx, yy, z_inc, cmap=cm.coolwarm, linewidth=0, antialiased=True)
...@@ -86,4 +97,50 @@ if __name__ == "__main__": ...@@ -86,4 +97,50 @@ if __name__ == "__main__":
86 ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$") 97 ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$")
87 ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$") 98 ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$")
88 plt.savefig("results/clusters_rot180.png") 99 plt.savefig("results/clusters_rot180.png")
89 - plt.show() 100 + if show:
101 + plt.show()
102 +
103 +def stats_len(dfs):
104 + lengths = []
105 + full_lengths = []
106 + for r in dfs:
107 + nt_codes = r['nt_code'].values.tolist()
108 + lengths.append(len(nt_codes))
109 + full_lengths.append(len([ c for c in nt_codes if c != '-']))
110 +
111 +
112 +
113 +if __name__ == "__main__":
114 +
115 + #TODO: compute nt frequencies, chain lengths
116 +
117 + #################################################################
118 + # LOAD ALL FILES
119 + #################################################################
120 + print("Loading mappings list...")
121 + mappings_list = pd.read_csv(path_to_seq_data + "realigned/mappings_list.csv", sep=',', index_col=0).to_dict()
122 +
123 + print("Loading datapoints from file...")
124 + filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") if ".log" not in f and ".gz" not in f]
125 + rna_points = []
126 + p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=read_cpu_number())
127 + pbar = tqdm(total=len(filelist), desc="RNA files", position=0, leave=True)
128 + for i, rna in enumerate(p.imap_unordered(load_rna_frome_file, filelist)):
129 + rna_points.append(rna)
130 + pbar.update(1)
131 + pbar.close()
132 + p.close()
133 + p.join()
134 + npoints = len(rna_points)
135 + print(npoints, "RNA files loaded.")
136 +
137 + #################################################################
138 + # Define threads for the tasks
139 + #################################################################
140 + wadley_thr = th.Thread(target=reproduce_wadley_results, args=[rna_points])
141 +
142 +
143 + wadley_thr.start()
144 + wadley_thr.join()
145 +
146 +
...\ No newline at end of file ...\ No newline at end of file
......