Showing
3 changed files
with
72 additions
and
15 deletions
This diff is collapsed. Click to expand it.
... | @@ -2,12 +2,15 @@ | ... | @@ -2,12 +2,15 @@ |
2 | import os | 2 | import os |
3 | import numpy as np | 3 | import numpy as np |
4 | import pandas as pd | 4 | import pandas as pd |
5 | +import threading as th | ||
5 | import scipy.stats as st | 6 | import scipy.stats as st |
6 | import matplotlib.pyplot as plt | 7 | import matplotlib.pyplot as plt |
7 | import matplotlib.patches as ptch | 8 | import matplotlib.patches as ptch |
8 | from mpl_toolkits.mplot3d import axes3d | 9 | from mpl_toolkits.mplot3d import axes3d |
9 | from matplotlib import cm | 10 | from matplotlib import cm |
10 | from tqdm import tqdm | 11 | from tqdm import tqdm |
12 | +from multiprocessing import Pool | ||
13 | +from RNAnet import read_cpu_number | ||
11 | 14 | ||
12 | 15 | ||
13 | if os.path.isdir("/home/ubuntu/"): # this is the IFB-core cloud | 16 | if os.path.isdir("/home/ubuntu/"): # this is the IFB-core cloud |
... | @@ -26,27 +29,35 @@ else: | ... | @@ -26,27 +29,35 @@ else: |
26 | print("I don't know that machine... I'm shy, maybe you should introduce yourself ?") | 29 | print("I don't know that machine... I'm shy, maybe you should introduce yourself ?") |
27 | exit(1) | 30 | exit(1) |
28 | 31 | ||
29 | -if __name__ == "__main__": | 32 | +def load_rna_frome_file(path_to_textfile): |
30 | - | 33 | + return pd.read_csv(path_to_textfile, sep=',', header=0, engine="c", index_col=0) |
31 | - #TODO: compute nt frequencies, chain lengths | ||
32 | 34 | ||
33 | - print("loading CSV files...") | 35 | +def reproduce_wadley_results(dfs, show=True): |
34 | - rna_points = [] | ||
35 | all_etas = [] | 36 | all_etas = [] |
36 | all_thetas = [] | 37 | all_thetas = [] |
37 | - for csvfile in tqdm(os.listdir(path_to_3D_data + "pseudotorsions")): | 38 | + all_forms = [] |
38 | - df = pd.read_csv(path_to_3D_data + "pseudotorsions/" + csvfile).drop('Unnamed: 0', axis=1) | 39 | + c = 0 |
40 | + for df in dfs: | ||
39 | all_etas += list(df['eta'].values) | 41 | all_etas += list(df['eta'].values) |
40 | all_thetas += list(df['theta'].values) | 42 | all_thetas += list(df['theta'].values) |
41 | - rna_points.append(df) | 43 | + all_forms += list(df['form'].values) |
44 | + if (len([ x for x in df['eta'].values if x < 0 or x > 7]) or | ||
45 | + len([ x for x in df['theta'].values if x < 0 or x > 7])): | ||
46 | + c += 1 | ||
47 | + print(c,"points on",len(dfs),"have non-radian angles !") | ||
48 | + | ||
42 | 49 | ||
43 | print("combining etas and thetas...") | 50 | print("combining etas and thetas...") |
44 | - # increase all the angles by 180° | 51 | + # # increase all the angles by 180° |
45 | - alldata = [ ((e+360)%360-180, (t+360)%360-180) | 52 | + # alldata = [ ((e+360)%360-180, (t+360)%360-180) |
46 | - for e, t in zip(all_etas, all_thetas) | 53 | + # for e, t in zip(all_etas, all_thetas) |
54 | + # if ('nan' not in str((e,t))) | ||
55 | + # and not(e<-150 and t<-110) and not (e>160 and t<-110) ] | ||
56 | + alldata = [ (e, t) | ||
57 | + for e, t, f in zip(all_etas, all_thetas, all_forms) | ||
47 | if ('nan' not in str((e,t))) | 58 | if ('nan' not in str((e,t))) |
48 | - and not(e<-150 and t<-110) and not (e>160 and t<-110) ] | 59 | + and f == '.' ] |
49 | - print(len(alldata), "couples of nts found.") | 60 | + print(len(alldata), "couples of non-helical nts found.") |
50 | 61 | ||
51 | x = np.array([ p[0] for p in alldata ]) | 62 | x = np.array([ p[0] for p in alldata ]) |
52 | y = np.array([ p[1] for p in alldata ]) | 63 | y = np.array([ p[1] for p in alldata ]) |
... | @@ -71,7 +82,7 @@ if __name__ == "__main__": | ... | @@ -71,7 +82,7 @@ if __name__ == "__main__": |
71 | plt.contourf(xx, yy, z, cmap=cm.BuPu, alpha=0.5) | 82 | plt.contourf(xx, yy, z, cmap=cm.BuPu, alpha=0.5) |
72 | ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$") | 83 | ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$") |
73 | ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$") | 84 | ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$") |
74 | - ax.add_patch(ptch.Rectangle((-20,0),50,70, linewidth=1, edgecolor='r', facecolor='#ff000080')) | 85 | + # ax.add_patch(ptch.Rectangle((-20,0),50,70, linewidth=1, edgecolor='r', facecolor='#ff000080')) |
75 | 86 | ||
76 | ax = fig.add_subplot(132, projection='3d') | 87 | ax = fig.add_subplot(132, projection='3d') |
77 | ax.plot_surface(xx, yy, z_inc, cmap=cm.coolwarm, linewidth=0, antialiased=True) | 88 | ax.plot_surface(xx, yy, z_inc, cmap=cm.coolwarm, linewidth=0, antialiased=True) |
... | @@ -86,4 +97,50 @@ if __name__ == "__main__": | ... | @@ -86,4 +97,50 @@ if __name__ == "__main__": |
86 | ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$") | 97 | ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$") |
87 | ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$") | 98 | ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$") |
88 | plt.savefig("results/clusters_rot180.png") | 99 | plt.savefig("results/clusters_rot180.png") |
89 | - plt.show() | 100 | + if show: |
101 | + plt.show() | ||
102 | + | ||
103 | +def stats_len(dfs): | ||
104 | + lengths = [] | ||
105 | + full_lengths = [] | ||
106 | + for r in dfs: | ||
107 | + nt_codes = r['nt_code'].values.tolist() | ||
108 | + lengths.append(len(nt_codes)) | ||
109 | + full_lengths.append(len([ c for c in nt_codes if c != '-'])) | ||
110 | + | ||
111 | + | ||
112 | + | ||
113 | +if __name__ == "__main__": | ||
114 | + | ||
115 | + #TODO: compute nt frequencies, chain lengths | ||
116 | + | ||
117 | + ################################################################# | ||
118 | + # LOAD ALL FILES | ||
119 | + ################################################################# | ||
120 | + print("Loading mappings list...") | ||
121 | + mappings_list = pd.read_csv(path_to_seq_data + "realigned/mappings_list.csv", sep=',', index_col=0).to_dict() | ||
122 | + | ||
123 | + print("Loading datapoints from file...") | ||
124 | + filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") if ".log" not in f and ".gz" not in f] | ||
125 | + rna_points = [] | ||
126 | + p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=read_cpu_number()) | ||
127 | + pbar = tqdm(total=len(filelist), desc="RNA files", position=0, leave=True) | ||
128 | + for i, rna in enumerate(p.imap_unordered(load_rna_frome_file, filelist)): | ||
129 | + rna_points.append(rna) | ||
130 | + pbar.update(1) | ||
131 | + pbar.close() | ||
132 | + p.close() | ||
133 | + p.join() | ||
134 | + npoints = len(rna_points) | ||
135 | + print(npoints, "RNA files loaded.") | ||
136 | + | ||
137 | + ################################################################# | ||
138 | + # Define threads for the tasks | ||
139 | + ################################################################# | ||
140 | + wadley_thr = th.Thread(target=reproduce_wadley_results, args=[rna_points]) | ||
141 | + | ||
142 | + | ||
143 | + wadley_thr.start() | ||
144 | + wadley_thr.join() | ||
145 | + | ||
146 | + | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
-
Please register or login to post a comment