statistics.py
5.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/python3.8
import os
import numpy as np
import pandas as pd
import threading as th
import scipy.stats as st
import matplotlib.pyplot as plt
import matplotlib.patches as ptch
from mpl_toolkits.mplot3d import axes3d
from matplotlib import cm
from tqdm import tqdm
from multiprocessing import Pool
from RNAnet import read_cpu_number
if os.path.isdir("/home/ubuntu/"): # this is the IFB-core cloud
path_to_3D_data = "/mnt/Data/RNA/3D/"
path_to_seq_data = "/mnt/Data/RNA/sequences/"
elif os.path.isdir("/home/persalteas"): # this is my personal workstation
path_to_3D_data = "/home/persalteas/Data/RNA/3D/"
path_to_seq_data = "/home/persalteas/Data/RNA/sequences/"
elif os.path.isdir("/home/lbecquey"): # this is the IBISC server
path_to_3D_data = "/home/lbecquey/Data/RNA/3D/"
path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/"
elif os.path.isdir("/nhome/siniac/lbecquey"): # this is the office PC
path_to_3D_data = "/nhome/siniac/lbecquey/Data/RNA/3D/"
path_to_seq_data = "/nhome/siniac/lbecquey/Data/RNA/sequences/"
else:
print("I don't know that machine... I'm shy, maybe you should introduce yourself ?")
exit(1)
def load_rna_frome_file(path_to_textfile):
return pd.read_csv(path_to_textfile, sep=',', header=0, engine="c", index_col=0)
def reproduce_wadley_results(dfs, show=True):
all_etas = []
all_thetas = []
all_forms = []
c = 0
for df in dfs:
all_etas += list(df['eta'].values)
all_thetas += list(df['theta'].values)
all_forms += list(df['form'].values)
if (len([ x for x in df['eta'].values if x < 0 or x > 7]) or
len([ x for x in df['theta'].values if x < 0 or x > 7])):
c += 1
print(c,"points on",len(dfs),"have non-radian angles !")
print("combining etas and thetas...")
# # increase all the angles by 180°
# alldata = [ ((e+360)%360-180, (t+360)%360-180)
# for e, t in zip(all_etas, all_thetas)
# if ('nan' not in str((e,t)))
# and not(e<-150 and t<-110) and not (e>160 and t<-110) ]
alldata = [ (e, t)
for e, t, f in zip(all_etas, all_thetas, all_forms)
if ('nan' not in str((e,t)))
and f == '.' ]
print(len(alldata), "couples of non-helical nts found.")
x = np.array([ p[0] for p in alldata ])
y = np.array([ p[1] for p in alldata ])
xmin, xmax = min(x), max(x)
ymin, ymax = min(y), max(y)
xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([xx.ravel(), yy.ravel()])
values = np.vstack([x, y])
kernel = st.gaussian_kde(values)
f = np.reshape(kernel(positions).T, xx.shape)
sign_threshold = np.mean(f) + np.std(f)
z = np.where(f < sign_threshold, 0.0, f)
z_inc = np.where(f < sign_threshold, sign_threshold, f)
# histogram :
fig, axs = plt.subplots(1,3, figsize=(18, 6))
ax = fig.add_subplot(131)
plt.axhline(y=0, alpha=0.5, color='black')
plt.axvline(x=0, alpha=0.5, color='black')
plt.scatter(x, y, s=1, alpha=0.1)
plt.contourf(xx, yy, z, cmap=cm.BuPu, alpha=0.5)
ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$")
ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$")
# ax.add_patch(ptch.Rectangle((-20,0),50,70, linewidth=1, edgecolor='r', facecolor='#ff000080'))
ax = fig.add_subplot(132, projection='3d')
ax.plot_surface(xx, yy, z_inc, cmap=cm.coolwarm, linewidth=0, antialiased=True)
ax.set_title("\"Wadley plot\"\n$\\eta'$, $\\theta'$ pseudotorsions in 3D RNA structures\n(Massive peak removed in the red zone, = double helices)")
ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$")
ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$")
ax = fig.add_subplot(133, projection='3d')
hist, xedges, yedges = np.histogram2d(x, y, bins=300, range=[[xmin, xmax], [ymin, ymax]])
xpos, ypos = np.meshgrid(xedges[:-1], yedges[:-1], indexing="ij")
ax.bar3d(xpos.ravel(), ypos.ravel(), 0, 0.5, 0.5, hist.ravel(), zsort='average')
ax.set_xlabel("$\\eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$")
ax.set_ylabel("$\\theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$")
plt.savefig("results/clusters_rot180.png")
if show:
plt.show()
def stats_len(dfs):
lengths = []
full_lengths = []
for r in dfs:
nt_codes = r['nt_code'].values.tolist()
lengths.append(len(nt_codes))
full_lengths.append(len([ c for c in nt_codes if c != '-']))
if __name__ == "__main__":
#TODO: compute nt frequencies, chain lengths
#################################################################
# LOAD ALL FILES
#################################################################
print("Loading mappings list...")
mappings_list = pd.read_csv(path_to_seq_data + "realigned/mappings_list.csv", sep=',', index_col=0).to_dict()
print("Loading datapoints from file...")
filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") if ".log" not in f and ".gz" not in f]
rna_points = []
p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=read_cpu_number())
pbar = tqdm(total=len(filelist), desc="RNA files", position=0, leave=True)
for i, rna in enumerate(p.imap_unordered(load_rna_frome_file, filelist)):
rna_points.append(rna)
pbar.update(1)
pbar.close()
p.close()
p.join()
npoints = len(rna_points)
print(npoints, "RNA files loaded.")
#################################################################
# Define threads for the tasks
#################################################################
wadley_thr = th.Thread(target=reproduce_wadley_results, args=[rna_points])
wadley_thr.start()
wadley_thr.join()