Connections between statistics.py and statistical_potential.py, plus some code modifications and optimizations.
Showing 2 changed files with 355 additions and 279 deletions.
1 | +#!/usr/bin/python3 | ||
2 | + | ||
3 | +# RNANet statistics | ||
4 | +# Developed by Aglaé Tabot, 2021 | ||
5 | + | ||
6 | +# This file computes statistical potentials over the produced dataset. | ||
7 | +# THIS FILE IS NOT SUPPOSED TO BE RUN DIRECTLY. | ||
1 | 8 | ||
2 | import getopt, os, pickle, sqlite3, shlex, subprocess, sys, warnings | 9 | import getopt, os, pickle, sqlite3, shlex, subprocess, sys, warnings |
3 | import time | 10 | import time |
... | @@ -13,88 +20,37 @@ from Bio.PDB.vectors import Vector, calc_angle, calc_dihedral | ... | @@ -13,88 +20,37 @@ from Bio.PDB.vectors import Vector, calc_angle, calc_dihedral |
13 | from multiprocessing import Pool, Manager | 20 | from multiprocessing import Pool, Manager |
14 | from os import path | 21 | from os import path |
15 | from tqdm import tqdm | 22 | from tqdm import tqdm |
23 | +from collections import Counter | ||
16 | from setproctitle import setproctitle | 24 | from setproctitle import setproctitle |
17 | -from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions | 25 | +from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_with_tqdm, trace_unhandled_exceptions |
18 | from sklearn.mixture import GaussianMixture | 26 | from sklearn.mixture import GaussianMixture |
27 | +import scipy.stats as st | ||
19 | import warnings | 28 | import warnings |
20 | from pandas.core.common import SettingWithCopyWarning | 29 | from pandas.core.common import SettingWithCopyWarning |
21 | from itertools import combinations_with_replacement | 30 | from itertools import combinations_with_replacement |
22 | from math import * | 31 | from math import * |
23 | -# from geometric_stats import get_euclidian_distance | 32 | +from geometric_stats import get_euclidian_distance, GMM_histo |
24 | 33 | ||
25 | np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8) | 34 | np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8) |
26 | 35 | ||
27 | runDir = os.getcwd() | 36 | runDir = os.getcwd() |
28 | 37 | ||
29 | - | 38 | +@trace_unhandled_exceptions |
30 | -def liste_repres(fpath): | 39 | +def pyle_measures_for_potentials(name, s, thr_idx): |
31 | - repres=[] | ||
32 | - df=pd.read_csv(os.path.abspath(fpath)) | ||
33 | - for i in range(df.shape[0]): | ||
34 | - up_name=df["representative"][i] | ||
35 | - if '+' in up_name: | ||
36 | - up_name=up_name.split('+') | ||
37 | - for i in range(len(up_name)): | ||
38 | - chain=up_name[i].split('|') | ||
39 | - chain=chain[0].lower()+'_'+chain[1]+'_'+chain[2] | ||
40 | - repres.append(chain+'.cif') | ||
41 | - else : | ||
42 | - up_name=up_name.split('|') | ||
43 | - low_name=up_name[0].lower()+'_'+up_name[1]+'_'+up_name[2] | ||
44 | - repres.append(low_name+'.cif') | ||
45 | - | ||
46 | - return repres | ||
47 | - | ||
48 | -def measure_from_structure(f): # ou alors on le lance à partir de statistics.py? | ||
49 | - """ | ||
50 | - Do geometric measures required on a given filename | ||
51 | - """ | ||
52 | - | ||
53 | - name = f.split('.')[0] | ||
54 | - | ||
55 | - global idxQueue | ||
56 | - thr_idx = idxQueue.get() | ||
57 | - setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measure_from_structure({f})") | ||
58 | - | ||
59 | - # Open the structure | ||
60 | - with warnings.catch_warnings(): | ||
61 | - # Ignore the PDB problems. This mostly warns that some chain is discontinuous. | ||
62 | - warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning) | ||
63 | - warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning) | ||
64 | - parser=MMCIFParser() | ||
65 | - s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f)) | ||
66 | - | ||
67 | - measures_heavy_atoms(name, s, thr_idx) | ||
68 | - if DO_WADLEY_ANALYSIS: | ||
69 | - pyle_measures(name, s, thr_idx) | ||
70 | - | ||
71 | - idxQueue.put(thr_idx) # replace the thread index in the queue | ||
72 | - setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
73 | - | ||
74 | -def pyle_measures(name, s, thr_idx): | ||
75 | # measure distances P-P, P-C1', P-C4', C1'-C1', C4'-C4' | 40 | # measure distances P-P, P-C1', P-C4', C1'-C1', C4'-C4' |
76 | - # between residues | 41 | + # between residues along the chain |
77 | - # along the chain | 42 | + # Requires a lot of storage space! |
78 | - if (path.isfile(runDir + '/results/geometry/Pyle/distances/distances_pyle_'+name+'.csv')): | 43 | + if (path.isfile(runDir + '/results/geometry/Pyle/distances_i_i+1/distances_pyle_i_i+1'+name+'.csv')): |
79 | return | 44 | return |
80 | 45 | ||
81 | liste_dist=[] | 46 | liste_dist=[] |
82 | - #classes=[] | 47 | + |
83 | - #for i in range(0, 150, 5): | ||
84 | - #classes.append([i, i+5]) | ||
85 | - #classes.append([150, 300]) | ||
86 | - #occur_p_p=len(classes)*[0] | ||
87 | - #occur_p_c1=len(classes)*[0] | ||
88 | - #occur_p_c4=len(classes)*[0] | ||
89 | - #occur_c1_c1=len(classes)*[0] | ||
90 | - #occur_c4_c4=len(classes)*[0] | ||
91 | - #nb_occurs=[] | ||
92 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} pyle_measures({name})") | 48 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} pyle_measures({name})") |
93 | 49 | ||
94 | chain = next(s[0].get_chains()) | 50 | chain = next(s[0].get_chains()) |
95 | - #residues=list(chain.get_residues()) | 51 | + |
96 | for res1 in tqdm(chain, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} pyle_measures", unit="res", leave=False): | 52 | for res1 in tqdm(chain, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} pyle_measures", unit="res", leave=False): |
97 | - #res1=chain[i] | 53 | + |
98 | if res1.get_resname() in ["A", "C", "G", "U"]: | 54 | if res1.get_resname() in ["A", "C", "G", "U"]: |
99 | resnum1=list(res1.get_id())[1] | 55 | resnum1=list(res1.get_id())[1] |
100 | atom_p_1 = [ atom.get_coord() for atom in res1 if atom.get_name() == "P"] | 56 | atom_p_1 = [ atom.get_coord() for atom in res1 if atom.get_name() == "P"] |
... | @@ -109,7 +65,7 @@ def pyle_measures(name, s, thr_idx): | ... | @@ -109,7 +65,7 @@ def pyle_measures(name, s, thr_idx): |
109 | p_c1p=np.nan | 65 | p_c1p=np.nan |
110 | c4p_c4p=np.nan | 66 | c4p_c4p=np.nan |
111 | c1p_c1p=np.nan | 67 | c1p_c1p=np.nan |
112 | - #res2=chain[j] | 68 | + |
113 | if res2.get_resname() in ["A", "C", "G", "U"]: | 69 | if res2.get_resname() in ["A", "C", "G", "U"]: |
114 | 70 | ||
115 | atom_p_2 = [ atom.get_coord() for atom in res2 if atom.get_name() == "P"] | 71 | atom_p_2 = [ atom.get_coord() for atom in res2 if atom.get_name() == "P"] |
... | @@ -123,94 +79,218 @@ def pyle_measures(name, s, thr_idx): | ... | @@ -123,94 +79,218 @@ def pyle_measures(name, s, thr_idx): |
123 | c1p_c1p= get_euclidian_distance(atom_c1p_1, atom_c1p_2) | 79 | c1p_c1p= get_euclidian_distance(atom_c1p_1, atom_c1p_2) |
124 | 80 | ||
125 | liste_dist.append([res1.get_resname(), int(resnum1), res2.get_resname(), int(resnum2), p_p, p_c4p, p_c1p, c4p_c4p, c1p_c1p]) | 81 | liste_dist.append([res1.get_resname(), int(resnum1), res2.get_resname(), int(resnum2), p_p, p_c4p, p_c1p, c4p_c4p, c1p_c1p]) |
126 | - ''' | ||
127 | - for x in range(len(classes)): | ||
128 | - if classes[x][0] <= p_p <= classes[x][1]: | ||
129 | - occur_p_p[x]=occur_p_p[x]+1 | ||
130 | - if classes[x][0] <= p_c4p <= classes[x][1]: | ||
131 | - occur_p_c4[x]=occur_p_c4[x]+1 | ||
132 | - if classes[x][0] <= p_c1p <= classes[x][1]: | ||
133 | - occur_p_c1[x]=occur_p_c1[x]+1 | ||
134 | - if classes[x][0] <= c4p_c4p <= classes[x][1]: | ||
135 | - occur_c4_c4[x]=occur_c4_c4[x]+1 | ||
136 | - if classes[x][0] <= c1p_c1p <= classes[x][1]: | ||
137 | - occur_c1_c1[x]=occur_c1_c1[x]+1 | ||
138 | - ''' | ||
139 | - #for x in range(len(classes)): | ||
140 | - # for i in range(len(liste_dist)): | ||
141 | - # if classes[x][0] <= liste_dist[i][4] <= classes[x][1]: | ||
142 | - # occur_p_p[x]=occur_p_p[x]+1 | ||
143 | - # if classes[x][0] <= liste_dist[i][5] <= classes[x][1]: | ||
144 | - # occur_p_c4[x]=occur_p_c4[x]+1 | ||
145 | - # if classes[x][0] <= liste_dist[i][6] <= classes[x][1]: | ||
146 | - # occur_p_c1[x]=occur_p_c1[x]+1 | ||
147 | - # if classes[x][0] <= liste_dist[i][7] <= classes[x][1]: | ||
148 | - # occur_c4_c4[x]=occur_c4_c4[x]+1 | ||
149 | - # if classes[x][0] <= liste_dist[i][8] <= classes[x][1]: | ||
150 | - # occur_c1_c1[x]=occur_c1_c1[x]+1 | ||
151 | - #nb_occurs.append([classes[x], occur_p_p[x], occur_p_c1[x], occur_p_c4[x], occur_c1_c1[x], occur_c4_c4[x]]) | ||
152 | - #df = pd.DataFrame(nb_occurs, columns=["classe", "P-P", "P-C1'", "P-C4'", "C1'-C1'", "C4'-C4'"]) | ||
153 | - # return df | ||
154 | - # nb_occurs.append([classes, occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4]) | ||
155 | - # print(nb_occurs) | ||
156 | - # return nb_occurs | ||
157 | - | ||
158 | 82 | ||
83 | + | ||
159 | df = pd.DataFrame(liste_dist, columns=["res1", "resnum1", "res2", "resnum2", "P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"]) | 84 | df = pd.DataFrame(liste_dist, columns=["res1", "resnum1", "res2", "resnum2", "P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"]) |
160 | - df.to_csv(runDir + "/results/geometry/Pyle/distances/distances_pyle_" + name + ".csv") | 85 | + df.to_csv(runDir + "/results/geometry/Pyle/distances_i_i+1/distances_pyle_i_i+1" + name + ".csv") |
161 | 86 | ||
@trace_unhandled_exceptions
def gmm_pyle_type(ntpair, data, scan):
    """
    Fit GMMs to the five Pyle-model distances (P-P, P-C4', P-C1', C4'-C4',
    C1'-C1') measured for one base-pair type, and save the combined plot.
    """
    setproctitle(f"GMM (Pyle {ntpair} )")

    out_dir = runDir + "/results/figures/GMM/Pyle/distances_i_i+1/"
    os.makedirs(out_dir, exist_ok=True)
    os.chdir(out_dir)

    # (CSV column, plot colour) for each measured distance type
    series = [
        ("P-P", "cyan"),
        ("P-C4'", "tomato"),
        ("P-C1'", "goldenrod"),
        ("C4'-C4'", "magenta"),
        ("C1'-C1'", "black"),
    ]
    for col_name, colour in series:
        # drop NaNs (pairs where one of the atoms was missing in the structure)
        values = list(data[col_name][~ np.isnan(data[col_name])])
        GMM_histo(values, f"Distance {col_name} between {ntpair}", scan, toric=False, hist=False, col=colour)

    plt.xlabel("Distance (Angströms)")

    plt.title(f"GMM of distances for {ntpair} ", fontsize=10)

    plt.savefig(out_dir + f"Distances_Pyle_{ntpair}.png" )
    plt.close()
    setproctitle(f"GMM (Pyle {ntpair} distances) finished")
@trace_unhandled_exceptions
def gmm_pyle_per_type(scan):
    """
    Run the Pyle-model distance GMMs for every ordered pair of nucleotide
    types (AA, AC, ..., UU) found in the aggregated distances CSV.
    """
    setproctitle("GMM (Pyle model)")

    data = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/distances_i_i+1/distances.csv"))

    if len(data):
        bases = ['A', 'C', 'G', 'U']
        for b1 in bases:
            for b2 in bases:
                subset = data[(data.res1 == b1) & (data.res2 == b2)]
                if len(subset):
                    gmm_pyle_type(b1 + b2, subset, scan)
    setproctitle("GMM (Pyle model) finished")
130 | +# The next 7 functions are used to calculate the statistical potentials with the averaging method (residue-averaging) | ||
131 | +# for the Pyle model from the GMM results, and to plot the corresponding figures | ||
132 | + | ||
133 | + | ||
def pgaussred(x):
    """
    Cumulative distribution function of the standard normal distribution
    (probability that a standard normal random variable is less than x).

    Computed by trapezoidal numerical integration of the density, with
    2000 slices per standard deviation; the result is rounded to 7 decimals.
    """
    if x == 0:
        return 0.5
    u = abs(x)
    # use at least one slice: very small |x| (< 5e-4) previously gave n == 0
    # and raised ZeroDivisionError on du = u / n
    n = max(1, int(u * 2000))
    du = u / n
    k = 1 / sqrt(2 * pi)
    u1 = 0
    f1 = k  # density at 0
    p = 0.5
    for _ in range(n):
        u2 = u1 + du
        f2 = k * exp(-0.5 * u2 * u2)
        p = p + (f1 + f2) * du * 0.5  # trapezoid rule on [u1, u2]
        u1 = u2
        f1 = f2
    if x < 0:
        p = 1.0 - p  # symmetry of the normal distribution
    return round(p, 7)
158 | + | ||
def proba(m, s, xinf, xsup):
    """
    Probability that a normal random variable with mean m and standard
    deviation s falls inside the interval [xinf, xsup].
    """
    upper = pgaussred((xsup - m) / s)
    lower = pgaussred((xinf - m) / s)
    return upper - lower
167 | + | ||
def extract_from_json(data, xinf, xsup):
    """
    Read the means and standard deviations of a fitted GMM from the file
    `<data>.json` and return the total probability that a value falls in
    [xinf, xsup], summed over all Gaussian components of the mixture.
    """
    with open(data + '.json') as json_data:
        gmm = json.load(json_data)

    total = 0
    for i in range(len(gmm['means'])):
        raw_mean = gmm['means'][i]
        # means may be serialized as "[x]" (1-D array) instead of a plain number
        if raw_mean[0] == '[':
            mean = float(raw_mean.split('[')[1].split(']')[0])
        else:
            mean = float(raw_mean)

        raw_std = gmm['std'][i]
        # stds may be serialized as "[[x]]" (1x1 covariance matrix)
        if raw_std[0] == '[':
            std = float(raw_std.split('[')[2].split(']')[0])
        else:
            std = float(raw_std)

        total += proba(mean, std, xinf, xsup)
    return total
196 | + | ||
def averaging(liste_data, name):
    """
    Build the reference distribution for the residue-averaging method.

    For each measure name in `liste_data`, read its GMM parameters from
    results/geometry/json/<measure>.json, collapse the mixture into a single
    weighted mean and weighted standard deviation, and write them all to
    results/statistical_potential/json/Pyle/avg_<name>.json.

    Note: the previous version also evaluated every component's pdf over a
    1000-point grid into an unused `curves` list; that dead computation was
    removed (the written JSON is unchanged).
    """
    summary_data = {"measures": [], "means": [], "std": []}

    for data in liste_data:
        mean_tot = 0
        std_tot = 0
        with open(runDir + '/results/geometry/json/' + data + '.json') as json_data:
            data_dict = json.load(json_data)

        # weight-average the component means and stds of the mixture
        for i in range(len(data_dict['means'])):
            mean = float(data_dict['means'][i])
            std = float(data_dict['std'][i])
            weight = float(data_dict['weights'][i])
            mean_tot = mean_tot + (weight * mean)
            std_tot = std_tot + (weight * std)

        summary_data["measures"].append(data)
        summary_data["means"].append(str(mean_tot))
        summary_data["std"].append(str(std_tot))

    with open(runDir + '/results/statistical_potential/json/Pyle/avg_' + name + ".json", 'w', encoding='utf-8') as f:
        json.dump(summary_data, f, indent=4)
235 | + | ||
def potential_dist(data_obs, data_ref):
    """
    Compute the statistical potential -ln(Pobs/Pref) over 5 Å distance bins
    from 0 to 400 Å and save it as a CSV. Each bin is emitted twice (at both
    bin edges) so the curve later plots as a step function. Bins where either
    probability is zero get an undefined (None) energy.
    """
    name = data_obs.split('/')[-1]
    rows = []
    for xinf in range(0, 400, 5):
        xsup = xinf + 5
        obs = extract_from_json(data_obs, xinf, xsup)
        ref = extract_from_json(data_ref, xinf, xsup)
        if obs != 0.0 and ref != 0.0:
            energy = -log(obs / ref)
        else:
            energy = None
        rows.append([xinf, xsup, obs, ref, energy])
        rows.append([xsup, xsup, obs, ref, energy])
    table = pd.DataFrame(rows, columns=["Xinf", "Xsup", "Pobs", "Pref", "- ln(Pobs/Pref)"])
    table.to_csv(runDir + '/results/statistical_potential/ratio/Pyle/Statistical potential ' + name + ".csv")
252 | + | ||
def courbe_stat_pot(f):
    """
    Plot the statistical potential curve -ln(Pobs/Pref) against distance,
    from the CSV written by potential_dist(), and save the figure.

    Undefined energies (NaN, i.e. bins with zero probability) are replaced
    by the maximum observed energy so the curve stays defined everywhere.

    Changes: removed the unused `min_E`; replaced the manual per-row
    `df.iloc[i, 5] = ...` loop with a vectorized fillna on the column.
    """
    name = f.split('/')[-1]
    name = name.split('.')[0]
    df = pd.read_csv(f)
    E = list(df["- ln(Pobs/Pref)"][~ np.isnan(df["- ln(Pobs/Pref)"])])
    max_E = max(E)
    # replace the NaNs by the maximum found energy
    df["- ln(Pobs/Pref)"] = df["- ln(Pobs/Pref)"].fillna(max_E)

    x = np.array(df["Xinf"])
    y = np.array(df["- ln(Pobs/Pref)"])
    plt.plot(x, y)
    plt.xlabel("Distance (Angström)")
    plt.ylabel("- ln(Pobs/Pref)")
    plt.title(name)
    plt.savefig(runDir + "/results/statistical_potential/figures/Pyle/avg_statistical_pot_ " + name + ".png")
    plt.close()
276 | + | ||
def stat_potential_pyle():
    """
    Driver for the Pyle-model statistical potentials: for each of the five
    distance types, build the residue-averaged reference distribution over
    all 16 base pairs, then compute and plot every pair-specific potential
    against that reference.
    """
    distance_types = ["P-P", "P-C1'", "P-C4'", "C1'-C1'", "C4'-C4'"]
    bases = ["A", "C", "G", "U"]
    for pair in distance_types:
        measures = []
        for res1 in bases:
            for res2 in bases:
                setproctitle(f"RNANet statistics.py stat_potential_pyle({pair}, {res1}{res2})")
                measures.append(f"Distance {pair} between {res1}{res2}")
        # reference state: averaged distribution over the 16 pair types
        averaging(measures, f"{pair}")
        for dist in measures:
            potential_dist(runDir + '/results/geometry/json/' + dist, runDir + f'/results/statistical_potential/json/Pyle/avg_{pair}')
            courbe_stat_pot(runDir + '/results/statistical_potential/ratio/Pyle/Statistical potential ' + dist + '.csv')
        setproctitle(f"RNANet statistics.py stat_potential_pyle({pair}, {res1}{res2}) finished")
291 | + | ||
292 | + | ||
293 | +@trace_unhandled_exceptions | ||
214 | def measures_heavy_atoms(name, s, thr_idx): | 294 | def measures_heavy_atoms(name, s, thr_idx): |
215 | """ | 295 | """ |
216 | create a list of all possible pairs of atoms among the 85 types of atoms | 296 | create a list of all possible pairs of atoms among the 85 types of atoms |
... | @@ -312,21 +392,17 @@ def measures_heavy_atoms(name, s, thr_idx): | ... | @@ -312,21 +392,17 @@ def measures_heavy_atoms(name, s, thr_idx): |
312 | df_occur = pd.DataFrame(occurences, columns=["atom_pair_type"] + classes_str) | 392 | df_occur = pd.DataFrame(occurences, columns=["atom_pair_type"] + classes_str) |
313 | # save this | 393 | # save this |
314 | df_occur.to_csv(runDir + "/results/geometry/all-atoms/distances_classes/distances_classes_occur_" + name + ".csv") | 394 | df_occur.to_csv(runDir + "/results/geometry/all-atoms/distances_classes/distances_classes_occur_" + name + ".csv") |
315 | - | 395 | +@trace_unhandled_exceptions |
316 | def count_occur_atom_dist(fpath, outfilename): | 396 | def count_occur_atom_dist(fpath, outfilename): |
317 | """ | 397 | """ |
318 | After having calculated the number of occurrences of the distances between pairs of atoms | 398 | After having calculated the number of occurrences of the distances between pairs of atoms |
319 | sorted by distance class and by type of pair for each RNA chain, | 399 | sorted by distance class and by type of pair for each RNA chain, |
320 | we add these occurrences one by one to obtain a single dataframe with the total number of occurrences | 400 | we add these occurrences one by one to obtain a single dataframe with the total number of occurrences |
321 | """ | 401 | """ |
322 | - | 402 | + |
323 | - global idxQueue | 403 | + setproctitle(f"Addition of occurences of {fpath}") |
324 | - thr_idx = idxQueue.get() | ||
325 | - setproctitle(f"Worker {thr_idx+1} : Addition of occurences of {fpath}") | ||
326 | - | ||
327 | liste=os.listdir(fpath) | 404 | liste=os.listdir(fpath) |
328 | - | 405 | + |
329 | - pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing "+outfilename, leave=False) | ||
330 | df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop())) | 406 | df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop())) |
331 | 407 | ||
332 | for f in range(len(liste)): | 408 | for f in range(len(liste)): |
... | @@ -334,23 +410,20 @@ def count_occur_atom_dist(fpath, outfilename): | ... | @@ -334,23 +410,20 @@ def count_occur_atom_dist(fpath, outfilename): |
334 | for i in range(df.shape[0]): | 410 | for i in range(df.shape[0]): |
335 | for j in range(2, df.shape[1]): | 411 | for j in range(2, df.shape[1]): |
336 | df_tot.iloc[i, j]=df_tot.iloc[i, j] + df.iloc[i, j] | 412 | df_tot.iloc[i, j]=df_tot.iloc[i, j] + df.iloc[i, j] |
337 | - pbar.update(1) | 413 | + |
338 | df_tot.to_csv(fpath+outfilename) | 414 | df_tot.to_csv(fpath+outfilename) |
339 | - idxQueue.put(thr_idx) # replace the thread index in the queue | ||
340 | - setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
341 | 415 | ||
416 | + setproctitle(f"Addition of occurences of {fpath} finished") | ||
417 | + | ||
418 | +@trace_unhandled_exceptions | ||
342 | def mole_fraction(fpath, outfilename): | 419 | def mole_fraction(fpath, outfilename): |
343 | """ | 420 | """ |
344 | Calculation of the mole fraction of each type of atom within the set of structures | 421 | Calculation of the mole fraction of each type of atom within the set of structures |
345 | - """ | 422 | + """ |
346 | - | 423 | + setproctitle(f"Calculation of mole fractions of {fpath}") |
347 | - global idxQueue | ||
348 | - thr_idx = idxQueue.get() | ||
349 | - setproctitle(f"Worker {thr_idx+1} : Calculation of mole fractions of {fpath}") | ||
350 | - | ||
351 | liste=os.listdir(fpath) | 424 | liste=os.listdir(fpath) |
352 | 425 | ||
353 | - pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing "+outfilename, leave=False) | 426 | + |
354 | df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop())) | 427 | df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop())) |
355 | del df_tot["Unnamed: 0"] | 428 | del df_tot["Unnamed: 0"] |
356 | 429 | ||
... | @@ -359,31 +432,29 @@ def mole_fraction(fpath, outfilename): | ... | @@ -359,31 +432,29 @@ def mole_fraction(fpath, outfilename): |
359 | del df["Unnamed: 0"] | 432 | del df["Unnamed: 0"] |
360 | for i in range(df.shape[0]): | 433 | for i in range(df.shape[0]): |
361 | df_tot.iloc[i, 1]=df_tot.iloc[i, 1] + df.iloc[i, 1] | 434 | df_tot.iloc[i, 1]=df_tot.iloc[i, 1] + df.iloc[i, 1] |
362 | - pbar.update(1) | 435 | + |
363 | total=sum(list(df_tot["count"])) | 436 | total=sum(list(df_tot["count"])) |
364 | fract=[] | 437 | fract=[] |
365 | for i in range(df_tot.shape[0]): | 438 | for i in range(df_tot.shape[0]): |
366 | - # df_tot.iloc[i, 1]=df_tot.iloc[i, 1]/total | 439 | + |
367 | fract.append(df_tot.iloc[i, 1]/total) | 440 | fract.append(df_tot.iloc[i, 1]/total) |
368 | df_tot["mole_fraction"]=fract | 441 | df_tot["mole_fraction"]=fract |
369 | - file_list=os.listdir(fpath) | 442 | + # file_list=os.listdir(fpath) |
370 | - file_list=[fpath+x for x in file_list] | 443 | + # file_list=[fpath+x for x in file_list] |
371 | # for f in file_list: #after processing, deletion of csv by structure | 444 | # for f in file_list: #after processing, deletion of csv by structure |
372 | # os.remove(f) | 445 | # os.remove(f) |
373 | - df_tot.to_csv(fpath+outfilename) #ou alors juste un return | 446 | + df_tot.to_csv(fpath+outfilename) |
374 | - idxQueue.put(thr_idx) # replace the thread index in the queue | ||
375 | - setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
376 | 447 | ||
448 | + setproctitle(f"Calculation of mole fractions of {fpath} finished") | ||
449 | + | ||
450 | +@trace_unhandled_exceptions | ||
377 | def compute_ratio_from_csv(fpath, avg_file, qch_file): | 451 | def compute_ratio_from_csv(fpath, avg_file, qch_file): |
378 | """ | 452 | """ |
379 | Calculation of observed and reference probabilities | 453 | Calculation of observed and reference probabilities |
380 | according to the methods chosen to establish the reference state | 454 | according to the methods chosen to establish the reference state |
381 | Then calculation of the Pobs / Pref ratio | 455 | Then calculation of the Pobs / Pref ratio |
382 | - """ | 456 | + """ |
383 | - # global idxQueue | 457 | + setproctitle("Compute ratio from csv") |
384 | - # thr_idx = idxQueue.get() | ||
385 | - # setproctitle(f"Worker {thr_idx+1} : Compute ratio from {fpath}") | ||
386 | - | ||
387 | df = pd.read_csv(fpath) | 458 | df = pd.read_csv(fpath) |
388 | del df['Unnamed: 0'] | 459 | del df['Unnamed: 0'] |
389 | del df['Unnamed: 0.1'] | 460 | del df['Unnamed: 0.1'] |
... | @@ -401,9 +472,8 @@ def compute_ratio_from_csv(fpath, avg_file, qch_file): | ... | @@ -401,9 +472,8 @@ def compute_ratio_from_csv(fpath, avg_file, qch_file): |
401 | s_tot=sum(sommes) | 472 | s_tot=sum(sommes) |
402 | for s in sommes: | 473 | for s in sommes: |
403 | pref_avg_list.append(s/s_tot) | 474 | pref_avg_list.append(s/s_tot) |
404 | - print(sommes) | 475 | + |
405 | - # return | 476 | + |
406 | - # pbar = tqdm(total=df.shape[0], position=thr_idx, desc="Preparing "+avg_file, leave=False) | ||
407 | df_bis=df.copy() | 477 | df_bis=df.copy() |
408 | # if method=averaging | 478 | # if method=averaging |
409 | for i in range(df.shape[0]): | 479 | for i in range(df.shape[0]): |
... | @@ -412,14 +482,7 @@ def compute_ratio_from_csv(fpath, avg_file, qch_file): | ... | @@ -412,14 +482,7 @@ def compute_ratio_from_csv(fpath, avg_file, qch_file): |
412 | df_bis.iloc[i,j]=df.iloc[i,j]/(sum(df.iloc[i, k] for k in range(1, df.shape[1]))) | 482 | df_bis.iloc[i,j]=df.iloc[i,j]/(sum(df.iloc[i, k] for k in range(1, df.shape[1]))) |
413 | # ratio between the observed probability and the reference probability | 483 | # ratio between the observed probability and the reference probability |
414 | df_bis.iloc[i,j]=df_bis.iloc[i,j]/pref_avg_list[j-1] | 484 | df_bis.iloc[i,j]=df_bis.iloc[i,j]/pref_avg_list[j-1] |
415 | - # pbar.update(1) | 485 | + |
416 | - # idxQueue.put(thr_idx) # replace the thread index in the queue | ||
417 | - # setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
418 | - print(df_bis) | ||
419 | - | ||
420 | - # thr_idx = idxQueue.get() | ||
421 | - # setproctitle(f"Worker {thr_idx+1} : Compute ratio from {fpath}") | ||
422 | - # pbar = tqdm(total=df.shape[0], position=thr_idx, desc="Preparing "+qch_file, leave=False) | ||
423 | 486 | ||
424 | # if method=quasi-chemical | 487 | # if method=quasi-chemical |
425 | df_ter=df.copy() | 488 | df_ter=df.copy() |
... | @@ -430,20 +493,18 @@ def compute_ratio_from_csv(fpath, avg_file, qch_file): | ... | @@ -430,20 +493,18 @@ def compute_ratio_from_csv(fpath, avg_file, qch_file): |
430 | # calculate the mole fractions of the atom corresponding to the pair in row i | 493 | # calculate the mole fractions of the atom corresponding to the pair in row i |
431 | if atom_count.at[k, 'atom']==df.iloc[i, 0].split('-')[0] or atom_count.at[k, 'atom']==df.iloc[i, 0].split('-')[1]: | 494 | if atom_count.at[k, 'atom']==df.iloc[i, 0].split('-')[0] or atom_count.at[k, 'atom']==df.iloc[i, 0].split('-')[1]: |
432 | x.append(atom_count.at[k, 'mole_fraction']) | 495 | x.append(atom_count.at[k, 'mole_fraction']) |
433 | - # print(x) | 496 | + |
434 | for j in range(1, df.shape[1]): | 497 | for j in range(1, df.shape[1]): |
435 | if len(x)==2: | 498 | if len(x)==2: |
436 | df_ter.iloc[i, j]=df.iloc[i,j]/(x[0]*x[1]*sommes[j-1]) # ratio for qchA method (Nijobs(r)/xi*xj*Nobs(r)) | 499 | df_ter.iloc[i, j]=df.iloc[i,j]/(x[0]*x[1]*sommes[j-1]) # ratio for qchA method (Nijobs(r)/xi*xj*Nobs(r)) |
437 | if len(x)==1: | 500 | if len(x)==1: |
438 | df_ter.iloc[i, j]=df.iloc[i,j]/(x[0]*x[0]*sommes[j-1]) | 501 | df_ter.iloc[i, j]=df.iloc[i,j]/(x[0]*x[0]*sommes[j-1]) |
439 | - # pbar.update(1) | 502 | + df_bis.to_csv(runDir + '/results/statistical_potential/ratio/all-atoms/' + avg_file) |
440 | - # idxQueue.put(thr_idx) # replace the thread index in the queue | 503 | + df_ter.to_csv(runDir + '/results/statistical_potential/ratio/all-atoms/' + qch_file) |
441 | - # setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
442 | - print(df_ter) | ||
443 | - # ajouter ce qui est dans la fct save_into_database ici? | ||
444 | - df_bis.to_csv(runDir + '/results/statistical_potential/all-atoms/' + avg_file) | ||
445 | - df_ter.to_csv(runDir + '/results/statistical_potential/all-atoms/' + qch_file) | ||
446 | 504 | ||
505 | + setproctitle("Compute ratio from csv finished") | ||
506 | + | ||
507 | +@trace_unhandled_exceptions | ||
447 | def sql_new_table(conn): | 508 | def sql_new_table(conn): |
448 | cur = conn.cursor() | 509 | cur = conn.cursor() |
449 | sql = """ CREATE TABLE IF NOT EXISTS all_atoms ( | 510 | sql = """ CREATE TABLE IF NOT EXISTS all_atoms ( |
... | @@ -482,10 +543,11 @@ def sql_new_table(conn): | ... | @@ -482,10 +543,11 @@ def sql_new_table(conn): |
482 | 543 | ||
483 | conn.execute("pragma journal_mode=wal") | 544 | conn.execute("pragma journal_mode=wal") |
484 | #conn.close() | 545 | #conn.close() |
485 | - | 546 | +@trace_unhandled_exceptions |
486 | def save_into_database(): | 547 | def save_into_database(): |
487 | - df_avg = pd.read_csv(runDir + '/results/statistical_potential/all-atoms/avg_ratio_pobs_pref.csv') | 548 | + setproctitle("Saving statistical potentials(avg, qch) into database") |
488 | - df_qch = pd.read_csv(runDir + '/results/statistical_potential/all-atoms/qch_ratio_pobs_pref.csv') | 549 | + df_avg = pd.read_csv(runDir + '/results/statistical_potential/ratio/all-atoms/avg_ratio_pobs_pref.csv') |
550 | + df_qch = pd.read_csv(runDir + '/results/statistical_potential/ratio/all-atoms/qch_ratio_pobs_pref.csv') | ||
489 | ratio_list=[] | 551 | ratio_list=[] |
490 | del df_avg['Unnamed: 0'] | 552 | del df_avg['Unnamed: 0'] |
491 | del df_qch['Unnamed: 0'] | 553 | del df_qch['Unnamed: 0'] |
... | @@ -495,80 +557,57 @@ def save_into_database(): | ... | @@ -495,80 +557,57 @@ def save_into_database(): |
495 | ratio_list.append([df_avg.iloc[i,0], df_avg.columns[j], df_avg.iloc[i, j], df_qch.iloc[i,j]]) | 557 | ratio_list.append([df_avg.iloc[i,0], df_avg.columns[j], df_avg.iloc[i, j], df_qch.iloc[i,j]]) |
496 | 558 | ||
497 | 559 | ||
498 | - with sqlite3.connect(runDir + "/results/Stats.db", timeout=20.0) as conn: | 560 | + with sqlite3.connect(runDir + "/results/Stat_potential.db", timeout=20.0) as conn: |
499 | conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | 561 | conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query |
500 | # We use the REPLACE keyword to get the latest information | 562 | # We use the REPLACE keyword to get the latest information |
501 | sql_execute(conn, """INSERT OR REPLACE INTO all_atoms (atom_pair, distance_bin, avg_ratio_pobs_pref, qch_ratio_pobs_pref ) | 563 | sql_execute(conn, """INSERT OR REPLACE INTO all_atoms (atom_pair, distance_bin, avg_ratio_pobs_pref, qch_ratio_pobs_pref ) |
502 | VALUES (?, ?, ?, ?);""", | 564 | VALUES (?, ?, ?, ?);""", |
503 | many=True, | 565 | many=True, |
504 | data=ratio_list | 566 | data=ratio_list |
505 | - ) | 567 | + ) |
506 | - | 568 | + setproctitle("Saving statistical potentials(avg, qch) into database finished") |
507 | -def stat_potential(pair, f, method): | 569 | +@trace_unhandled_exceptions |
570 | +def stat_potential(f, method): | ||
508 | df=pd.read_csv(f) | 571 | df=pd.read_csv(f) |
509 | del df['Unnamed: 0'] | 572 | del df['Unnamed: 0'] |
510 | df.set_index('atom_pair_type', inplace=True) | 573 | df.set_index('atom_pair_type', inplace=True) |
511 | df=df.T | 574 | df=df.T |
512 | - c=df[pair].tolist() | 575 | + setproctitle(f"RNANet statistics.py stat_potential({method})") |
513 | - new=[] | 576 | + for pair in df.columns: |
514 | - for x in c: | 577 | + c=df[pair].tolist() |
515 | - if x!=0.0 and not np.isnan(x): | 578 | + new=[] |
516 | - new.append(-log(x)) | 579 | + for x in c: |
517 | - new.append(-log(x)) | 580 | + if x!=0.0 and not np.isnan(x): |
518 | - if x==0.0: | 581 | + new.append(-log(x)) |
519 | - new.append(0.0) | 582 | + new.append(-log(x)) |
520 | - new.append(0.0) | 583 | + if x==0.0: |
521 | - if np.isnan(x): | 584 | + new.append(0.0) |
522 | - new.append(0.0) | 585 | + new.append(0.0) |
523 | - new.append(0.0) | 586 | + if np.isnan(x): |
524 | - abs=[] | 587 | + new.append(0.0) |
525 | - | 588 | + new.append(0.0) |
526 | - for i in range(0, 150, 5): | 589 | + abs=[] |
527 | - abs.append(i) | 590 | + |
528 | - abs.append(i+5) | 591 | + for i in range(0, 150, 5): |
529 | - abs.append(150) | 592 | + abs.append(i) |
530 | - abs.append(300) | 593 | + abs.append(i+5) |
531 | - print(new) | 594 | + abs.append(150) |
532 | - print(abs) | 595 | + abs.append(300) |
533 | - | 596 | + |
534 | - x=abs | 597 | + x=abs |
535 | - y=new | 598 | + y=new |
536 | - x=np.array(x) | 599 | + x=np.array(x) |
537 | - y=np.array(y) | 600 | + y=np.array(y) |
538 | - plt.plot(x, y) | 601 | + plt.plot(x, y) |
539 | - plt.xlabel('Distance') | 602 | + plt.xlabel('Distance') |
540 | - plt.ylabel("- ln(Pobs/Pref)") | 603 | + plt.ylabel("- ln(Pobs/Pref)") |
541 | - plt.title(f'Statistical potential of {pair} distance ({method} method)') | 604 | + plt.title(f'Statistical potential of {pair} distance ({method} method)') |
542 | - plt.savefig(f"/home/atabot/RNANet/results/statistical_potential/all-atoms/{method}_statistical_pot_{pair}.png") | 605 | + plt.savefig(runDir + f"/results/statistical_potential/figures/all-atoms/{method}_method/{method}_statistical_pot_{pair}.png") |
543 | - plt.close() | 606 | + plt.close() |
607 | + | ||
608 | + setproctitle(f"RNANet statistics.py stat_potential({method}) finished") | ||
544 | 609 | ||
545 | -# mettre en option le choix de la méthode? | ||
546 | 610 | ||
547 | 611 | ||
548 | if __name__ == "__main__": | 612 | if __name__ == "__main__": |
549 | - os.makedirs(runDir + '/results/statistical_potential/all-atoms/', exist_ok=True) | ||
550 | - # compute_ratio_from_csv('/home/atabot/RNANet/results/geometry/all-atoms/distances_classes/occur_dist_classes.csv', 'avg_ratio_pobs_pref.csv', 'qch_ratio_pobs_pref.csv') | ||
551 | - # exit(1) | ||
552 | - with sqlite3.connect(runDir + '/results/Stats.db') as conn: | ||
553 | - sql_new_table(conn) | ||
554 | - save_into_database() | ||
555 | - exit(1) | ||
556 | - | ||
557 | - # count_occur_atom_dist(runDir + '/results/geometry/all-atoms/distances_classes/', 'occur_dist_classes.csv') | ||
558 | - # os.makedirs(runDir + '/results/figures/all-atoms/distances/hist/', exist_ok=True) | ||
559 | - # histo_occur(runDir + '/results/geometry/all-atoms/distances_classes/occur_dist_classes.csv') | ||
560 | - # exit(1) | ||
561 | - mole_fraction(runDir + '/results/geometry/all-atoms/atom_count/', 'atom_count.csv') | ||
562 | - exit(1) | ||
563 | - # print(measure_from_structure(os.listdir(path_to_3D_data + "rna_only")[0])) | ||
564 | - if n_unmapped_chains: | ||
565 | - # os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True) | ||
566 | - os.makedirs(runDir+"/results/geometry/all-atoms/distances_classes/", exist_ok=True) | ||
567 | - os.makedirs(runDir+"/results/geometry/all-atoms/atom_count/", exist_ok=True) | ||
568 | - # liste_struct = os.listdir(path_to_3D_data + "rna_only") | ||
569 | - liste_struct=liste_repres('/home/data/RNA/3D/latest_nr_list_4.0A.csv')[:100] | ||
570 | - # measure_from_structure(liste_struct[0]) | ||
571 | - # exit(1) | ||
572 | - #measure_from_structure('5e81_1_2K.cif') | ||
573 | - #exit(1) | ||
574 | - # concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances.csv') | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
613 | + print("This file is not supposed to be run directly. Run statistics.py instead.") | ... | ... |
... | @@ -38,6 +38,7 @@ LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam | ... | @@ -38,6 +38,7 @@ LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam |
38 | SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 | 38 | SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 |
39 | 39 | ||
40 | from geometric_stats import * # after definition of the variables | 40 | from geometric_stats import * # after definition of the variables |
41 | +from statistical_potential import * | ||
41 | 42 | ||
42 | @trace_unhandled_exceptions | 43 | @trace_unhandled_exceptions |
43 | def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0): | 44 | def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0): |
... | @@ -1181,13 +1182,16 @@ def measure_from_structure(f): | ... | @@ -1181,13 +1182,16 @@ def measure_from_structure(f): |
1181 | s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f)) | 1182 | s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f)) |
1182 | 1183 | ||
1183 | #pyle_measures(name, s, thr_idx) | 1184 | #pyle_measures(name, s, thr_idx) |
1184 | - measures_aa(name, s, thr_idx) | 1185 | + # measures_aa(name, s, thr_idx) |
1185 | if DO_HIRE_RNA_MEASURES: | 1186 | if DO_HIRE_RNA_MEASURES: |
1186 | measures_hrna(name, s, thr_idx) | 1187 | measures_hrna(name, s, thr_idx) |
1187 | measures_hrna_basepairs(name, s, path_to_3D_data, thr_idx) | 1188 | measures_hrna_basepairs(name, s, path_to_3D_data, thr_idx) |
1188 | if DO_WADLEY_ANALYSIS: | 1189 | if DO_WADLEY_ANALYSIS: |
1189 | measures_pyle(name, s, thr_idx) | 1190 | measures_pyle(name, s, thr_idx) |
1190 | - | 1191 | + if DO_STAT_POTENTIAL_MEASURES: |
1192 | + # measures_heavy_atoms(name, s, thr_idx) | ||
1193 | + if DO_WADLEY_ANALYSIS: | ||
1194 | + pyle_measures_for_potentials(name, s, thr_idx) | ||
1191 | idxQueue.put(thr_idx) # replace the thread index in the queue | 1195 | idxQueue.put(thr_idx) # replace the thread index in the queue |
1192 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | 1196 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") |
1193 | 1197 | ||
... | @@ -1335,10 +1339,11 @@ if __name__ == "__main__": | ... | @@ -1335,10 +1339,11 @@ if __name__ == "__main__": |
1335 | REDUNDANT_DIST_MAT = True | 1339 | REDUNDANT_DIST_MAT = True |
1336 | DO_HIRE_RNA_MEASURES = False | 1340 | DO_HIRE_RNA_MEASURES = False |
1337 | RESCAN_GMM_COMP_NUM = False | 1341 | RESCAN_GMM_COMP_NUM = False |
1342 | + DO_STAT_POTENTIAL_MEASURES = False | ||
1338 | try: | 1343 | try: |
1339 | opts, _ = getopt.getopt( sys.argv[1:], "r:h", | 1344 | opts, _ = getopt.getopt( sys.argv[1:], "r:h", |
1340 | [ "help", "from-scratch", "wadley", "distance-matrices", "non-redundant", "resolution=", | 1345 | [ "help", "from-scratch", "wadley", "distance-matrices", "non-redundant", "resolution=", |
1341 | - "3d-folder=", "seq-folder=", "hire-rna", "rescan-nmodes" ]) | 1346 | + "3d-folder=", "seq-folder=", "hire-rna", "rescan-nmodes", "stat-potential" ]) |
1342 | except getopt.GetoptError as err: | 1347 | except getopt.GetoptError as err: |
1343 | print(err) | 1348 | print(err) |
1344 | sys.exit(2) | 1349 | sys.exit(2) |
... | @@ -1364,6 +1369,7 @@ if __name__ == "__main__": | ... | @@ -1364,6 +1369,7 @@ if __name__ == "__main__": |
1364 | print("--wadley\t\t\tReproduce Wadley & al 2007 clustering of pseudotorsions.") | 1369 | print("--wadley\t\t\tReproduce Wadley & al 2007 clustering of pseudotorsions.") |
1365 | print("--hire-rna\t\t\tCompute distances between atoms and torsion angles for HiRE-RNA model,\n\t\t\t\t and plot GMMs on the data.") | 1370 | print("--hire-rna\t\t\tCompute distances between atoms and torsion angles for HiRE-RNA model,\n\t\t\t\t and plot GMMs on the data.") |
1366 | print("--rescan-nmodes\t\t\tDo not assume the number of modes in distances and angles distributions, measure it.") | 1371 | print("--rescan-nmodes\t\t\tDo not assume the number of modes in distances and angles distributions, measure it.") |
1372 | + print("--stat-potential\t\t\tCompute statistical potentials based on averaging and quasi-chemical approximation methods.") | ||
1367 | sys.exit() | 1373 | sys.exit() |
1368 | elif opt == "--version": | 1374 | elif opt == "--version": |
1369 | print("RNANet statistics 1.6 beta") | 1375 | print("RNANet statistics 1.6 beta") |
... | @@ -1407,7 +1413,14 @@ if __name__ == "__main__": | ... | @@ -1407,7 +1413,14 @@ if __name__ == "__main__": |
1407 | RESCAN_GMM_COMP_NUM = True | 1413 | RESCAN_GMM_COMP_NUM = True |
1408 | elif opt == "--non-redundant": | 1414 | elif opt == "--non-redundant": |
1409 | REDUNDANT_DIST_MAT = False | 1415 | REDUNDANT_DIST_MAT = False |
1410 | - | 1416 | + elif opt == "--stat-potential": |
1417 | + DO_STAT_POTENTIAL_MEASURES = True | ||
1418 | + os.makedirs(runDir + "/results/geometry/Pyle/distances_i_i+1/", exist_ok=True) | ||
1419 | + os.makedirs(runDir + "/results/geometry/all-atoms/distances_classes/", exist_ok=True) | ||
1420 | + os.makedirs(runDir + "/results/geometry/all-atoms/atom_count/", exist_ok=True) | ||
1421 | + os.makedirs(runDir + "/results/statistical_potential/ratio/Pyle/", exist_ok=True) | ||
1422 | + os.makedirs(runDir + "/results/statistical_potential/ratio/all-atoms/", exist_ok=True) | ||
1423 | + os.makedirs(runDir + "/results/statistical_potential/figures/all-atoms/", exist_ok=True) | ||
1411 | # Load mappings. famlist will contain only families with structures at this resolution threshold. | 1424 | # Load mappings. famlist will contain only families with structures at this resolution threshold. |
1412 | 1425 | ||
1413 | print("Loading mappings list...") | 1426 | print("Loading mappings list...") |
... | @@ -1420,19 +1433,19 @@ if __name__ == "__main__": | ... | @@ -1420,19 +1433,19 @@ if __name__ == "__main__": |
1420 | WHERE issue = 0 AND resolution <= {res_thr} AND rfam_acc != 'unmappd' | 1433 | WHERE issue = 0 AND resolution <= {res_thr} AND rfam_acc != 'unmappd' |
1421 | GROUP BY rfam_acc; | 1434 | GROUP BY rfam_acc; |
1422 | """, conn) | 1435 | """, conn) |
1423 | - families.drop(families[families.n_chains == 0].index, inplace=True) | 1436 | + # families.drop(families[families.n_chains == 0].index, inplace=True) |
1424 | - mappings_list = {} | 1437 | + # mappings_list = {} |
1425 | - for k in families.rfam_acc: | 1438 | + # for k in families.rfam_acc: |
1426 | - mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"""SELECT chain_id | 1439 | + # mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"""SELECT chain_id |
1427 | - FROM chain JOIN structure ON chain.structure_id=structure.pdb_id | 1440 | + # FROM chain JOIN structure ON chain.structure_id=structure.pdb_id |
1428 | - WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};""") ] | 1441 | + # WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};""") ] |
1429 | - famlist = families.rfam_acc.tolist() | 1442 | + # famlist = families.rfam_acc.tolist() |
1430 | - ignored = families[families.n_chains < 3].rfam_acc.tolist() | 1443 | + # ignored = families[families.n_chains < 3].rfam_acc.tolist() |
1431 | - famlist.sort(key=family_order) | 1444 | + # famlist.sort(key=family_order) |
1432 | - | 1445 | + |
1433 | - print(f"Found {len(famlist)} families with chains or better.") | 1446 | + # print(f"Found {len(famlist)} families with chains or better.") |
1434 | - if len(ignored): | 1447 | + # if len(ignored): |
1435 | - print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') | 1448 | + # print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') |
1436 | 1449 | ||
1437 | if DELETE_OLD_DATA: | 1450 | if DELETE_OLD_DATA: |
1438 | for f in famlist: | 1451 | for f in famlist: |
... | @@ -1456,9 +1469,9 @@ if __name__ == "__main__": | ... | @@ -1456,9 +1469,9 @@ if __name__ == "__main__": |
1456 | joblist = [] | 1469 | joblist = [] |
1457 | 1470 | ||
1458 | # Do eta/theta plots | 1471 | # Do eta/theta plots |
1459 | - if n_unmapped_chains and DO_WADLEY_ANALYSIS: | 1472 | + # if n_unmapped_chains and DO_WADLEY_ANALYSIS: |
1460 | - joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) | 1473 | + # joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) |
1461 | - joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) | 1474 | + # joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) |
1462 | 1475 | ||
1463 | # Do distance matrices for each family excl. LSU/SSU (will be processed later) | 1476 | # Do distance matrices for each family excl. LSU/SSU (will be processed later) |
1464 | if DO_AVG_DISTANCE_MATRIX: | 1477 | if DO_AVG_DISTANCE_MATRIX: |
... | @@ -1474,23 +1487,23 @@ if __name__ == "__main__": | ... | @@ -1474,23 +1487,23 @@ if __name__ == "__main__": |
1474 | joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, res_thr, False, REDUNDANT_DIST_MAT, False))) | 1487 | joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, res_thr, False, REDUNDANT_DIST_MAT, False))) |
1475 | 1488 | ||
1476 | # Do general family statistics | 1489 | # Do general family statistics |
1477 | - joblist.append(Job(function=stats_len)) # Computes figures about chain lengths | 1490 | + # joblist.append(Job(function=stats_len)) # Computes figures about chain lengths |
1478 | - joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families) | 1491 | + # joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families) |
1479 | - for f in famlist: | 1492 | + # for f in famlist: |
1480 | - joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database (intra-chain basepair types within a family) | 1493 | + # joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database (intra-chain basepair types within a family) |
1481 | - if f not in ignored: | 1494 | + # if f not in ignored: |
1482 | - joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families) | 1495 | + # joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families) |
1483 | 1496 | ||
1484 | 1497 | ||
1485 | # Do geometric measures | 1498 | # Do geometric measures |
1486 | if n_unmapped_chains: | 1499 | if n_unmapped_chains: |
1487 | os.makedirs(runDir + "/results/geometry/all-atoms/distances/", exist_ok=True) | 1500 | os.makedirs(runDir + "/results/geometry/all-atoms/distances/", exist_ok=True) |
1488 | structure_list = representatives_from_nrlist(res_thr) | 1501 | structure_list = representatives_from_nrlist(res_thr) |
1489 | - for f in structure_list: | 1502 | + for f in structure_list[:10]: |
1490 | if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]): | 1503 | if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]): |
1491 | joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances | 1504 | joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances |
1492 | 1505 | ||
1493 | - process_jobs(joblist) | 1506 | + # process_jobs(joblist) |
1494 | 1507 | ||
1495 | # Now process the memory-heavy tasks family by family | 1508 | # Now process the memory-heavy tasks family by family |
1496 | if DO_AVG_DISTANCE_MATRIX: | 1509 | if DO_AVG_DISTANCE_MATRIX: |
... | @@ -1511,31 +1524,55 @@ if __name__ == "__main__": | ... | @@ -1511,31 +1524,55 @@ if __name__ == "__main__": |
1511 | 1524 | ||
1512 | # finish the work after the parallel portions | 1525 | # finish the work after the parallel portions |
1513 | 1526 | ||
1514 | - per_chain_stats() # per chain base frequencies and basepair types | 1527 | + # per_chain_stats() # per chain base frequencies and basepair types |
1515 | - seq_idty() # identity matrices from pre-computed .npy matrices | 1528 | + # seq_idty() # identity matrices from pre-computed .npy matrices |
1516 | - stats_pairs() | 1529 | + # stats_pairs() |
1517 | if n_unmapped_chains: | 1530 | if n_unmapped_chains: |
1518 | - general_stats() | 1531 | + # general_stats() |
1519 | os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True) | 1532 | os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True) |
1520 | os.makedirs(runDir+"/results/geometry/json/", exist_ok=True) | 1533 | os.makedirs(runDir+"/results/geometry/json/", exist_ok=True) |
1521 | - concat_dataframes(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv', nworkers) | 1534 | + # concat_dataframes(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv', nworkers) |
1522 | if DO_HIRE_RNA_MEASURES: | 1535 | if DO_HIRE_RNA_MEASURES: |
1523 | concat_dataframes(runDir + '/results/geometry/HiRE-RNA/distances/', 'distances_HiRERNA.csv', nworkers) | 1536 | concat_dataframes(runDir + '/results/geometry/HiRE-RNA/distances/', 'distances_HiRERNA.csv', nworkers) |
1524 | concat_dataframes(runDir + '/results/geometry/HiRE-RNA/angles/', 'angles_HiRERNA.csv', nworkers) | 1537 | concat_dataframes(runDir + '/results/geometry/HiRE-RNA/angles/', 'angles_HiRERNA.csv', nworkers) |
1525 | concat_dataframes(runDir + '/results/geometry/HiRE-RNA/torsions/', 'torsions_HiRERNA.csv', nworkers) | 1538 | concat_dataframes(runDir + '/results/geometry/HiRE-RNA/torsions/', 'torsions_HiRERNA.csv', nworkers) |
1526 | concat_dataframes(runDir + '/results/geometry/HiRE-RNA/basepairs/', 'basepairs_HiRERNA.csv', nworkers) | 1539 | concat_dataframes(runDir + '/results/geometry/HiRE-RNA/basepairs/', 'basepairs_HiRERNA.csv', nworkers) |
1527 | - if DO_WADLEY_ANALYSIS: | 1540 | + # if DO_WADLEY_ANALYSIS: |
1528 | - concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances_pyle.csv', nworkers) | 1541 | + # concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances_pyle.csv', nworkers) |
1529 | - concat_dataframes(runDir + '/results/geometry/Pyle/angles/', 'flat_angles_pyle.csv', nworkers) | 1542 | + # concat_dataframes(runDir + '/results/geometry/Pyle/angles/', 'flat_angles_pyle.csv', nworkers) |
1543 | + # if DO_STAT_POTENTIAL_MEASURES: | ||
1544 | + # with sqlite3.connect(runDir + '/results/Stat_potential.db') as conn: | ||
1545 | + # sql_new_table(conn) | ||
1546 | + # joblist=[] | ||
1547 | + # joblist.append(Job(function=count_occur_atom_dist, args=(runDir + '/results/geometry/all-atoms/distances_classes/', 'occur_dist_classes.csv'))) | ||
1548 | + # joblist.append(Job(function=mole_fraction, args=(runDir + '/results/geometry/all-atoms/atom_count/', 'atom_count.csv'))) | ||
1549 | + # process_jobs(joblist) | ||
1550 | + # compute_ratio_from_csv(runDir + '/results/geometry/all-atoms/distances_classes/occur_dist_classes.csv', 'avg_ratio_pobs_pref.csv', 'qch_ratio_pobs_pref.csv') | ||
1551 | + # if DO_WADLEY_ANALYSIS: | ||
1552 | + # concat_dataframes(runDir + '/results/geometry/Pyle/distances_i_i+1/', 'distances.csv', nworkers) | ||
1530 | joblist = [] | 1553 | joblist = [] |
1531 | - joblist.append(Job(function=gmm_aa_dists, args=(RESCAN_GMM_COMP_NUM,))) | 1554 | + # joblist.append(Job(function=gmm_aa_dists, args=(RESCAN_GMM_COMP_NUM,))) |
1532 | - joblist.append(Job(function=gmm_aa_torsions, args=(RESCAN_GMM_COMP_NUM, res_thr))) | 1555 | + # joblist.append(Job(function=gmm_aa_torsions, args=(RESCAN_GMM_COMP_NUM, res_thr))) |
1533 | if DO_HIRE_RNA_MEASURES: | 1556 | if DO_HIRE_RNA_MEASURES: |
1534 | joblist.append(Job(function=gmm_hrna, args=(RESCAN_GMM_COMP_NUM,))) | 1557 | joblist.append(Job(function=gmm_hrna, args=(RESCAN_GMM_COMP_NUM,))) |
1535 | joblist.append(Job(function=gmm_hrna_basepairs, args=(RESCAN_GMM_COMP_NUM,))) | 1558 | joblist.append(Job(function=gmm_hrna_basepairs, args=(RESCAN_GMM_COMP_NUM,))) |
1536 | - if DO_WADLEY_ANALYSIS: | 1559 | + # if DO_WADLEY_ANALYSIS: |
1537 | - joblist.append(Job(function=gmm_pyle, args=(RESCAN_GMM_COMP_NUM, res_thr))) | 1560 | + # joblist.append(Job(function=gmm_pyle, args=(RESCAN_GMM_COMP_NUM, res_thr))) |
1538 | - process_jobs(joblist) | 1561 | + |
1539 | - merge_jsons(DO_HIRE_RNA_MEASURES) | 1562 | + if DO_STAT_POTENTIAL_MEASURES: |
1563 | + methods = ['avg', 'qch'] | ||
1564 | + os.makedirs(runDir + "/results/statistical_potential/figures/all-atoms/avg_method/", exist_ok=True) | ||
1565 | + os.makedirs(runDir + "/results/statistical_potential/figures/all-atoms/qch_method/", exist_ok=True) | ||
1566 | + # save_into_database() | ||
1567 | + # for method in methods: | ||
1568 | + # joblist.append(Job(function=stat_potential, args=(runDir + '/results/statistical_potential/ratio/all-atoms/' + method + '_ratio_pobs_pref.csv', method))) | ||
1569 | + if DO_WADLEY_ANALYSIS: | ||
1570 | + os.makedirs(runDir + "/results/statistical_potential/figures/Pyle/", exist_ok=True) | ||
1571 | + os.makedirs(runDir + "/results/statistical_potential/json/Pyle/", exist_ok=True) | ||
1572 | + # joblist.append(Job(function=gmm_pyle_per_type, args=(RESCAN_GMM_COMP_NUM,))) | ||
1573 | + # gmm_pyle_per_type(RESCAN_GMM_COMP_NUM) | ||
1574 | + stat_potential_pyle() | ||
1575 | + # process_jobs(joblist) | ||
1576 | + # merge_jsons(DO_HIRE_RNA_MEASURES) | ||
1540 | 1577 | ||
1541 | 1578 | ... | ... |
-
Please register or login to post a comment