Aglaé TABOT

connections between statistics.py and statistical_potential.py and some code modifications and optimizations
#!/usr/bin/python3
# RNANet statistics
# Developed by Aglaé Tabot, 2021
# This file computes statistical potentials over the produced dataset.
# THIS FILE IS NOT SUPPOSED TO BE RUN DIRECTLY.
import getopt, os, pickle, sqlite3, shlex, subprocess, sys, warnings
import time
@@ -13,88 +20,37 @@ from Bio.PDB.vectors import Vector, calc_angle, calc_dihedral
from multiprocessing import Pool, Manager
from os import path
from tqdm import tqdm
from collections import Counter
from setproctitle import setproctitle
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_with_tqdm, trace_unhandled_exceptions
from sklearn.mixture import GaussianMixture
import scipy.stats as st
import warnings
from pandas.core.common import SettingWithCopyWarning
from itertools import combinations_with_replacement
from math import *
# from geometric_stats import get_euclidian_distance
from geometric_stats import get_euclidian_distance, GMM_histo
np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8)
runDir = os.getcwd()
def liste_repres(fpath):
repres=[]
df=pd.read_csv(os.path.abspath(fpath))
for i in range(df.shape[0]):
up_name=df["representative"][i]
if '+' in up_name:
up_name=up_name.split('+')
for i in range(len(up_name)):
chain=up_name[i].split('|')
chain=chain[0].lower()+'_'+chain[1]+'_'+chain[2]
repres.append(chain+'.cif')
else :
up_name=up_name.split('|')
low_name=up_name[0].lower()+'_'+up_name[1]+'_'+up_name[2]
repres.append(low_name+'.cif')
return repres
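# Example of the parsing above: a "representative" field like "1A2B|1|A+1A2B|1|B"
# expands to ["1a2b_1_A.cif", "1a2b_1_B.cif"], and a single entry like "4V9F|1|0"
# maps to "4v9f_1_0.cif" (hypothetical identifiers, for illustration).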
def measure_from_structure(f): # or should this be launched from statistics.py?
"""
Do geometric measures required on a given filename
"""
name = f.split('.')[0]
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measure_from_structure({f})")
# Open the structure
with warnings.catch_warnings():
# Ignore the PDB problems. This mostly warns that some chain is discontinuous.
warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
parser=MMCIFParser()
s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f))
measures_heavy_atoms(name, s, thr_idx)
if DO_WADLEY_ANALYSIS:
pyle_measures(name, s, thr_idx)
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
def pyle_measures(name, s, thr_idx):
@trace_unhandled_exceptions
def pyle_measures_for_potentials(name, s, thr_idx):
# measure distances P-P, P-C1', P-C4', C1'-C1', C4'-C4'
# between residues
# along the chain
if (path.isfile(runDir + '/results/geometry/Pyle/distances/distances_pyle_'+name+'.csv')):
# between residues along the chain
# Requires a lot of storage space!
if (path.isfile(runDir + '/results/geometry/Pyle/distances_i_i+1/distances_pyle_i_i+1'+name+'.csv')):
return
liste_dist=[]
#classes=[]
#for i in range(0, 150, 5):
#classes.append([i, i+5])
#classes.append([150, 300])
#occur_p_p=len(classes)*[0]
#occur_p_c1=len(classes)*[0]
#occur_p_c4=len(classes)*[0]
#occur_c1_c1=len(classes)*[0]
#occur_c4_c4=len(classes)*[0]
#nb_occurs=[]
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} pyle_measures({name})")
chain = next(s[0].get_chains())
#residues=list(chain.get_residues())
for res1 in tqdm(chain, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} pyle_measures", unit="res", leave=False):
#res1=chain[i]
if res1.get_resname() in ["A", "C", "G", "U"]:
resnum1=list(res1.get_id())[1]
atom_p_1 = [ atom.get_coord() for atom in res1 if atom.get_name() == "P"]
@@ -109,7 +65,7 @@ def pyle_measures(name, s, thr_idx):
p_c1p=np.nan
c4p_c4p=np.nan
c1p_c1p=np.nan
#res2=chain[j]
if res2.get_resname() in ["A", "C", "G", "U"]:
atom_p_2 = [ atom.get_coord() for atom in res2 if atom.get_name() == "P"]
@@ -123,94 +79,218 @@ def pyle_measures(name, s, thr_idx):
c1p_c1p= get_euclidian_distance(atom_c1p_1, atom_c1p_2)
liste_dist.append([res1.get_resname(), int(resnum1), res2.get_resname(), int(resnum2), p_p, p_c4p, p_c1p, c4p_c4p, c1p_c1p])
'''
for x in range(len(classes)):
if classes[x][0] <= p_p <= classes[x][1]:
occur_p_p[x]=occur_p_p[x]+1
if classes[x][0] <= p_c4p <= classes[x][1]:
occur_p_c4[x]=occur_p_c4[x]+1
if classes[x][0] <= p_c1p <= classes[x][1]:
occur_p_c1[x]=occur_p_c1[x]+1
if classes[x][0] <= c4p_c4p <= classes[x][1]:
occur_c4_c4[x]=occur_c4_c4[x]+1
if classes[x][0] <= c1p_c1p <= classes[x][1]:
occur_c1_c1[x]=occur_c1_c1[x]+1
'''
#for x in range(len(classes)):
# for i in range(len(liste_dist)):
# if classes[x][0] <= liste_dist[i][4] <= classes[x][1]:
# occur_p_p[x]=occur_p_p[x]+1
# if classes[x][0] <= liste_dist[i][5] <= classes[x][1]:
# occur_p_c4[x]=occur_p_c4[x]+1
# if classes[x][0] <= liste_dist[i][6] <= classes[x][1]:
# occur_p_c1[x]=occur_p_c1[x]+1
# if classes[x][0] <= liste_dist[i][7] <= classes[x][1]:
# occur_c4_c4[x]=occur_c4_c4[x]+1
# if classes[x][0] <= liste_dist[i][8] <= classes[x][1]:
# occur_c1_c1[x]=occur_c1_c1[x]+1
#nb_occurs.append([classes[x], occur_p_p[x], occur_p_c1[x], occur_p_c4[x], occur_c1_c1[x], occur_c4_c4[x]])
#df = pd.DataFrame(nb_occurs, columns=["classe", "P-P", "P-C1'", "P-C4'", "C1'-C1'", "C4'-C4'"])
# return df
# nb_occurs.append([classes, occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4])
# print(nb_occurs)
# return nb_occurs
df = pd.DataFrame(liste_dist, columns=["res1", "resnum1", "res2", "resnum2", "P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"])
df.to_csv(runDir + "/results/geometry/Pyle/distances/distances_pyle_" + name + ".csv")
df.to_csv(runDir + "/results/geometry/Pyle/distances_i_i+1/distances_pyle_i_i+1" + name + ".csv")
def gmm_pyle_type(ntpair, data):
@trace_unhandled_exceptions
def gmm_pyle_type(ntpair, data, scan):
setproctitle(f"GMM (Pyle {ntpair} )")
os.makedirs(runDir + "/results/figures/GMM/Pyle/distances/", exist_ok=True)
os.chdir(runDir + "/results/figures/GMM/Pyle/distances/")
os.makedirs(runDir + "/results/figures/GMM/Pyle/distances_i_i+1/", exist_ok=True)
os.chdir(runDir + "/results/figures/GMM/Pyle/distances_i_i+1/")
p_p=list(data["P-P"][~ np.isnan(data["P-P"])])
p_c4p=list(data["P-C4'"][~ np.isnan(data["P-C4'"])])
p_c1p=list(data["P-C1'"][~ np.isnan(data["P-C1'"])])
c4p_c4p=list(data["C4'-C4'"][~ np.isnan(data["C4'-C4'"])])
c1p_c1p=list(data["C1'-C1'"][~ np.isnan(data["C1'-C1'"])])
#print(len(p_p))
# res2=list(data["resnum2"])
# res1=list(data["resnum1"])
# diff=[]
# for i in range(len(res1)):
# diff.append(res2[i]-res1[i])
# print(diff[:100])
GMM_histo(p_p, f"Distance P-P between {ntpair}", toric=False, hist=False, col="cyan")
GMM_histo(p_c4p, f"Distance P-C4' between {ntpair}", toric=False, hist=False, col="tomato")
GMM_histo(p_c1p, f"Distance P-C1' between {ntpair}", toric=False, hist=False, col="goldenrod")
GMM_histo(c4p_c4p, f"Distance C4'-C4' between {ntpair}", toric=False, hist=False, col="magenta")
GMM_histo(c1p_c1p, f"Distance C1'-C1' between {ntpair}", toric=False, hist=False, col="black")
# GMM_histo(diff, f"Gap between {ntpair} ", toric=False, hist=False, col="tomato")
GMM_histo(p_p, f"Distance P-P between {ntpair}", scan, toric=False, hist=False, col="cyan")
GMM_histo(p_c4p, f"Distance P-C4' between {ntpair}", scan, toric=False, hist=False, col="tomato")
GMM_histo(p_c1p, f"Distance P-C1' between {ntpair}", scan, toric=False, hist=False, col="goldenrod")
GMM_histo(c4p_c4p, f"Distance C4'-C4' between {ntpair}", scan, toric=False, hist=False, col="magenta")
GMM_histo(c1p_c1p, f"Distance C1'-C1' between {ntpair}", scan, toric=False, hist=False, col="black")
plt.xlabel("Distance (Angströms)")
# plt.xlabel("Number of residues")
#plt.ylabel("Distance (Angströms)")
plt.title(f"GMM of distances for {ntpair} ", fontsize=10)
# plt.savefig(f"Longueurs_Pyle_{ntpair}.png" )
plt.savefig(f"Distances_Pyle_{ntpair}.png" )
plt.savefig(runDir + "/results/figures/GMM/Pyle/distances_i_i+1/" + f"Distances_Pyle_{ntpair}.png" )
plt.close()
setproctitle(f"GMM (Pyle {ntpair} distances) finished")
def gmm_pyle():
@trace_unhandled_exceptions
def gmm_pyle_per_type(scan):
setproctitle("GMM (Pyle model)")
df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/distances/distances.csv"))
df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/distances_i_i+1/distances.csv"))
# dist = ["P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"]
data=df
if len(data):
for b1 in ['A','C','G','U']:
for b2 in ['A','C','G','U']:
thisbases = data[(data.res1 == b1)&(data.res2 == b2)]
if len(thisbases):
gmm_pyle_type(b1+b2, thisbases)
gmm_pyle_type(b1+b2, thisbases, scan)
setproctitle("GMM (Pyle model) finished")
# The next 7 functions are used to calculate the statistical potentials with the averaging method (residue-averaging)
# for the Pyle model from the GMM results, and to plot the corresponding figures
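# Assumed call order, as driven by stat_potential_pyle() below: averaging()
# summarizes the GMM components of each measure into reference parameters,
# potential_dist() bins observed vs. reference probabilities and takes
# -ln(Pobs/Pref), and courbe_stat_pot() plots the resulting pseudo-energy curve.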
def pgaussred(x):
"""
Cumulative distribution function of the standard normal distribution
(= the probability that a standard normal random variable is less than x).
Computed by numerical integration with 2000 slices per standard deviation;
the result is rounded to 7 decimal places.
"""
if x==0:
return 0.5
u=abs(x)
n=int(u*2000)
du=u/n
k=1/sqrt(2*pi)
u1=0
f1=k
p=0.5
for i in range(0,n):
u2=u1+du
f2=k*exp(-0.5*u2*u2)
p=p+(f1+f2)*du*0.5
u1=u2
f1=f2
if x<0:
p = 1.0-p
return round(p, 7)
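# A quick cross-check sketch (not part of the pipeline): math.erf gives the
# same CDF in closed form; erf and sqrt are available via `from math import *`
# above, and the helper name _pgaussred_check is hypothetical.
def _pgaussred_check(x):
    # Phi(x) = (1 + erf(x / sqrt(2))) / 2 for the standard normal distribution
    return round(0.5 * (1.0 + erf(x / sqrt(2))), 7)
# e.g. pgaussred(1.0) and _pgaussred_check(1.0) both give 0.8413447.
# Caveat: pgaussred raises ZeroDivisionError for 0 < |x| < 0.0005,
# since n = int(u*2000) is then 0.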
def proba(m, s, xinf, xsup):
"""
Computes the probability that a value drawn from a normal distribution
with mean m and standard deviation s falls in the interval [xinf, xsup].
"""
prob=pgaussred((xsup-m)/s)-pgaussred((xinf-m)/s)
return prob
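# Example: for the standard normal (m=0, s=1), proba(0, 1, -1, 1) returns
# ~0.6826895, the familiar ~68% of the mass within one standard deviation.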
def extract_from_json(data, xinf, xsup):
"""
Extracts the means and standard deviations from the JSON produced by the GMM
calculations, then computes from these parameters the probability that a value
falls in the interval [xinf, xsup].
"""
p=[]
p_tot=0
classes=[]
with open(data + '.json') as json_data:
data_dict = json.load(json_data)
for i in range(len(data_dict['means'])):
mean = data_dict['means'][i]
if mean[0] == '[':
mean=float((mean.split('['))[1].split(']')[0])
else :
mean=float(mean)
std = data_dict['std'][i]
if std[0] == '[':
std=float((std.split('['))[2].split(']')[0])
else:
std=float(std)
prob=proba(mean, std, xinf, xsup)
p.append(prob)
for x in p:
p_tot = p_tot+x
return p_tot
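# Assumed input format: the JSON read here follows the GMM_histo output, where
# entries of "means" and "std" may be plain numbers ("4.52") or stringified
# numpy arrays ("[4.52]", "[[0.33]]"), hence the bracket stripping above.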
def averaging(liste_data, name):
"""
Creates a JSON containing all the means and standard deviations of the
parameters used to build the reference distribution with the averaging method.
"""
curves=[]
x = np.linspace(0,250,1000)
summary_data = {}
summary_data["measures"] = []
summary_data["means"] = []
summary_data["std"] = []
for data in liste_data:
s=0
mean_tot=0
std_tot=0
with open(runDir + '/results/geometry/json/' + data + '.json') as json_data:
data_dict = json.load(json_data)
for i in range(len(data_dict['means'])):
mean = float(data_dict['means'][i])
std = float(data_dict['std'][i])
weight = float(data_dict['weights'][i])
y = weight*st.norm.pdf(x, mean, std)
mean_tot=mean_tot+(weight*mean)
std_tot=std_tot+(weight*std)
curves.append(y)
summary_data["measures"].append(data)
summary_data["means"].append(str(mean_tot))
summary_data["std"].append(str(std_tot))
with open(runDir + '/results/statistical_potential/json/Pyle/avg_' +name + ".json", 'w', encoding='utf-8') as f:
json.dump(summary_data, f, indent=4)
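# Note: mean_tot, the weighted sum of component means, is the exact mean of
# the Gaussian mixture, but the weighted sum of component stds is only an
# approximation: the exact mixture std would be
# sqrt(sum_i w_i*(std_i**2 + mean_i**2) - mean_tot**2).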
def potential_dist(data_obs, data_ref):
name=data_obs.split('/')[-1]
l=[]
for k in range(0, 400, 5):
obs=extract_from_json(data_obs, k, k+5)
ref=extract_from_json(data_ref, k, k+5)
if obs != 0.0 and ref != 0.0:
u = -log(obs/ref)
l.append([k, k+5, obs, ref, u])
l.append([k+5, k+5, obs, ref, u])
else:
l.append([k, k+5, obs, ref, None])
l.append([k+5, k+5, obs, ref, None])
df=pd.DataFrame(l, columns=["Xinf", "Xsup", "Pobs", "Pref", "- ln(Pobs/Pref)"])
df.to_csv(runDir + '/results/statistical_potential/ratio/Pyle/Statistical potential ' + name + ".csv")
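# -ln(Pobs/Pref) is the classic inverse-Boltzmann pseudo-energy (in kT units):
# bins where the observed probability exceeds the reference get a negative,
# i.e. favorable, value. Example: Pobs=0.10 and Pref=0.05 give u = -ln(2) ≈ -0.69.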
def courbe_stat_pot(f):
name=f.split('/')[-1]
name=name.split('.')[0]
df=pd.read_csv(f)
E=list(df["- ln(Pobs/Pref)"][~ np.isnan(df["- ln(Pobs/Pref)"])])
max_E=max(E)
min_E=min(E)
index_with_nan=df.index[df.iloc[:,5].isnull()]
for i in index_with_nan:
df.iloc[i, 5]=max_E # replace the NaN by the maximum energy found
y=list(df["- ln(Pobs/Pref)"])
x=list(df["Xinf"])
x=np.array(x)
y=np.array(y)
plt.plot(x, y)
plt.xlabel("Distance (Angström)")
plt.ylabel("- ln(Pobs/Pref)")
plt.title(name)
plt.savefig(runDir + "/results/statistical_potential/figures/Pyle/avg_statistical_pot_ " + name + ".png")
plt.close()
def stat_potential_pyle():
pyle = ["P-P", "P-C1'", "P-C4'", "C1'-C1'", "C4'-C4'"]
nt = ["A", "C", "G", "U"]
for pair in pyle:
type_list=[]
for res1 in nt:
for res2 in nt:
setproctitle(f"RNANet statistics.py stat_potential_pyle({pair}, {res1}{res2})")
type_list.append(f"Distance {pair} between {res1}{res2}")
averaging(type_list, f"{pair}")
for dist in type_list:
potential_dist(runDir + '/results/geometry/json/' + dist, runDir + f'/results/statistical_potential/json/Pyle/avg_{pair}')
courbe_stat_pot(runDir + '/results/statistical_potential/ratio/Pyle/Statistical potential ' + dist + '.csv')
setproctitle(f"RNANet statistics.py stat_potential_pyle({pair}, {res1}{res2}) finished")
@trace_unhandled_exceptions
def measures_heavy_atoms(name, s, thr_idx):
"""
Creates the list of all possible atom pairs among the 85 atom types
@@ -312,7 +392,7 @@ def measures_heavy_atoms(name, s, thr_idx):
df_occur = pd.DataFrame(occurences, columns=["atom_pair_type"] + classes_str)
# save this
df_occur.to_csv(runDir + "/results/geometry/all-atoms/distances_classes/distances_classes_occur_" + name + ".csv")
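# With 85 atom types, combinations_with_replacement yields 85*86/2 = 3655
# distinct atom-pair types, i.e. one row of df_occur per pair.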
@trace_unhandled_exceptions
def count_occur_atom_dist(fpath, outfilename):
"""
After having calculated the number of occurrences of the distances between pairs of atoms
@@ -320,13 +400,9 @@ def count_occur_atom_dist(fpath, outfilename):
we add these occurrences one by one to obtain a single dataframe with the total number of occurrences
"""
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"Worker {thr_idx+1} : Addition of occurences of {fpath}")
setproctitle(f"Addition of occurences of {fpath}")
liste=os.listdir(fpath)
pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing "+outfilename, leave=False)
df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop()))
for f in range(len(liste)):
@@ -334,23 +410,20 @@ def count_occur_atom_dist(fpath, outfilename):
for i in range(df.shape[0]):
for j in range(2, df.shape[1]):
df_tot.iloc[i, j]=df_tot.iloc[i, j] + df.iloc[i, j]
pbar.update(1)
df_tot.to_csv(fpath+outfilename)
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
setproctitle(f"Addition of occurences of {fpath} finished")
@trace_unhandled_exceptions
def mole_fraction(fpath, outfilename):
"""
Calculation of the mole fraction of each type of atom within the set of structures
"""
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"Worker {thr_idx+1} : Calculation of mole fractions of {fpath}")
setproctitle(f"Calculation of mole fractions of {fpath}")
liste=os.listdir(fpath)
pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing "+outfilename, leave=False)
df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop()))
del df_tot["Unnamed: 0"]
@@ -359,31 +432,29 @@ def mole_fraction(fpath, outfilename):
del df["Unnamed: 0"]
for i in range(df.shape[0]):
df_tot.iloc[i, 1]=df_tot.iloc[i, 1] + df.iloc[i, 1]
pbar.update(1)
total=sum(list(df_tot["count"]))
fract=[]
for i in range(df_tot.shape[0]):
# df_tot.iloc[i, 1]=df_tot.iloc[i, 1]/total
fract.append(df_tot.iloc[i, 1]/total)
df_tot["mole_fraction"]=fract
file_list=os.listdir(fpath)
file_list=[fpath+x for x in file_list]
# file_list=os.listdir(fpath)
# file_list=[fpath+x for x in file_list]
# for f in file_list: #after processing, deletion of csv by structure
# os.remove(f)
df_tot.to_csv(fpath+outfilename) # or just a return?
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
df_tot.to_csv(fpath+outfilename)
setproctitle(f"Calculation of mole fractions of {fpath} finished")
@trace_unhandled_exceptions
def compute_ratio_from_csv(fpath, avg_file, qch_file):
"""
Computes the observed and reference probabilities according to the methods
chosen to establish the reference state, then computes the Pobs/Pref ratio.
"""
# global idxQueue
# thr_idx = idxQueue.get()
# setproctitle(f"Worker {thr_idx+1} : Compute ratio from {fpath}")
setproctitle("Compute ratio from csv")
df = pd.read_csv(fpath)
del df['Unnamed: 0']
del df['Unnamed: 0.1']
@@ -401,9 +472,8 @@ def compute_ratio_from_csv(fpath, avg_file, qch_file):
s_tot=sum(sommes)
for s in sommes:
pref_avg_list.append(s/s_tot)
print(sommes)
# return
# pbar = tqdm(total=df.shape[0], position=thr_idx, desc="Preparing "+avg_file, leave=False)
df_bis=df.copy()
# if method=averaging
for i in range(df.shape[0]):
@@ -412,14 +482,7 @@ def compute_ratio_from_csv(fpath, avg_file, qch_file):
df_bis.iloc[i,j]=df.iloc[i,j]/(sum(df.iloc[i, k] for k in range(1, df.shape[1])))
# ratio between the observed probability and the reference probability
df_bis.iloc[i,j]=df_bis.iloc[i,j]/pref_avg_list[j-1]
# pbar.update(1)
# idxQueue.put(thr_idx) # replace the thread index in the queue
# setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
print(df_bis)
# thr_idx = idxQueue.get()
# setproctitle(f"Worker {thr_idx+1} : Compute ratio from {fpath}")
# pbar = tqdm(total=df.shape[0], position=thr_idx, desc="Preparing "+qch_file, leave=False)
# if method=quasi-chemical
df_ter=df.copy()
@@ -430,20 +493,18 @@ def compute_ratio_from_csv(fpath, avg_file, qch_file):
# calculate the mole fractions of the atom corresponding to the pair in row i
if atom_count.at[k, 'atom']==df.iloc[i, 0].split('-')[0] or atom_count.at[k, 'atom']==df.iloc[i, 0].split('-')[1]:
x.append(atom_count.at[k, 'mole_fraction'])
# print(x)
for j in range(1, df.shape[1]):
if len(x)==2:
df_ter.iloc[i, j]=df.iloc[i,j]/(x[0]*x[1]*sommes[j-1]) # ratio for qchA method (Nijobs(r)/xi*xj*Nobs(r))
if len(x)==1:
df_ter.iloc[i, j]=df.iloc[i,j]/(x[0]*x[0]*sommes[j-1])
# pbar.update(1)
# idxQueue.put(thr_idx) # replace the thread index in the queue
# setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
print(df_ter)
# add what is in the save_into_database function here?
df_bis.to_csv(runDir + '/results/statistical_potential/all-atoms/' + avg_file)
df_ter.to_csv(runDir + '/results/statistical_potential/all-atoms/' + qch_file)
df_bis.to_csv(runDir + '/results/statistical_potential/ratio/all-atoms/' + avg_file)
df_ter.to_csv(runDir + '/results/statistical_potential/ratio/all-atoms/' + qch_file)
setproctitle("Compute ratio from csv finished")
@trace_unhandled_exceptions
def sql_new_table(conn):
cur = conn.cursor()
sql = """ CREATE TABLE IF NOT EXISTS all_atoms (
@@ -482,10 +543,11 @@ def sql_new_table(conn):
conn.execute("pragma journal_mode=wal")
#conn.close()
@trace_unhandled_exceptions
def save_into_database():
df_avg = pd.read_csv(runDir + '/results/statistical_potential/all-atoms/avg_ratio_pobs_pref.csv')
df_qch = pd.read_csv(runDir + '/results/statistical_potential/all-atoms/qch_ratio_pobs_pref.csv')
setproctitle("Saving statistical potentials(avg, qch) into database")
df_avg = pd.read_csv(runDir + '/results/statistical_potential/ratio/all-atoms/avg_ratio_pobs_pref.csv')
df_qch = pd.read_csv(runDir + '/results/statistical_potential/ratio/all-atoms/qch_ratio_pobs_pref.csv')
ratio_list=[]
del df_avg['Unnamed: 0']
del df_qch['Unnamed: 0']
@@ -495,7 +557,7 @@ def save_into_database():
ratio_list.append([df_avg.iloc[i,0], df_avg.columns[j], df_avg.iloc[i, j], df_qch.iloc[i,j]])
with sqlite3.connect(runDir + "/results/Stats.db", timeout=20.0) as conn:
with sqlite3.connect(runDir + "/results/Stat_potential.db", timeout=20.0) as conn:
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
# We use the REPLACE keyword to get the latest information
sql_execute(conn, """INSERT OR REPLACE INTO all_atoms (atom_pair, distance_bin, avg_ratio_pobs_pref, qch_ratio_pobs_pref )
@@ -503,12 +565,15 @@ def save_into_database():
many=True,
data=ratio_list
)
def stat_potential(pair, f, method):
setproctitle("Saving statistical potentials(avg, qch) into database finished")
@trace_unhandled_exceptions
def stat_potential(f, method):
df=pd.read_csv(f)
del df['Unnamed: 0']
df.set_index('atom_pair_type', inplace=True)
df=df.T
setproctitle(f"RNANet statistics.py stat_potential({method})")
for pair in df.columns:
c=df[pair].tolist()
new=[]
for x in c:
@@ -528,8 +593,6 @@ def stat_potential(pair, f, method):
abs.append(i+5)
abs.append(150)
abs.append(300)
print(new)
print(abs)
x=abs
y=new
@@ -539,36 +602,12 @@ def stat_potential(pair, f, method):
plt.xlabel('Distance')
plt.ylabel("- ln(Pobs/Pref)")
plt.title(f'Statistical potential of {pair} distance ({method} method)')
plt.savefig(f"/home/atabot/RNANet/results/statistical_potential/all-atoms/{method}_statistical_pot_{pair}.png")
plt.savefig(runDir + f"/results/statistical_potential/figures/all-atoms/{method}_method/{method}_statistical_pot_{pair}.png")
plt.close()
# make the choice of method an option?
setproctitle(f"RNANet statistics.py stat_potential({method}) finished")
if __name__ == "__main__":
os.makedirs(runDir + '/results/statistical_potential/all-atoms/', exist_ok=True)
# compute_ratio_from_csv('/home/atabot/RNANet/results/geometry/all-atoms/distances_classes/occur_dist_classes.csv', 'avg_ratio_pobs_pref.csv', 'qch_ratio_pobs_pref.csv')
# exit(1)
with sqlite3.connect(runDir + '/results/Stats.db') as conn:
sql_new_table(conn)
save_into_database()
exit(1)
# count_occur_atom_dist(runDir + '/results/geometry/all-atoms/distances_classes/', 'occur_dist_classes.csv')
# os.makedirs(runDir + '/results/figures/all-atoms/distances/hist/', exist_ok=True)
# histo_occur(runDir + '/results/geometry/all-atoms/distances_classes/occur_dist_classes.csv')
# exit(1)
mole_fraction(runDir + '/results/geometry/all-atoms/atom_count/', 'atom_count.csv')
exit(1)
# print(measure_from_structure(os.listdir(path_to_3D_data + "rna_only")[0]))
if n_unmapped_chains:
# os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True)
os.makedirs(runDir+"/results/geometry/all-atoms/distances_classes/", exist_ok=True)
os.makedirs(runDir+"/results/geometry/all-atoms/atom_count/", exist_ok=True)
# liste_struct = os.listdir(path_to_3D_data + "rna_only")
liste_struct=liste_repres('/home/data/RNA/3D/latest_nr_list_4.0A.csv')[:100]
# measure_from_structure(liste_struct[0])
# exit(1)
#measure_from_structure('5e81_1_2K.cif')
#exit(1)
# concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances.csv')
\ No newline at end of file
print("This file is not supposed to be run directly. Run statistics.py instead.")
statistics.py
@@ -38,6 +38,7 @@ LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam
SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
from geometric_stats import * # after definition of the variables
from statistical_potential import *
@trace_unhandled_exceptions
def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
@@ -1181,13 +1182,16 @@ def measure_from_structure(f):
s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f))
#pyle_measures(name, s, thr_idx)
measures_aa(name, s, thr_idx)
# measures_aa(name, s, thr_idx)
if DO_HIRE_RNA_MEASURES:
measures_hrna(name, s, thr_idx)
measures_hrna_basepairs(name, s, path_to_3D_data, thr_idx)
if DO_WADLEY_ANALYSIS:
measures_pyle(name, s, thr_idx)
if DO_STAT_POTENTIAL_MEASURES:
# measures_heavy_atoms(name, s, thr_idx)
if DO_WADLEY_ANALYSIS:
pyle_measures_for_potentials(name, s, thr_idx)
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
@@ -1335,10 +1339,11 @@ if __name__ == "__main__":
REDUNDANT_DIST_MAT = True
DO_HIRE_RNA_MEASURES = False
RESCAN_GMM_COMP_NUM = False
DO_STAT_POTENTIAL_MEASURES = False
try:
opts, _ = getopt.getopt( sys.argv[1:], "r:h",
[ "help", "from-scratch", "wadley", "distance-matrices", "non-redundant", "resolution=",
"3d-folder=", "seq-folder=", "hire-rna", "rescan-nmodes" ])
"3d-folder=", "seq-folder=", "hire-rna", "rescan-nmodes", "stat-potential" ])
except getopt.GetoptError as err:
print(err)
sys.exit(2)
@@ -1364,6 +1369,7 @@ if __name__ == "__main__":
print("--wadley\t\t\tReproduce Wadley & al 2007 clustering of pseudotorsions.")
print("--hire-rna\t\t\tCompute distances between atoms and torsion angles for HiRE-RNA model,\n\t\t\t\t and plot GMMs on the data.")
print("--rescan-nmodes\t\t\tDo not assume the number of modes in distances and angles distributions, measure it.")
print("--stat-potential\t\t\tCompute statistical potentials based on averaging and quasi-chemical approximation methods.")
sys.exit()
elif opt == "--version":
print("RNANet statistics 1.6 beta")
@@ -1407,7 +1413,14 @@ if __name__ == "__main__":
RESCAN_GMM_COMP_NUM = True
elif opt == "--non-redundant":
REDUNDANT_DIST_MAT = False
elif opt == "--stat-potential":
DO_STAT_POTENTIAL_MEASURES = True
os.makedirs(runDir + "/results/geometry/Pyle/distances_i_i+1/", exist_ok=True)
os.makedirs(runDir + "/results/geometry/all-atoms/distances_classes/", exist_ok=True)
os.makedirs(runDir + "/results/geometry/all-atoms/atom_count/", exist_ok=True)
os.makedirs(runDir + "/results/statistical_potential/ratio/Pyle/", exist_ok=True)
os.makedirs(runDir + "/results/statistical_potential/ratio/all-atoms/", exist_ok=True)
os.makedirs(runDir + "/results/statistical_potential/figures/all-atoms/", exist_ok=True)
# Load mappings. famlist will contain only families with structures at this resolution threshold.
print("Loading mappings list...")
@@ -1420,19 +1433,19 @@ if __name__ == "__main__":
WHERE issue = 0 AND resolution <= {res_thr} AND rfam_acc != 'unmappd'
GROUP BY rfam_acc;
""", conn)
families.drop(families[families.n_chains == 0].index, inplace=True)
mappings_list = {}
for k in families.rfam_acc:
mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"""SELECT chain_id
FROM chain JOIN structure ON chain.structure_id=structure.pdb_id
WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};""") ]
famlist = families.rfam_acc.tolist()
ignored = families[families.n_chains < 3].rfam_acc.tolist()
famlist.sort(key=family_order)
print(f"Found {len(famlist)} families with chains or better.")
if len(ignored):
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
# families.drop(families[families.n_chains == 0].index, inplace=True)
# mappings_list = {}
# for k in families.rfam_acc:
# mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"""SELECT chain_id
# FROM chain JOIN structure ON chain.structure_id=structure.pdb_id
# WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};""") ]
# famlist = families.rfam_acc.tolist()
# ignored = families[families.n_chains < 3].rfam_acc.tolist()
# famlist.sort(key=family_order)
# print(f"Found {len(famlist)} families with chains or better.")
# if len(ignored):
# print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
if DELETE_OLD_DATA:
for f in famlist:
@@ -1456,9 +1469,9 @@ if __name__ == "__main__":
joblist = []
# Do eta/theta plots
if n_unmapped_chains and DO_WADLEY_ANALYSIS:
joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
# if n_unmapped_chains and DO_WADLEY_ANALYSIS:
# joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
# joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
# Do distance matrices for each family excl. LSU/SSU (will be processed later)
if DO_AVG_DISTANCE_MATRIX:
@@ -1474,23 +1487,23 @@ if __name__ == "__main__":
joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, res_thr, False, REDUNDANT_DIST_MAT, False)))
# Do general family statistics
joblist.append(Job(function=stats_len)) # Computes figures about chain lengths
joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families)
for f in famlist:
joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database (intra-chain basepair types within a family)
if f not in ignored:
joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families)
# joblist.append(Job(function=stats_len)) # Computes figures about chain lengths
# joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families)
# for f in famlist:
# joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database (intra-chain basepair types within a family)
# if f not in ignored:
# joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families)
# Do geometric measures
if n_unmapped_chains:
os.makedirs(runDir + "/results/geometry/all-atoms/distances/", exist_ok=True)
structure_list = representatives_from_nrlist(res_thr)
for f in structure_list:
for f in structure_list[:10]:
if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]):
joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances
process_jobs(joblist)
# process_jobs(joblist)
# Now process the memory-heavy tasks family by family
if DO_AVG_DISTANCE_MATRIX:
@@ -1511,31 +1524,55 @@ if __name__ == "__main__":
# finish the work after the parallel portions
per_chain_stats() # per chain base frequencies and basepair types
seq_idty() # identity matrices from pre-computed .npy matrices
stats_pairs()
# per_chain_stats() # per chain base frequencies and basepair types
# seq_idty() # identity matrices from pre-computed .npy matrices
# stats_pairs()
if n_unmapped_chains:
general_stats()
# general_stats()
os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True)
os.makedirs(runDir+"/results/geometry/json/", exist_ok=True)
concat_dataframes(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv', nworkers)
# concat_dataframes(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv', nworkers)
if DO_HIRE_RNA_MEASURES:
concat_dataframes(runDir + '/results/geometry/HiRE-RNA/distances/', 'distances_HiRERNA.csv', nworkers)
concat_dataframes(runDir + '/results/geometry/HiRE-RNA/angles/', 'angles_HiRERNA.csv', nworkers)
concat_dataframes(runDir + '/results/geometry/HiRE-RNA/torsions/', 'torsions_HiRERNA.csv', nworkers)
concat_dataframes(runDir + '/results/geometry/HiRE-RNA/basepairs/', 'basepairs_HiRERNA.csv', nworkers)
if DO_WADLEY_ANALYSIS:
concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances_pyle.csv', nworkers)
concat_dataframes(runDir + '/results/geometry/Pyle/angles/', 'flat_angles_pyle.csv', nworkers)
# if DO_WADLEY_ANALYSIS:
# concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances_pyle.csv', nworkers)
# concat_dataframes(runDir + '/results/geometry/Pyle/angles/', 'flat_angles_pyle.csv', nworkers)
# if DO_STAT_POTENTIAL_MEASURES:
# with sqlite3.connect(runDir + '/results/Stat_potential.db') as conn:
# sql_new_table(conn)
# joblist=[]
# joblist.append(Job(function=count_occur_atom_dist, args=(runDir + '/results/geometry/all-atoms/distances_classes/', 'occur_dist_classes.csv')))
# joblist.append(Job(function=mole_fraction, args=(runDir + '/results/geometry/all-atoms/atom_count/', 'atom_count.csv')))
# process_jobs(joblist)
# compute_ratio_from_csv(runDir + '/results/geometry/all-atoms/distances_classes/occur_dist_classes.csv', 'avg_ratio_pobs_pref.csv', 'qch_ratio_pobs_pref.csv')
# if DO_WADLEY_ANALYSIS:
# concat_dataframes(runDir + '/results/geometry/Pyle/distances_i_i+1/', 'distances.csv', nworkers)
joblist = []
joblist.append(Job(function=gmm_aa_dists, args=(RESCAN_GMM_COMP_NUM,)))
joblist.append(Job(function=gmm_aa_torsions, args=(RESCAN_GMM_COMP_NUM, res_thr)))
# joblist.append(Job(function=gmm_aa_dists, args=(RESCAN_GMM_COMP_NUM,)))
# joblist.append(Job(function=gmm_aa_torsions, args=(RESCAN_GMM_COMP_NUM, res_thr)))
if DO_HIRE_RNA_MEASURES:
joblist.append(Job(function=gmm_hrna, args=(RESCAN_GMM_COMP_NUM,)))
joblist.append(Job(function=gmm_hrna_basepairs, args=(RESCAN_GMM_COMP_NUM,)))
# if DO_WADLEY_ANALYSIS:
# joblist.append(Job(function=gmm_pyle, args=(RESCAN_GMM_COMP_NUM, res_thr)))
if DO_STAT_POTENTIAL_MEASURES:
methods = ['avg', 'qch']
os.makedirs(runDir + "/results/statistical_potential/figures/all-atoms/avg_method/", exist_ok=True)
os.makedirs(runDir + "/results/statistical_potential/figures/all-atoms/qch_method/", exist_ok=True)
# save_into_database()
# for method in methods:
# joblist.append(Job(function=stat_potential, args=(runDir + '/results/statistical_potential/ratio/all-atoms/' + method + '_ratio_pobs_pref.csv', method)))
if DO_WADLEY_ANALYSIS:
joblist.append(Job(function=gmm_pyle, args=(RESCAN_GMM_COMP_NUM, res_thr)))
process_jobs(joblist)
merge_jsons(DO_HIRE_RNA_MEASURES)
os.makedirs(runDir + "/results/statistical_potential/figures/Pyle/", exist_ok=True)
os.makedirs(runDir + "/results/statistical_potential/json/Pyle/", exist_ok=True)
# joblist.append(Job(function=gmm_pyle_per_type, args=(RESCAN_GMM_COMP_NUM,)))
# gmm_pyle_per_type(RESCAN_GMM_COMP_NUM)
stat_potential_pyle()
# process_jobs(joblist)
# merge_jsons(DO_HIRE_RNA_MEASURES)