Aglaé TABOT

connections between statistics.py and statistical_potential.py and some code modifications and optimizations
#!/usr/bin/python3
# RNANet statistics
# Developed by Aglaé Tabot, 2021
# This file computes statistical potentials over the produced dataset.
# THIS FILE IS NOT SUPPOSED TO BE RUN DIRECTLY.
import getopt, os, pickle, sqlite3, shlex, subprocess, sys, warnings
import time
@@ -13,88 +20,37 @@ from Bio.PDB.vectors import Vector, calc_angle, calc_dihedral
from multiprocessing import Pool, Manager
from os import path
from tqdm import tqdm
from collections import Counter
from setproctitle import setproctitle
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_with_tqdm, trace_unhandled_exceptions
from sklearn.mixture import GaussianMixture
import scipy.stats as st
import warnings
from pandas.core.common import SettingWithCopyWarning
from itertools import combinations_with_replacement
from math import *
# from geometric_stats import get_euclidian_distance
from geometric_stats import get_euclidian_distance, GMM_histo
np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8)
runDir = os.getcwd()
def liste_repres(fpath):
repres=[]
df=pd.read_csv(os.path.abspath(fpath))
for i in range(df.shape[0]):
up_name=df["representative"][i]
if '+' in up_name:
up_name=up_name.split('+')
for i in range(len(up_name)):
chain=up_name[i].split('|')
chain=chain[0].lower()+'_'+chain[1]+'_'+chain[2]
repres.append(chain+'.cif')
else :
up_name=up_name.split('|')
low_name=up_name[0].lower()+'_'+up_name[1]+'_'+up_name[2]
repres.append(low_name+'.cif')
return repres
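# Example of the parsing above: a "representative" field like "1A2B|1|A+1A2B|1|B"
# expands to ["1a2b_1_A.cif", "1a2b_1_B.cif"], and a single entry like "4V9F|1|0"
# maps to "4v9f_1_0.cif" (hypothetical identifiers, for illustration).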
def measure_from_structure(f): # or should this be launched from statistics.py?
"""
Do geometric measures required on a given filename
"""
name = f.split('.')[0]
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measure_from_structure({f})")
# Open the structure
with warnings.catch_warnings():
# Ignore the PDB problems. This mostly warns that some chain is discontinuous.
warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
parser=MMCIFParser()
s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f))
measures_heavy_atoms(name, s, thr_idx)
if DO_WADLEY_ANALYSIS:
pyle_measures(name, s, thr_idx)
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
def pyle_measures(name, s, thr_idx):
@trace_unhandled_exceptions
def pyle_measures_for_potentials(name, s, thr_idx):
# measure distances P-P, P-C1', P-C4', C1'-C1', C4'-C4'
# between residues
# along the chain
if (path.isfile(runDir + '/results/geometry/Pyle/distances/distances_pyle_'+name+'.csv')):
# between residues along the chain
# Requires a lot of storage space!
if (path.isfile(runDir + '/results/geometry/Pyle/distances_i_i+1/distances_pyle_i_i+1'+name+'.csv')):
return
liste_dist=[]
#classes=[]
#for i in range(0, 150, 5):
#classes.append([i, i+5])
#classes.append([150, 300])
#occur_p_p=len(classes)*[0]
#occur_p_c1=len(classes)*[0]
#occur_p_c4=len(classes)*[0]
#occur_c1_c1=len(classes)*[0]
#occur_c4_c4=len(classes)*[0]
#nb_occurs=[]
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} pyle_measures({name})")
chain = next(s[0].get_chains())
#residues=list(chain.get_residues())
for res1 in tqdm(chain, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} pyle_measures", unit="res", leave=False):
#res1=chain[i]
if res1.get_resname() in ["A", "C", "G", "U"]:
resnum1=list(res1.get_id())[1]
atom_p_1 = [ atom.get_coord() for atom in res1 if atom.get_name() == "P"]
@@ -109,7 +65,7 @@ def pyle_measures(name, s, thr_idx):
p_c1p=np.nan
c4p_c4p=np.nan
c1p_c1p=np.nan
#res2=chain[j]
if res2.get_resname() in ["A", "C", "G", "U"]:
atom_p_2 = [ atom.get_coord() for atom in res2 if atom.get_name() == "P"]
@@ -123,94 +79,218 @@ def pyle_measures(name, s, thr_idx):
c1p_c1p= get_euclidian_distance(atom_c1p_1, atom_c1p_2)
liste_dist.append([res1.get_resname(), int(resnum1), res2.get_resname(), int(resnum2), p_p, p_c4p, p_c1p, c4p_c4p, c1p_c1p])
'''
for x in range(len(classes)):
if classes[x][0] <= p_p <= classes[x][1]:
occur_p_p[x]=occur_p_p[x]+1
if classes[x][0] <= p_c4p <= classes[x][1]:
occur_p_c4[x]=occur_p_c4[x]+1
if classes[x][0] <= p_c1p <= classes[x][1]:
occur_p_c1[x]=occur_p_c1[x]+1
if classes[x][0] <= c4p_c4p <= classes[x][1]:
occur_c4_c4[x]=occur_c4_c4[x]+1
if classes[x][0] <= c1p_c1p <= classes[x][1]:
occur_c1_c1[x]=occur_c1_c1[x]+1
'''
#for x in range(len(classes)):
# for i in range(len(liste_dist)):
# if classes[x][0] <= liste_dist[i][4] <= classes[x][1]:
# occur_p_p[x]=occur_p_p[x]+1
# if classes[x][0] <= liste_dist[i][5] <= classes[x][1]:
# occur_p_c4[x]=occur_p_c4[x]+1
# if classes[x][0] <= liste_dist[i][6] <= classes[x][1]:
# occur_p_c1[x]=occur_p_c1[x]+1
# if classes[x][0] <= liste_dist[i][7] <= classes[x][1]:
# occur_c4_c4[x]=occur_c4_c4[x]+1
# if classes[x][0] <= liste_dist[i][8] <= classes[x][1]:
# occur_c1_c1[x]=occur_c1_c1[x]+1
#nb_occurs.append([classes[x], occur_p_p[x], occur_p_c1[x], occur_p_c4[x], occur_c1_c1[x], occur_c4_c4[x]])
#df = pd.DataFrame(nb_occurs, columns=["classe", "P-P", "P-C1'", "P-C4'", "C1'-C1'", "C4'-C4'"])
# return df
# nb_occurs.append([classes, occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4])
# print(nb_occurs)
# return nb_occurs
df = pd.DataFrame(liste_dist, columns=["res1", "resnum1", "res2", "resnum2", "P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"])
df.to_csv(runDir + "/results/geometry/Pyle/distances/distances_pyle_" + name + ".csv")
df.to_csv(runDir + "/results/geometry/Pyle/distances_i_i+1/distances_pyle_i_i+1" + name + ".csv")
def gmm_pyle_type(ntpair, data):
@trace_unhandled_exceptions
def gmm_pyle_type(ntpair, data, scan):
setproctitle(f"GMM (Pyle {ntpair} )")
os.makedirs(runDir + "/results/figures/GMM/Pyle/distances/", exist_ok=True)
os.chdir(runDir + "/results/figures/GMM/Pyle/distances/")
os.makedirs(runDir + "/results/figures/GMM/Pyle/distances_i_i+1/", exist_ok=True)
os.chdir(runDir + "/results/figures/GMM/Pyle/distances_i_i+1/")
p_p=list(data["P-P"][~ np.isnan(data["P-P"])])
p_c4p=list(data["P-C4'"][~ np.isnan(data["P-C4'"])])
p_c1p=list(data["P-C1'"][~ np.isnan(data["P-C1'"])])
c4p_c4p=list(data["C4'-C4'"][~ np.isnan(data["C4'-C4'"])])
c1p_c1p=list(data["C1'-C1'"][~ np.isnan(data["C1'-C1'"])])
#print(len(p_p))
# res2=list(data["resnum2"])
# res1=list(data["resnum1"])
# diff=[]
# for i in range(len(res1)):
# diff.append(res2[i]-res1[i])
# print(diff[:100])
GMM_histo(p_p, f"Distance P-P between {ntpair}", toric=False, hist=False, col="cyan")
GMM_histo(p_c4p, f"Distance P-C4' between {ntpair}", toric=False, hist=False, col="tomato")
GMM_histo(p_c1p, f"Distance P-C1' between {ntpair}", toric=False, hist=False, col="goldenrod")
GMM_histo(c4p_c4p, f"Distance C4'-C4' between {ntpair}", toric=False, hist=False, col="magenta")
GMM_histo(c1p_c1p, f"Distance C1'-C1' between {ntpair}", toric=False, hist=False, col="black")
# GMM_histo(diff, f"Gap between {ntpair} ", toric=False, hist=False, col="tomato")
GMM_histo(p_p, f"Distance P-P between {ntpair}", scan, toric=False, hist=False, col="cyan")
GMM_histo(p_c4p, f"Distance P-C4' between {ntpair}", scan, toric=False, hist=False, col="tomato")
GMM_histo(p_c1p, f"Distance P-C1' between {ntpair}", scan, toric=False, hist=False, col="goldenrod")
GMM_histo(c4p_c4p, f"Distance C4'-C4' between {ntpair}", scan, toric=False, hist=False, col="magenta")
GMM_histo(c1p_c1p, f"Distance C1'-C1' between {ntpair}", scan, toric=False, hist=False, col="black")
plt.xlabel("Distance (Angströms)")
# plt.xlabel("Number of residues")
#plt.ylabel("Distance (Angströms)")
plt.title(f"GMM of distances for {ntpair} ", fontsize=10)
# plt.savefig(f"Longueurs_Pyle_{ntpair}.png" )
plt.savefig(f"Distances_Pyle_{ntpair}.png" )
plt.savefig(runDir + "/results/figures/GMM/Pyle/distances_i_i+1/" + f"Distances_Pyle_{ntpair}.png" )
plt.close()
setproctitle(f"GMM (Pyle {ntpair} distances) finished")
def gmm_pyle():
@trace_unhandled_exceptions
def gmm_pyle_per_type(scan):
setproctitle("GMM (Pyle model)")
df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/distances/distances.csv"))
df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/distances_i_i+1/distances.csv"))
# dist = ["P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"]
data=df
if len(data):
for b1 in ['A','C','G','U']:
for b2 in ['A','C','G','U']:
thisbases = data[(data.res1 == b1)&(data.res2 == b2)]
if len(thisbases):
gmm_pyle_type(b1+b2, thisbases)
gmm_pyle_type(b1+b2, thisbases, scan)
setproctitle("GMM (Pyle model) finished")
# The next 7 functions are used to calculate the statistical potentials with the averaging method (residue-averaging)
# for the Pyle model from the GMM results, and to plot the corresponding figures
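# Assumed call order, as driven by stat_potential_pyle() below: averaging()
# summarizes the GMM components of each measure into reference parameters,
# potential_dist() bins observed vs. reference probabilities and takes
# -ln(Pobs/Pref), and courbe_stat_pot() plots the resulting pseudo-energy curve.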
def pgaussred(x):
"""
Cumulative distribution function of the standard normal distribution
(= the probability that a standard normal random variable is less than x).
Computed by numerical integration with 2000 slices per standard deviation;
the result is rounded to 7 decimal places.
"""
if x==0:
return 0.5
u=abs(x)
n=int(u*2000)
du=u/n
k=1/sqrt(2*pi)
u1=0
f1=k
p=0.5
for i in range(0,n):
u2=u1+du
f2=k*exp(-0.5*u2*u2)
p=p+(f1+f2)*du*0.5
u1=u2
f1=f2
if x<0:
p = 1.0-p
return round(p, 7)
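# A quick cross-check sketch (not part of the pipeline): math.erf gives the
# same CDF in closed form; erf and sqrt are available via `from math import *`
# above, and the helper name _pgaussred_check is hypothetical.
def _pgaussred_check(x):
    # Phi(x) = (1 + erf(x / sqrt(2))) / 2 for the standard normal distribution
    return round(0.5 * (1.0 + erf(x / sqrt(2))), 7)
# e.g. pgaussred(1.0) and _pgaussred_check(1.0) both give 0.8413447.
# Caveat: pgaussred raises ZeroDivisionError for 0 < |x| < 0.0005,
# since n = int(u*2000) is then 0.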
def proba(m, s, xinf, xsup):
"""
Computes the probability that a value drawn from a normal distribution
with mean m and standard deviation s falls in the interval [xinf, xsup].
"""
prob=pgaussred((xsup-m)/s)-pgaussred((xinf-m)/s)
return prob
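# Example: for the standard normal (m=0, s=1), proba(0, 1, -1, 1) returns
# ~0.6826895, the familiar ~68% of the mass within one standard deviation.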
def extract_from_json(data, xinf, xsup):
"""
Extracts the means and standard deviations from the JSON produced by the GMM
calculations, then computes from these parameters the probability that a value
falls in the interval [xinf, xsup].
"""
p=[]
p_tot=0
classes=[]
with open(data + '.json') as json_data:
data_dict = json.load(json_data)
for i in range(len(data_dict['means'])):
mean = data_dict['means'][i]
if mean[0] == '[':
mean=float((mean.split('['))[1].split(']')[0])
else :
mean=float(mean)
std = data_dict['std'][i]
if std[0] == '[':
std=float((std.split('['))[2].split(']')[0])
else:
std=float(std)
prob=proba(mean, std, xinf, xsup)
p.append(prob)
for x in p:
p_tot = p_tot+x
return p_tot
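# Assumed input format: the JSON read here follows the GMM_histo output, where
# entries of "means" and "std" may be plain numbers ("4.52") or stringified
# numpy arrays ("[4.52]", "[[0.33]]"), hence the bracket stripping above.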
def averaging(liste_data, name):
"""
Creates a JSON containing all the means and standard deviations of the
parameters used to build the reference distribution with the averaging method.
"""
curves=[]
x = np.linspace(0,250,1000)
summary_data = {}
summary_data["measures"] = []
summary_data["means"] = []
summary_data["std"] = []
for data in liste_data:
s=0
mean_tot=0
std_tot=0
with open(runDir + '/results/geometry/json/' + data + '.json') as json_data:
data_dict = json.load(json_data)
for i in range(len(data_dict['means'])):
mean = float(data_dict['means'][i])
std = float(data_dict['std'][i])
weight = float(data_dict['weights'][i])
y = weight*st.norm.pdf(x, mean, std)
mean_tot=mean_tot+(weight*mean)
std_tot=std_tot+(weight*std)
curves.append(y)
summary_data["measures"].append(data)
summary_data["means"].append(str(mean_tot))
summary_data["std"].append(str(std_tot))
with open(runDir + '/results/statistical_potential/json/Pyle/avg_' +name + ".json", 'w', encoding='utf-8') as f:
json.dump(summary_data, f, indent=4)
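# Note: mean_tot, the weighted sum of component means, is the exact mean of
# the Gaussian mixture, but the weighted sum of component stds is only an
# approximation: the exact mixture std would be
# sqrt(sum_i w_i*(std_i**2 + mean_i**2) - mean_tot**2).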
def potential_dist(data_obs, data_ref):
name=data_obs.split('/')[-1]
l=[]
for k in range(0, 400, 5):
obs=extract_from_json(data_obs, k, k+5)
ref=extract_from_json(data_ref, k, k+5)
if obs != 0.0 and ref != 0.0:
u = -log(obs/ref)
l.append([k, k+5, obs, ref, u])
l.append([k+5, k+5, obs, ref, u])
else:
l.append([k, k+5, obs, ref, None])
l.append([k+5, k+5, obs, ref, None])
df=pd.DataFrame(l, columns=["Xinf", "Xsup", "Pobs", "Pref", "- ln(Pobs/Pref)"])
df.to_csv(runDir + '/results/statistical_potential/ratio/Pyle/Statistical potential ' + name + ".csv")
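# -ln(Pobs/Pref) is the classic inverse-Boltzmann pseudo-energy (in kT units):
# bins where the observed probability exceeds the reference get a negative,
# i.e. favorable, value. Example: Pobs=0.10 and Pref=0.05 give u = -ln(2) ≈ -0.69.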
def courbe_stat_pot(f):
name=f.split('/')[-1]
name=name.split('.')[0]
df=pd.read_csv(f)
E=list(df["- ln(Pobs/Pref)"][~ np.isnan(df["- ln(Pobs/Pref)"])])
max_E=max(E)
min_E=min(E)
index_with_nan=df.index[df.iloc[:,5].isnull()]
for i in index_with_nan:
df.iloc[i, 5]=max_E # replace the NaN by the maximum energy found
y=list(df["- ln(Pobs/Pref)"])
x=list(df["Xinf"])
x=np.array(x)
y=np.array(y)
plt.plot(x, y)
plt.xlabel("Distance (Angström)")
plt.ylabel("- ln(Pobs/Pref)")
plt.title(name)
plt.savefig(runDir + "/results/statistical_potential/figures/Pyle/avg_statistical_pot_ " + name + ".png")
plt.close()
def stat_potential_pyle():
pyle = ["P-P", "P-C1'", "P-C4'", "C1'-C1'", "C4'-C4'"]
nt = ["A", "C", "G", "U"]
for pair in pyle:
type_list=[]
for res1 in nt:
for res2 in nt:
setproctitle(f"RNANet statistics.py stat_potential_pyle({pair}, {res1}{res2})")
type_list.append(f"Distance {pair} between {res1}{res2}")
averaging(type_list, f"{pair}")
for dist in type_list:
potential_dist(runDir + '/results/geometry/json/' + dist, runDir + f'/results/statistical_potential/json/Pyle/avg_{pair}')
courbe_stat_pot(runDir + '/results/statistical_potential/ratio/Pyle/Statistical potential ' + dist + '.csv')
setproctitle(f"RNANet statistics.py stat_potential_pyle({pair}, {res1}{res2}) finished")
@trace_unhandled_exceptions
def measures_heavy_atoms(name, s, thr_idx):
"""
Creates the list of all possible atom pairs among the 85 atom types
@@ -312,7 +392,7 @@ def measures_heavy_atoms(name, s, thr_idx):
df_occur = pd.DataFrame(occurences, columns=["atom_pair_type"] + classes_str)
# save this
df_occur.to_csv(runDir + "/results/geometry/all-atoms/distances_classes/distances_classes_occur_" + name + ".csv")
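# With 85 atom types, combinations_with_replacement yields 85*86/2 = 3655
# distinct atom-pair types, i.e. one row of df_occur per pair.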
@trace_unhandled_exceptions
def count_occur_atom_dist(fpath, outfilename):
"""
After having calculated the number of occurrences of the distances between pairs of atoms
@@ -320,13 +400,9 @@ def count_occur_atom_dist(fpath, outfilename):
we add these occurrences one by one to obtain a single dataframe with the total number of occurrences
"""
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"Worker {thr_idx+1} : Addition of occurences of {fpath}")
setproctitle(f"Addition of occurences of {fpath}")
liste=os.listdir(fpath)
pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing "+outfilename, leave=False)
df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop()))
for f in range(len(liste)):
@@ -334,23 +410,20 @@ def count_occur_atom_dist(fpath, outfilename):
for i in range(df.shape[0]):
for j in range(2, df.shape[1]):
df_tot.iloc[i, j]=df_tot.iloc[i, j] + df.iloc[i, j]
pbar.update(1)
df_tot.to_csv(fpath+outfilename)
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
setproctitle(f"Addition of occurences of {fpath} finished")
@trace_unhandled_exceptions
def mole_fraction(fpath, outfilename):
"""
Calculation of the mole fraction of each type of atom within the set of structures
"""
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"Worker {thr_idx+1} : Calculation of mole fractions of {fpath}")
setproctitle(f"Calculation of mole fractions of {fpath}")
liste=os.listdir(fpath)
pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing "+outfilename, leave=False)
df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop()))
del df_tot["Unnamed: 0"]
@@ -359,31 +432,29 @@ def mole_fraction(fpath, outfilename):
del df["Unnamed: 0"]
for i in range(df.shape[0]):
df_tot.iloc[i, 1]=df_tot.iloc[i, 1] + df.iloc[i, 1]
pbar.update(1)
total=sum(list(df_tot["count"]))
fract=[]
for i in range(df_tot.shape[0]):
# df_tot.iloc[i, 1]=df_tot.iloc[i, 1]/total
fract.append(df_tot.iloc[i, 1]/total)
df_tot["mole_fraction"]=fract
file_list=os.listdir(fpath)
file_list=[fpath+x for x in file_list]
# file_list=os.listdir(fpath)
# file_list=[fpath+x for x in file_list]
# for f in file_list: #after processing, deletion of csv by structure
# os.remove(f)
df_tot.to_csv(fpath+outfilename) # or just a return?
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
df_tot.to_csv(fpath+outfilename)
setproctitle(f"Calculation of mole fractions of {fpath} finished")
@trace_unhandled_exceptions
def compute_ratio_from_csv(fpath, avg_file, qch_file):
"""
Computes the observed and reference probabilities according to the methods
chosen to establish the reference state, then computes the Pobs/Pref ratio.
"""
# global idxQueue
# thr_idx = idxQueue.get()
# setproctitle(f"Worker {thr_idx+1} : Compute ratio from {fpath}")
setproctitle("Compute ratio from csv")
df = pd.read_csv(fpath)
del df['Unnamed: 0']
del df['Unnamed: 0.1']
@@ -401,9 +472,8 @@ def compute_ratio_from_csv(fpath, avg_file, qch_file):
s_tot=sum(sommes)
for s in sommes:
pref_avg_list.append(s/s_tot)
print(sommes)
# return
# pbar = tqdm(total=df.shape[0], position=thr_idx, desc="Preparing "+avg_file, leave=False)
df_bis=df.copy()
# if method=averaging
for i in range(df.shape[0]):
@@ -412,14 +482,7 @@ def compute_ratio_from_csv(fpath, avg_file, qch_file):
df_bis.iloc[i,j]=df.iloc[i,j]/(sum(df.iloc[i, k] for k in range(1, df.shape[1])))
# ratio between the observed probability and the reference probability
df_bis.iloc[i,j]=df_bis.iloc[i,j]/pref_avg_list[j-1]
# pbar.update(1)
# idxQueue.put(thr_idx) # replace the thread index in the queue
# setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
print(df_bis)
# thr_idx = idxQueue.get()
# setproctitle(f"Worker {thr_idx+1} : Compute ratio from {fpath}")
# pbar = tqdm(total=df.shape[0], position=thr_idx, desc="Preparing "+qch_file, leave=False)
# if method=quasi-chemical
df_ter=df.copy()
@@ -430,20 +493,18 @@ def compute_ratio_from_csv(fpath, avg_file, qch_file):
# calculate the mole fractions of the atom corresponding to the pair in row i
if atom_count.at[k, 'atom']==df.iloc[i, 0].split('-')[0] or atom_count.at[k, 'atom']==df.iloc[i, 0].split('-')[1]:
x.append(atom_count.at[k, 'mole_fraction'])
# print(x)
for j in range(1, df.shape[1]):
if len(x)==2:
df_ter.iloc[i, j]=df.iloc[i,j]/(x[0]*x[1]*sommes[j-1]) # ratio for qchA method (Nijobs(r)/xi*xj*Nobs(r))
if len(x)==1:
df_ter.iloc[i, j]=df.iloc[i,j]/(x[0]*x[0]*sommes[j-1])
# pbar.update(1)
# idxQueue.put(thr_idx) # replace the thread index in the queue
# setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
print(df_ter)
# add what is in the save_into_database function here?
df_bis.to_csv(runDir + '/results/statistical_potential/all-atoms/' + avg_file)
df_ter.to_csv(runDir + '/results/statistical_potential/all-atoms/' + qch_file)
df_bis.to_csv(runDir + '/results/statistical_potential/ratio/all-atoms/' + avg_file)
df_ter.to_csv(runDir + '/results/statistical_potential/ratio/all-atoms/' + qch_file)
setproctitle("Compute ratio from csv finished")
@trace_unhandled_exceptions
def sql_new_table(conn):
cur = conn.cursor()
sql = """ CREATE TABLE IF NOT EXISTS all_atoms (
@@ -482,10 +543,11 @@ def sql_new_table(conn):
conn.execute("pragma journal_mode=wal")
#conn.close()
@trace_unhandled_exceptions
def save_into_database():
df_avg = pd.read_csv(runDir + '/results/statistical_potential/all-atoms/avg_ratio_pobs_pref.csv')
df_qch = pd.read_csv(runDir + '/results/statistical_potential/all-atoms/qch_ratio_pobs_pref.csv')
setproctitle("Saving statistical potentials(avg, qch) into database")
df_avg = pd.read_csv(runDir + '/results/statistical_potential/ratio/all-atoms/avg_ratio_pobs_pref.csv')
df_qch = pd.read_csv(runDir + '/results/statistical_potential/ratio/all-atoms/qch_ratio_pobs_pref.csv')
ratio_list=[]
del df_avg['Unnamed: 0']
del df_qch['Unnamed: 0']
@@ -495,7 +557,7 @@ def save_into_database():
ratio_list.append([df_avg.iloc[i,0], df_avg.columns[j], df_avg.iloc[i, j], df_qch.iloc[i,j]])
with sqlite3.connect(runDir + "/results/Stats.db", timeout=20.0) as conn:
with sqlite3.connect(runDir + "/results/Stat_potential.db", timeout=20.0) as conn:
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
# We use the REPLACE keyword to get the latest information
sql_execute(conn, """INSERT OR REPLACE INTO all_atoms (atom_pair, distance_bin, avg_ratio_pobs_pref, qch_ratio_pobs_pref )
@@ -503,12 +565,15 @@ def save_into_database():
many=True,
data=ratio_list
)
def stat_potential(pair, f, method):
setproctitle("Saving statistical potentials(avg, qch) into database finished")
@trace_unhandled_exceptions
def stat_potential(f, method):
df=pd.read_csv(f)
del df['Unnamed: 0']
df.set_index('atom_pair_type', inplace=True)
df=df.T
setproctitle(f"RNANet statistics.py stat_potential({method})")
for pair in df.columns:
c=df[pair].tolist()
new=[]
for x in c:
@@ -528,8 +593,6 @@ def stat_potential(pair, f, method):
abs.append(i+5)
abs.append(150)
abs.append(300)
print(new)
print(abs)
x=abs
y=new
@@ -539,36 +602,12 @@ def stat_potential(pair, f, method):
plt.xlabel('Distance')
plt.ylabel("- ln(Pobs/Pref)")
plt.title(f'Statistical potential of {pair} distance ({method} method)')
plt.savefig(f"/home/atabot/RNANet/results/statistical_potential/all-atoms/{method}_statistical_pot_{pair}.png")
plt.savefig(runDir + f"/results/statistical_potential/figures/all-atoms/{method}_method/{method}_statistical_pot_{pair}.png")
plt.close()
# make the choice of method an option?
setproctitle(f"RNANet statistics.py stat_potential({method}) finished")
if __name__ == "__main__":
os.makedirs(runDir + '/results/statistical_potential/all-atoms/', exist_ok=True)
# compute_ratio_from_csv('/home/atabot/RNANet/results/geometry/all-atoms/distances_classes/occur_dist_classes.csv', 'avg_ratio_pobs_pref.csv', 'qch_ratio_pobs_pref.csv')
# exit(1)
with sqlite3.connect(runDir + '/results/Stats.db') as conn:
sql_new_table(conn)
save_into_database()
exit(1)
# count_occur_atom_dist(runDir + '/results/geometry/all-atoms/distances_classes/', 'occur_dist_classes.csv')
# os.makedirs(runDir + '/results/figures/all-atoms/distances/hist/', exist_ok=True)
# histo_occur(runDir + '/results/geometry/all-atoms/distances_classes/occur_dist_classes.csv')
# exit(1)
mole_fraction(runDir + '/results/geometry/all-atoms/atom_count/', 'atom_count.csv')
exit(1)
# print(measure_from_structure(os.listdir(path_to_3D_data + "rna_only")[0]))
if n_unmapped_chains:
# os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True)
os.makedirs(runDir+"/results/geometry/all-atoms/distances_classes/", exist_ok=True)
os.makedirs(runDir+"/results/geometry/all-atoms/atom_count/", exist_ok=True)
# liste_struct = os.listdir(path_to_3D_data + "rna_only")
liste_struct=liste_repres('/home/data/RNA/3D/latest_nr_list_4.0A.csv')[:100]
# measure_from_structure(liste_struct[0])
# exit(1)
#measure_from_structure('5e81_1_2K.cif')
#exit(1)
# concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances.csv')
\ No newline at end of file
print("This file is not supposed to be run directly. Run statistics.py instead.")
statistics.py
@@ -38,6 +38,7 @@ LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam
SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
from geometric_stats import * # after definition of the variables
from statistical_potential import *
@trace_unhandled_exceptions
def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
@@ -1181,13 +1182,16 @@ def measure_from_structure(f):
s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f))
#pyle_measures(name, s, thr_idx)
measures_aa(name, s, thr_idx)
# measures_aa(name, s, thr_idx)
if DO_HIRE_RNA_MEASURES:
measures_hrna(name, s, thr_idx)
measures_hrna_basepairs(name, s, path_to_3D_data, thr_idx)
if DO_WADLEY_ANALYSIS:
measures_pyle(name, s, thr_idx)
if DO_STAT_POTENTIAL_MEASURES:
# measures_heavy_atoms(name, s, thr_idx)
if DO_WADLEY_ANALYSIS:
pyle_measures_for_potentials(name, s, thr_idx)
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
@@ -1335,10 +1339,11 @@ if __name__ == "__main__":
REDUNDANT_DIST_MAT = True
DO_HIRE_RNA_MEASURES = False
RESCAN_GMM_COMP_NUM = False
DO_STAT_POTENTIAL_MEASURES = False
try:
opts, _ = getopt.getopt( sys.argv[1:], "r:h",
[ "help", "from-scratch", "wadley", "distance-matrices", "non-redundant", "resolution=",
"3d-folder=", "seq-folder=", "hire-rna", "rescan-nmodes" ])
"3d-folder=", "seq-folder=", "hire-rna", "rescan-nmodes", "stat-potential" ])
except getopt.GetoptError as err:
print(err)
sys.exit(2)
@@ -1364,6 +1369,7 @@ if __name__ == "__main__":
print("--wadley\t\t\tReproduce Wadley & al 2007 clustering of pseudotorsions.")
print("--hire-rna\t\t\tCompute distances between atoms and torsion angles for HiRE-RNA model,\n\t\t\t\t and plot GMMs on the data.")
print("--rescan-nmodes\t\t\tDo not assume the number of modes in distances and angles distributions, measure it.")
print("--stat-potential\t\t\tCompute statistical potentials based on averaging and quasi-chemical approximation methods.")
sys.exit()
elif opt == "--version":
print("RNANet statistics 1.6 beta")
@@ -1407,7 +1413,14 @@ if __name__ == "__main__":
RESCAN_GMM_COMP_NUM = True
elif opt == "--non-redundant":
REDUNDANT_DIST_MAT = False
elif opt == "--stat-potential":
DO_STAT_POTENTIAL_MEASURES = True
os.makedirs(runDir + "/results/geometry/Pyle/distances_i_i+1/", exist_ok=True)
os.makedirs(runDir + "/results/geometry/all-atoms/distances_classes/", exist_ok=True)
os.makedirs(runDir + "/results/geometry/all-atoms/atom_count/", exist_ok=True)
os.makedirs(runDir + "/results/statistical_potential/ratio/Pyle/", exist_ok=True)
os.makedirs(runDir + "/results/statistical_potential/ratio/all-atoms/", exist_ok=True)
os.makedirs(runDir + "/results/statistical_potential/figures/all-atoms/", exist_ok=True)
# Load mappings. famlist will contain only families with structures at this resolution threshold.
print("Loading mappings list...")
@@ -1420,19 +1433,19 @@ if __name__ == "__main__":
WHERE issue = 0 AND resolution <= {res_thr} AND rfam_acc != 'unmappd'
GROUP BY rfam_acc;
""", conn)
families.drop(families[families.n_chains == 0].index, inplace=True)
mappings_list = {}
for k in families.rfam_acc:
mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"""SELECT chain_id
FROM chain JOIN structure ON chain.structure_id=structure.pdb_id
WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};""") ]
famlist = families.rfam_acc.tolist()
ignored = families[families.n_chains < 3].rfam_acc.tolist()
famlist.sort(key=family_order)
print(f"Found {len(famlist)} families with chains or better.")
if len(ignored):
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
# families.drop(families[families.n_chains == 0].index, inplace=True)
# mappings_list = {}
# for k in families.rfam_acc:
# mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"""SELECT chain_id
# FROM chain JOIN structure ON chain.structure_id=structure.pdb_id
# WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};""") ]
# famlist = families.rfam_acc.tolist()
# ignored = families[families.n_chains < 3].rfam_acc.tolist()
# famlist.sort(key=family_order)
# print(f"Found {len(famlist)} families with chains or better.")
# if len(ignored):
# print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
if DELETE_OLD_DATA:
for f in famlist:
@@ -1456,9 +1469,9 @@ if __name__ == "__main__":
joblist = []
# Do eta/theta plots
if n_unmapped_chains and DO_WADLEY_ANALYSIS:
joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
# if n_unmapped_chains and DO_WADLEY_ANALYSIS:
# joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
# joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
# Do distance matrices for each family excl. LSU/SSU (will be processed later)
if DO_AVG_DISTANCE_MATRIX:
@@ -1474,23 +1487,23 @@ if __name__ == "__main__":
joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, res_thr, False, REDUNDANT_DIST_MAT, False)))
# Do general family statistics
joblist.append(Job(function=stats_len)) # Computes figures about chain lengths
joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families)
for f in famlist:
joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database (intra-chain basepair types within a family)
if f not in ignored:
joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families)
# joblist.append(Job(function=stats_len)) # Computes figures about chain lengths
# joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families)
# for f in famlist:
# joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database (intra-chain basepair types within a family)
# if f not in ignored:
# joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families)
# Do geometric measures
if n_unmapped_chains:
os.makedirs(runDir + "/results/geometry/all-atoms/distances/", exist_ok=True)
structure_list = representatives_from_nrlist(res_thr)
for f in structure_list:
for f in structure_list[:10]:
if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]):
joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances
process_jobs(joblist)
# process_jobs(joblist)
# Now process the memory-heavy tasks family by family
if DO_AVG_DISTANCE_MATRIX:
@@ -1511,31 +1524,55 @@ if __name__ == "__main__":
# finish the work after the parallel portions
per_chain_stats() # per chain base frequencies and basepair types
seq_idty() # identity matrices from pre-computed .npy matrices
stats_pairs()
# per_chain_stats() # per chain base frequencies and basepair types
# seq_idty() # identity matrices from pre-computed .npy matrices
# stats_pairs()
if n_unmapped_chains:
general_stats()
# general_stats()
os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True)
os.makedirs(runDir+"/results/geometry/json/", exist_ok=True)
concat_dataframes(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv', nworkers)
# concat_dataframes(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv', nworkers)
if DO_HIRE_RNA_MEASURES:
concat_dataframes(runDir + '/results/geometry/HiRE-RNA/distances/', 'distances_HiRERNA.csv', nworkers)
concat_dataframes(runDir + '/results/geometry/HiRE-RNA/angles/', 'angles_HiRERNA.csv', nworkers)
concat_dataframes(runDir + '/results/geometry/HiRE-RNA/torsions/', 'torsions_HiRERNA.csv', nworkers)
concat_dataframes(runDir + '/results/geometry/HiRE-RNA/basepairs/', 'basepairs_HiRERNA.csv', nworkers)
if DO_WADLEY_ANALYSIS:
concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances_pyle.csv', nworkers)
concat_dataframes(runDir + '/results/geometry/Pyle/angles/', 'flat_angles_pyle.csv', nworkers)
# if DO_WADLEY_ANALYSIS:
# concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances_pyle.csv', nworkers)
# concat_dataframes(runDir + '/results/geometry/Pyle/angles/', 'flat_angles_pyle.csv', nworkers)
# if DO_STAT_POTENTIAL_MEASURES:
# with sqlite3.connect(runDir + '/results/Stat_potential.db') as conn:
# sql_new_table(conn)
# joblist=[]
# joblist.append(Job(function=count_occur_atom_dist, args=(runDir + '/results/geometry/all-atoms/distances_classes/', 'occur_dist_classes.csv')))
# joblist.append(Job(function=mole_fraction, args=(runDir + '/results/geometry/all-atoms/atom_count/', 'atom_count.csv')))
# process_jobs(joblist)
# compute_ratio_from_csv(runDir + '/results/geometry/all-atoms/distances_classes/occur_dist_classes.csv', 'avg_ratio_pobs_pref.csv', 'qch_ratio_pobs_pref.csv')
# if DO_WADLEY_ANALYSIS:
# concat_dataframes(runDir + '/results/geometry/Pyle/distances_i_i+1/', 'distances.csv', nworkers)
joblist = []
joblist.append(Job(function=gmm_aa_dists, args=(RESCAN_GMM_COMP_NUM,)))
joblist.append(Job(function=gmm_aa_torsions, args=(RESCAN_GMM_COMP_NUM, res_thr)))
# joblist.append(Job(function=gmm_aa_dists, args=(RESCAN_GMM_COMP_NUM,)))
# joblist.append(Job(function=gmm_aa_torsions, args=(RESCAN_GMM_COMP_NUM, res_thr)))
if DO_HIRE_RNA_MEASURES:
joblist.append(Job(function=gmm_hrna, args=(RESCAN_GMM_COMP_NUM,)))
joblist.append(Job(function=gmm_hrna_basepairs, args=(RESCAN_GMM_COMP_NUM,)))
# if DO_WADLEY_ANALYSIS:
# joblist.append(Job(function=gmm_pyle, args=(RESCAN_GMM_COMP_NUM, res_thr)))
if DO_STAT_POTENTIAL_MEASURES:
methods = ['avg', 'qch']
os.makedirs(runDir + "/results/statistical_potential/figures/all-atoms/avg_method/", exist_ok=True)
os.makedirs(runDir + "/results/statistical_potential/figures/all-atoms/qch_method/", exist_ok=True)
# save_into_database()
# for method in methods:
# joblist.append(Job(function=stat_potential, args=(runDir + '/results/statistical_potential/ratio/all-atoms/' + method + '_ratio_pobs_pref.csv', method)))
if DO_WADLEY_ANALYSIS:
joblist.append(Job(function=gmm_pyle, args=(RESCAN_GMM_COMP_NUM, res_thr)))
process_jobs(joblist)
merge_jsons(DO_HIRE_RNA_MEASURES)
os.makedirs(runDir + "/results/statistical_potential/figures/Pyle/", exist_ok=True)
os.makedirs(runDir + "/results/statistical_potential/json/Pyle/", exist_ok=True)
# joblist.append(Job(function=gmm_pyle_per_type, args=(RESCAN_GMM_COMP_NUM,)))
# gmm_pyle_per_type(RESCAN_GMM_COMP_NUM)
stat_potential_pyle()
# process_jobs(joblist)
# merge_jsons(DO_HIRE_RNA_MEASURES)