New functions : angles_plans_hire_RNA, histogram, GMM_histo

Aglaé TABOT
Commit 0165166d17f63aece4018a2a9d8183deb4fe875e 0165166d 1 parent ab43d9d6
Showing 1 changed file with 138 additions and 3 deletions
statistics.py
--- a/statistics.py
View file @0165166
+++ b/statistics.py
View file @0165166
@@ -1899,6 +1899,140 @@ def angles_torsion_hire_RNA(f):
     idxQueue.put(thr_idx) # replace the thread index in the queue
     setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
 
+ def angles_plans_hire_RNA(f):
+     '''
+     Measures the plane angles involving C1' and B1 atoms 
+     Saves the results in a dataframe
+     '''
+     name=str.split(f,'.')[0]
+     liste_angles_plans=[]
+     last_p=[]
+     last_c1p=[]
+ 
+     global idxQueue
+     thr_idx = idxQueue.get()
+ 
+     setproctitle(f"RNANet statistics.py Worker {thr_idx+1} angles_plans_hire_RNA({f})")
+ 
+     os.makedirs(runDir+"/results/plane_angles_hRNA/", exist_ok=True)
+ 
+     parser=MMCIFParser()
+     s = parser.get_structure(name, os.path.abspath("/home/data/RNA/3D/rna_only/" + f))
+     chain = next(s[0].get_chains())
+     residues=list(chain.get_residues())
+     pbar = tqdm(total=len(residues), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} angles_torsion_hire_RNA", unit="residu", leave=False)
+     pbar.update(0)
+     for res in chain :
+         p_c1p_psuiv=np.nan
+         c1p_psuiv_c1psuiv=np.nan
+         if res.get_resname() not in ['ATP', 'CCC', 'A3P', 'A23', 'GDP', 'RIA'] :
+             atom_p = [ atom.get_coord() for atom in res if atom.get_name() ==  "P"]
+             atom_c1p = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ]
+ 
+             if len(last_p)<1 or len(last_c1p)<1 or len(atom_p)<1 :
+                 p_c1p_psuiv=p_c1p_psuiv
+             else :
+                 p_prec=Vector(last_p[0])
+                 c1p_prec=Vector(last_c1p[0])
+                 p=Vector(atom_p[0])
+                 p_c1p_psuiv=calc_angle(p_prec, c1p_prec, p)*(180/np.pi)
+ 
+             if len(atom_c1p)<1 or len(last_c1p)<1 or len(atom_p)<1:
+                 c1p_psuiv_c1psuiv=c1p_psuiv_c1psuiv
+             else :
+                 c1p_prec=Vector(last_c1p[0])
+                 p=Vector(atom_p[0])
+                 c1p=Vector(atom_c1p[0])
+                 c1p_psuiv_c1psuiv=calc_angle(c1p_prec, p, c1p)*(180/np.pi)
+ 
+             last_p=atom_p
+             last_c1p=atom_c1p
+             liste_angles_plans.append([res.get_resname(), p_c1p_psuiv, c1p_psuiv_c1psuiv])
+             pbar.update(1)
+     df=pd.DataFrame(liste_angles_plans, columns=["Residu", "P-C1'-P°", "C1'-P°-C1'°"])
+     pbar.close()
+     
+     
+     df.to_csv(runDir + '/results/plane_angles_hRNA/' + 'angles_plans_hire_RNA '+name+'.csv')
+     idxQueue.put(thr_idx) # replace the thread index in the queue
+     setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
+     
+ def histogram(data, name_data, x, y, nb):
+     '''
+     Plot histograms
+     '''
+     
+     plt.hist(data,color="green",edgecolor='black', linewidth=1.2,bins=50, density=True)
+     plt.xlabel(x)
+     plt.ylabel(y)
+ 
+ def GMM_histo(data, name_data, x, y, nb_fichiers) :
+     '''
+     Plot Gaussian-Mixture-Model on histograms
+     '''
+     histogramme(data, name_data, x, y, nb_fichiers)#plot the histogram
+     
+     n_max = 8    # number of possible values for n_components
+     n_components_range = np.arange(n_max)+1
+     aic = []
+     bic = []
+     maxlogv=[]
+     md=np.array(data).reshape(-1,1)
+     # construction of models and calculation of criteria
+     nb_components=1
+     nb_log_max=n_components_range[0]
+     log_max=0
+     # chooses the number of components based on the maximum likelihood value (maxlogv)
+     for n_comp in n_components_range:
+         gmm = GaussianMixture(n_components=n_comp).fit(md)
+         aic.append(abs(gmm.aic(md)))
+         bic.append(abs(gmm.bic(md)))
+         maxlogv.append(gmm.lower_bound_)
+         if gmm.lower_bound_== max(maxlogv) : # takes the maximum
+             nb_components=n_comp
+             # if there is convergence, keep the first maximum found
+             if abs(gmm.lower_bound_-log_max)<0.02 : #threshold=0.02
+                 nb_components=nb_log_max
+                 break
+         log_max=max(maxlogv)
+         nb_log_max=n_comp
+ 
+     # plot with the appropriate number of components
+     obs=np.array(data).reshape(-1,1)
+     g = GaussianMixture(n_components=nb_components)
+     g.fit(obs)
+     weights = g.weights_
+     means = g.means_
+     covariances = g.covariances_
+ 
+     D = obs.ravel()
+     xmin = D.min()
+     xmax = D.max()
+     x = np.linspace(xmin,xmax,1000)
+     colors=['red', 'blue', 'gold', 'cyan', 'magenta', 'white', 'black', 'green']
+     # prepare the dictionary to save the parameters
+     summary_data={}
+     summary_data["measure"]= name_data
+     summary_data["weights"]=[]
+     summary_data["means"]=[]
+     summary_data["std"]=[]
+     # plot
+     for i in range(nb_components):
+         mean = means[i]
+         sigma = math.sqrt(covariances[i])
+         weight = weights[i]
+         plt.plot(x,weights[i]*stats.norm.pdf(x,mean,sigma), c=colors[i])
+         summary_data["means"].append(str(mean))
+         summary_data["std"].append(str(sigma))
+         summary_data["weights"].append(str(weight))
+     axes=plt.gca()
+     plt.title("Histogramme " +name_data+ " avec GMM pour " +str(nb_components)+ " composantes (" + str(nb_fichiers)+" structures)")
+ 
+     # save in a json
+     with open (name_data +" "+str(nb_fichiers)+ " .json", 'w', encoding='utf-8') as f:
+ 	    json.dump(summary_data, f, indent=4)
+ 
+ 
 
 if __name__ == "__main__":
 
@@ -2037,9 +2171,10 @@ if __name__ == "__main__":
     #exit()
     f_prec=os.listdir(path_to_3D_data + "rna_only")[0]
     for f in os.listdir(path_to_3D_data + "rna_only")[:100]: 
-         joblist.append(Job(function=dist_atoms, args=(f,)))
-         joblist.append(Job(function=dist_atoms_hire_RNA, args=(f,)))
-         joblist.append(Job(function=angles_torsion_hire_RNA, args=(f,)))
+         #joblist.append(Job(function=dist_atoms, args=(f,)))
+         #joblist.append(Job(function=dist_atoms_hire_RNA, args=(f,)))
+         #joblist.append(Job(function=angles_torsion_hire_RNA, args=(f,)))
+         joblist.append(Job(function=angles_plans_hire_RNA, args=(f,)))
     
     
     p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)