Aglaé TABOT

Added useful functions for measurements and plots on pairings

...@@ -30,6 +30,8 @@ from collections import Counter ...@@ -30,6 +30,8 @@ from collections import Counter
30 from setproctitle import setproctitle 30 from setproctitle import setproctitle
31 from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions 31 from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions
32 from sklearn.mixture import GaussianMixture 32 from sklearn.mixture import GaussianMixture
33 +import warnings
34 +from pandas.core.common import SettingWithCopyWarning
33 35
34 36
35 np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8) 37 np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8)
...@@ -1921,7 +1923,7 @@ def angles_plans_hire_RNA(f): ...@@ -1921,7 +1923,7 @@ def angles_plans_hire_RNA(f):
1921 1923
1922 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} angles_plans_hire_RNA({f})") 1924 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} angles_plans_hire_RNA({f})")
1923 1925
1924 - os.makedirs(runDir+"/results/plane_angles_hRNA/", exist_ok=True) 1926 + os.makedirs(runDir+"/results/HiRE-RNA/angles/", exist_ok=True)
1925 1927
1926 parser=MMCIFParser() 1928 parser=MMCIFParser()
1927 s = parser.get_structure(name, os.path.abspath("/home/data/RNA/3D/rna_only/" + f)) 1929 s = parser.get_structure(name, os.path.abspath("/home/data/RNA/3D/rna_only/" + f))
...@@ -1960,7 +1962,7 @@ def angles_plans_hire_RNA(f): ...@@ -1960,7 +1962,7 @@ def angles_plans_hire_RNA(f):
1960 pbar.close() 1962 pbar.close()
1961 1963
1962 1964
1963 - df.to_csv(runDir + '/results/plane_angles_hRNA/' + 'angles_plans_hire_RNA '+name+'.csv') 1965 + df.to_csv(runDir + '/results/HiRE-RNA/angles/' + 'angles_plans_hire_RNA '+name+'.csv')
1964 idxQueue.put(thr_idx) # replace the thread index in the queue 1966 idxQueue.put(thr_idx) # replace the thread index in the queue
1965 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") 1967 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
1966 1968
...@@ -2040,6 +2042,71 @@ def GMM_histo(data, name_data, x, y, nb_fichiers) : ...@@ -2040,6 +2042,71 @@ def GMM_histo(data, name_data, x, y, nb_fichiers) :
2040 with open (name_data + " .json", 'w', encoding='utf-8') as f: 2042 with open (name_data + " .json", 'w', encoding='utf-8') as f:
2041 json.dump(summary_data, f, indent=4) 2043 json.dump(summary_data, f, indent=4)
2042 2044
def GMM_histo_without_saving(data, name_data, x, y, nb_fichiers) :
    '''
    Plot the histogram of `data` and overlay a fitted Gaussian Mixture Model.

    The figure is neither saved nor closed here (no savefig/close call), so
    the caller can embed the plot in a larger figure. The fitted parameters
    are still written to the file "<name_data> .json".

    data        : sequence of scalar measurements (reshaped to one column)
    name_data   : name of the measure, used in the title and the JSON file name
    x, y        : forwarded unchanged to histogram() (presumably the axis
                  labels -- confirm against histogram())
    nb_fichiers : count forwarded to histogram() and displayed in the title
    '''
    histogram(data, name_data, x, y, nb_fichiers)  # plot the histogram

    obs = np.array(data).reshape(-1, 1)

    n_max = 8  # largest number of components that is tried
    # GaussianMixture cannot be fitted with more components than samples,
    # so small data sets only try as many components as they have values.
    n_components_range = np.arange(min(n_max, len(obs))) + 1

    # Choose the number of components from the lower bound on the
    # log-likelihood reported by each fitted model (lower_bound_).
    maxlogv = []
    nb_components = 1
    nb_log_max = 1
    log_max = 0
    for n_comp in n_components_range:
        gmm = GaussianMixture(n_components=n_comp).fit(obs)
        maxlogv.append(gmm.lower_bound_)
        if gmm.lower_bound_ == max(maxlogv):  # new best value so far
            nb_components = n_comp
            # convergence: the gain is below the threshold (0.02), so keep
            # the component count recorded at the previous iteration
            if abs(gmm.lower_bound_ - log_max) < 0.02:
                nb_components = nb_log_max
                break
        log_max = max(maxlogv)
        nb_log_max = n_comp

    # Fit again with the selected number of components and plot each Gaussian.
    g = GaussianMixture(n_components=nb_components)
    g.fit(obs)
    weights = g.weights_
    means = g.means_
    covariances = g.covariances_

    values = obs.ravel()
    grid = np.linspace(values.min(), values.max(), 1000)
    # NOTE(review): 'white' is drawn for a 6th component and may be invisible
    # on a white background -- confirm this is intended.
    colors = ['red', 'blue', 'gold', 'cyan', 'magenta', 'white', 'black', 'green']

    # dictionary of the fitted parameters, dumped to JSON below
    summary_data = {"measure": name_data, "weights": [], "means": [], "std": []}
    for i in range(nb_components):
        mean = means[i]
        sigma = np.sqrt(covariances[i])
        weight = weights[i]
        # the pdf is evaluated with array-valued mean/sigma, hence the [0]
        plt.plot(grid, (weight * st.norm.pdf(grid, mean, sigma))[0], c=colors[i])
        summary_data["means"].append(str(mean))
        summary_data["std"].append(str(sigma))
        summary_data["weights"].append(str(weight))

    plt.title("Histogramme " + name_data + " avec GMM pour " + str(nb_components) + " composantes (" + str(nb_fichiers) + " structures)")

    # save the parameters in a json file
    with open(name_data + " .json", 'w', encoding='utf-8') as f:
        json.dump(summary_data, f, indent=4)
2109 +
2043 def GMM_tot(data, name_data, nb_fichiers, couleur) : 2110 def GMM_tot(data, name_data, nb_fichiers, couleur) :
2044 ''' 2111 '''
2045 Plot the sum of the Gaussians (without the histograms) 2112 Plot the sum of the Gaussians (without the histograms)
...@@ -2468,7 +2535,7 @@ def graph_torsion_h_RNA(): ...@@ -2468,7 +2535,7 @@ def graph_torsion_h_RNA():
2468 2535
2469 def graph_plans_h_RNA(): 2536 def graph_plans_h_RNA():
2470 2537
2471 - df=pd.read_csv(os.path.abspath(runDir + "results/HiRE-RNA/angles/angles_plans_hire_RNA.csv")) 2538 + df=pd.read_csv(os.path.abspath(runDir + "/results/HiRE-RNA/angles/angles_plans_hire_RNA.csv"))
2472 2539
2473 p_c1p_psuiv=list(df["P-C1'-P°"][~ np.isnan(df["P-C1'-P°"])]) 2540 p_c1p_psuiv=list(df["P-C1'-P°"][~ np.isnan(df["P-C1'-P°"])])
2474 c1p_psuiv_c1psuiv=list(df["C1'-P°-C1'°"][~ np.isnan(df["C1'-P°-C1'°"])]) 2541 c1p_psuiv_c1psuiv=list(df["C1'-P°-C1'°"][~ np.isnan(df["C1'-P°-C1'°"])])
...@@ -2487,6 +2554,112 @@ def graph_plans_h_RNA(): ...@@ -2487,6 +2554,112 @@ def graph_plans_h_RNA():
2487 plt.savefig("GMM des angles plans (hire-RNA) (100 structures).png") 2554 plt.savefig("GMM des angles plans (hire-RNA) (100 structures).png")
2488 plt.close() 2555 plt.close()
2489 2556
2557 +'''
2558 +Functions for making measurements on pairings
2559 +'''
2560 +
def dictio(ld):
    '''
    Build a dictionary mapping each structure to its RNA chains.

    key   = pdb identifier of the structure (first '_'-separated field of
            the name)
    value = list of chain identifiers (third '_'-separated field), taken
            only from names shorter than 10 characters, i.e. the chains
            unmapped to a Rfam family according to the original note

    ld : iterable of names of the form "<pdb>_<model>_<chain>[_...]"

    Structures for which no such chain is found are left out of the result.
    A fresh dictionary is created on every call.
    '''
    chains_by_pdb = {}
    for f in ld:
        fields = f.split('_')
        # register the structure on first sight so keys keep the input order
        chains = chains_by_pdb.setdefault(fields[0], [])
        if len(f) < 10:  # unmapped to a Rfam family
            chains.append(fields[2])
    # drop the structures that ended up without any chain
    return {pdb_id: chains for pdb_id, chains in chains_by_pdb.items() if chains}
2583 +
def dist_pointes(res, pair):
    '''
    Measure the distance between the tips of two paired nucleotides
    (B1 / B1 or B1 / B2 or B2 / B2).

    The tip of A and G is obtained with pos_b2() and the tip of C and U
    with pos_b1() (different cases if 1 aromatic cycle or 2).

    res, pair : residue objects exposing get_resname()

    Returns the value computed by distance() on the two tip positions.
    Raises ValueError if a residue is not named A, C, G or U.
    '''
    tips = []
    for residue in (res, pair):
        name = residue.get_resname()
        if name in ('A', 'G'):
            tips.append(pos_b2(residue))
        elif name in ('C', 'U'):
            tips.append(pos_b1(residue))
        else:
            # fail with an explicit message instead of an unbound local later
            raise ValueError("dist_pointes: unsupported residue name " + repr(name))
    return distance(tips[0], tips[1])
2607 +
def _c4_c1_b1_vectors(residue):
    '''
    Return the positions of C4', C1' and B1 of a nucleotide as three Vectors.

    Raises ValueError if the residue is not named A, C, G or U.
    '''
    name = residue.get_resname()
    if name not in ('A', 'G', 'C', 'U'):
        raise ValueError("angle_c1_b1: unsupported residue name " + repr(name))
    coords_c4 = [atom.get_coord() for atom in residue if "C4'" in atom.get_fullname()]
    coords_c1 = [atom.get_coord() for atom in residue if "C1'" in atom.get_fullname()]
    coords_b1 = pos_b1(residue)
    # only the first matching atom is used; an absent atom raises IndexError
    return Vector(coords_c4[0]), Vector(coords_c1[0]), Vector(coords_b1)


def angle_c1_b1(res, pair):
    '''
    Measure the plane angles formed by the vectors C1 -> B1 of two paired
    nucleotides.

    res, pair : residue objects (iterable over their atoms) exposing
                get_resname()

    Returns the list [a, b, c, d] of angles in degrees:
        a = C4'(res)  - C1'(res)  - B1(res)
        b = C1'(res)  - B1(res)   - B1(pair)
        c = B1(res)   - B1(pair)  - C1'(pair)
        d = B1(pair)  - C1'(pair) - C4'(pair)
    Raises ValueError if a residue is not named A, C, G or U.
    '''
    c4_res, c1_res, b1_res = _c4_c1_b1_vectors(res)
    c4_pair, c1_pair, b1_pair = _c4_c1_b1_vectors(pair)

    # calc_angle() results are multiplied by 180/pi to convert them to degrees
    to_degrees = 180/np.pi
    a = calc_angle(c4_res, c1_res, b1_res)*to_degrees
    b = calc_angle(c1_res, b1_res, b1_pair)*to_degrees
    c = calc_angle(b1_res, b1_pair, c1_pair)*to_degrees
    d = calc_angle(b1_pair, c1_pair, c4_pair)*to_degrees

    return [a, b, c, d]
2636 +
def graphe(type_LW, angle_1, angle_2, angle_3, angle_4, distance, nb):
    '''
    Draw and save the statistical figure of one type of pairing.

    The figure has two stacked panels in the same window:
        top    : superposition of the GMMs of the four plane angles
        bottom : superposition of the histogram and the GMM of the distances

    type_LW            : label of the pairing type, used in titles and file name
    angle_1 .. angle_4 : values of the four plane angles, passed to GMM_tot()
    distance           : values of the distances between the tips
    nb                 : number of values, displayed in titles and file name

    The figure is written to
    "Mesures appariements <type_LW> (<nb> valeurs).png" and then closed.
    '''
    fig = plt.figure(figsize=(10, 10))
    fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0, hspace=0.5)

    # top panel: one GMM curve per plane angle
    plt.subplot(2, 1, 1)
    angle_series = (
        (angle_1, "C4'-C1'-B1", 'cyan'),
        (angle_2, "C1'-B1-B1pair", 'magenta'),
        (angle_3, "B1-B1pair-C1'pair", "yellow"),
        (angle_4, "B1pair-C1'pair-C4'pair", 'olive'),
    )
    for values, label, colour in angle_series:
        GMM_tot(values, label, nb, colour)
    plt.xlabel("Angle(degré)")
    panel_title = "GMM des angles plans pour les appariements " + type_LW + " (" + str(nb) + " valeurs)"
    plt.title(panel_title, fontsize=10)

    # bottom panel: histogram and GMM of the tip distances
    plt.subplot(2, 1, 2)
    distance_label = "Distance pointes " + type_LW
    GMM_histo_without_saving(distance, distance_label, "Distance (Angström)", "Densité", nb)

    output_name = "Mesures appariements " + type_LW + " (" + str(nb) + " valeurs).png"
    plt.savefig(output_name)
    plt.close()
2490 2663
2491 2664
2492 2665
...@@ -2623,12 +2796,13 @@ if __name__ == "__main__": ...@@ -2623,12 +2796,13 @@ if __name__ == "__main__":
2623 #dist_atoms_hire_RNA(os.listdir(path_to_3D_data + "rna_only")[0]) 2796 #dist_atoms_hire_RNA(os.listdir(path_to_3D_data + "rna_only")[0])
2624 #concatenate('/results/distances/', os.listdir(runDir+'/results/distances/'), 'dist_atoms.csv') 2797 #concatenate('/results/distances/', os.listdir(runDir+'/results/distances/'), 'dist_atoms.csv')
2625 2798
2626 - #exit() 2799 +
2800 + exit()
2627 f_prec=os.listdir(path_to_3D_data + "rna_only")[0] 2801 f_prec=os.listdir(path_to_3D_data + "rna_only")[0]
2628 for f in os.listdir(path_to_3D_data + "rna_only")[:100]: 2802 for f in os.listdir(path_to_3D_data + "rna_only")[:100]:
2629 #joblist.append(Job(function=dist_atoms, args=(f,))) 2803 #joblist.append(Job(function=dist_atoms, args=(f,)))
2630 #joblist.append(Job(function=dist_atoms_hire_RNA, args=(f,))) 2804 #joblist.append(Job(function=dist_atoms_hire_RNA, args=(f,)))
2631 - #joblist.append(Job(function=angles_torsion_hire_RNA, args=(f,))) 2805 + joblist.append(Job(function=angles_torsion_hire_RNA, args=(f,)))
2632 joblist.append(Job(function=angles_plans_hire_RNA, args=(f,))) 2806 joblist.append(Job(function=angles_plans_hire_RNA, args=(f,)))
2633 2807
2634 2808
...@@ -2676,7 +2850,11 @@ if __name__ == "__main__": ...@@ -2676,7 +2850,11 @@ if __name__ == "__main__":
2676 graph_angles_torsion() 2850 graph_angles_torsion()
2677 concatenate('/results/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv') 2851 concatenate('/results/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv')
2678 graph_dist_atoms_h_RNA() 2852 graph_dist_atoms_h_RNA()
2853 +
2679 concatenate('/results/HiRE-RNA/torsions/', 'angles_torsion_hire_RNA.csv') 2854 concatenate('/results/HiRE-RNA/torsions/', 'angles_torsion_hire_RNA.csv')
2680 - graph_torsion_hire_RNA() 2855 + concatenate('/results/HiRE-RNA/angles/', 'angles_plans_hire_RNA.csv')
2856 + graph_torsion_h_RNA()
2857 + graph_plans_h_RNA()
2858 +
2681 graph_eta_theta() 2859 graph_eta_theta()
2682 ''' 2860 '''
......