Added useful functions for measurements and plots on pairings
Showing
1 changed file
with
184 additions
and
6 deletions
... | @@ -30,6 +30,8 @@ from collections import Counter | ... | @@ -30,6 +30,8 @@ from collections import Counter |
30 | from setproctitle import setproctitle | 30 | from setproctitle import setproctitle |
31 | from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions | 31 | from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions |
32 | from sklearn.mixture import GaussianMixture | 32 | from sklearn.mixture import GaussianMixture |
33 | +import warnings | ||
34 | +from pandas.core.common import SettingWithCopyWarning | ||
33 | 35 | ||
34 | 36 | ||
35 | np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8) | 37 | np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8) |
... | @@ -1921,7 +1923,7 @@ def angles_plans_hire_RNA(f): | ... | @@ -1921,7 +1923,7 @@ def angles_plans_hire_RNA(f): |
1921 | 1923 | ||
1922 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} angles_plans_hire_RNA({f})") | 1924 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} angles_plans_hire_RNA({f})") |
1923 | 1925 | ||
1924 | - os.makedirs(runDir+"/results/plane_angles_hRNA/", exist_ok=True) | 1926 | + os.makedirs(runDir+"/results/HiRE-RNA/angles/", exist_ok=True) |
1925 | 1927 | ||
1926 | parser=MMCIFParser() | 1928 | parser=MMCIFParser() |
1927 | s = parser.get_structure(name, os.path.abspath("/home/data/RNA/3D/rna_only/" + f)) | 1929 | s = parser.get_structure(name, os.path.abspath("/home/data/RNA/3D/rna_only/" + f)) |
... | @@ -1960,7 +1962,7 @@ def angles_plans_hire_RNA(f): | ... | @@ -1960,7 +1962,7 @@ def angles_plans_hire_RNA(f): |
1960 | pbar.close() | 1962 | pbar.close() |
1961 | 1963 | ||
1962 | 1964 | ||
1963 | - df.to_csv(runDir + '/results/plane_angles_hRNA/' + 'angles_plans_hire_RNA '+name+'.csv') | 1965 | + df.to_csv(runDir + '/results/HiRE-RNA/angles/' + 'angles_plans_hire_RNA '+name+'.csv') |
1964 | idxQueue.put(thr_idx) # replace the thread index in the queue | 1966 | idxQueue.put(thr_idx) # replace the thread index in the queue |
1965 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | 1967 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") |
1966 | 1968 | ||
... | @@ -2040,6 +2042,71 @@ def GMM_histo(data, name_data, x, y, nb_fichiers) : | ... | @@ -2040,6 +2042,71 @@ def GMM_histo(data, name_data, x, y, nb_fichiers) : |
2040 | with open (name_data + " .json", 'w', encoding='utf-8') as f: | 2042 | with open (name_data + " .json", 'w', encoding='utf-8') as f: |
2041 | json.dump(summary_data, f, indent=4) | 2043 | json.dump(summary_data, f, indent=4) |
2042 | 2044 | ||
def GMM_histo_without_saving(data, name_data, x, y, nb_fichiers) :
    '''
    Plot a histogram of ``data`` and overlay a Gaussian Mixture Model on it.

    The histogram is drawn by ``histogram(data, name_data, x, y, nb_fichiers)``.
    Mixtures with 1 to 8 components are fitted, a number of components is
    selected from their ``lower_bound_`` values, a mixture with that many
    components is fitted again and each weighted component is drawn on the
    current matplotlib axes.

    No figure is saved by this function (there is no ``savefig`` call), but the
    weights, means and standard deviations of the plotted mixture are still
    written to the JSON file ``name_data + " .json"``.

    data        : sequence of numeric values (flattened into one column)
    name_data   : name of the measure, used in the title and the JSON file name
    x, y        : forwarded unchanged to histogram()
                  (presumably the axis labels - confirm against histogram())
    nb_fichiers : count displayed in the plot title
    '''
    histogram(data, name_data, x, y, nb_fichiers)  # plot the histogram

    n_max = 8  # number of candidate values for n_components
    n_components_range = np.arange(n_max) + 1
    # GaussianMixture takes a 2-D (n_samples, n_features) array: one column here
    obs = np.array(data).reshape(-1, 1)

    # choose the number of components from the lower bound on the
    # log-likelihood reported by each fitted model
    nb_components = 1
    nb_log_max = n_components_range[0]
    log_max = 0
    maxlogv = []
    for n_comp in n_components_range:
        gmm = GaussianMixture(n_components=n_comp).fit(obs)
        maxlogv.append(gmm.lower_bound_)
        if gmm.lower_bound_ == max(maxlogv):  # best value seen so far
            nb_components = n_comp
            # if there is convergence (gain below the 0.02 threshold),
            # keep the previous number of components and stop searching
            if abs(gmm.lower_bound_ - log_max) < 0.02:
                nb_components = nb_log_max
                break
        log_max = max(maxlogv)
        nb_log_max = n_comp

    # fit again with the selected number of components and plot it
    g = GaussianMixture(n_components=nb_components)
    g.fit(obs)
    weights = g.weights_
    means = g.means_
    covariances = g.covariances_

    D = obs.ravel()
    # abscissa for the curves; a separate name so the ``x`` argument is not overwritten
    grid = np.linspace(D.min(), D.max(), 1000)
    colors = ['red', 'blue', 'gold', 'cyan', 'magenta', 'white', 'black', 'green']

    # dictionary of the parameters saved to the JSON file
    summary_data = {
        "measure": name_data,
        "weights": [],
        "means": [],
        "std": [],
    }
    for i in range(nb_components):
        mean = means[i]
        sigma = np.sqrt(covariances[i])
        weight = weights[i]
        # the pdf is evaluated with array-valued mean/sigma; [0] takes the first row of the result
        plt.plot(grid, (weight * st.norm.pdf(grid, mean, sigma))[0], c=colors[i])
        summary_data["means"].append(str(mean))
        summary_data["std"].append(str(sigma))
        summary_data["weights"].append(str(weight))

    plt.title("Histogramme " +name_data+ " avec GMM pour " +str(nb_components)+ " composantes (" + str(nb_fichiers)+" structures)")

    # save the parameters in a json file
    with open(name_data + " .json", 'w', encoding='utf-8') as f:
        json.dump(summary_data, f, indent=4)
2109 | + | ||
2043 | def GMM_tot(data, name_data, nb_fichiers, couleur) : | 2110 | def GMM_tot(data, name_data, nb_fichiers, couleur) : |
2044 | ''' | 2111 | ''' |
2045 | Plot the sum of the Gaussians (without the histograms) | 2112 | Plot the sum of the Gaussians (without the histograms) |
... | @@ -2468,7 +2535,7 @@ def graph_torsion_h_RNA(): | ... | @@ -2468,7 +2535,7 @@ def graph_torsion_h_RNA(): |
2468 | 2535 | ||
2469 | def graph_plans_h_RNA(): | 2536 | def graph_plans_h_RNA(): |
2470 | 2537 | ||
2471 | - df=pd.read_csv(os.path.abspath(runDir + "results/HiRE-RNA/angles/angles_plans_hire_RNA.csv")) | 2538 | + df=pd.read_csv(os.path.abspath(runDir + "/results/HiRE-RNA/angles/angles_plans_hire_RNA.csv")) |
2472 | 2539 | ||
2473 | p_c1p_psuiv=list(df["P-C1'-P°"][~ np.isnan(df["P-C1'-P°"])]) | 2540 | p_c1p_psuiv=list(df["P-C1'-P°"][~ np.isnan(df["P-C1'-P°"])]) |
2474 | c1p_psuiv_c1psuiv=list(df["C1'-P°-C1'°"][~ np.isnan(df["C1'-P°-C1'°"])]) | 2541 | c1p_psuiv_c1psuiv=list(df["C1'-P°-C1'°"][~ np.isnan(df["C1'-P°-C1'°"])]) |
... | @@ -2487,6 +2554,112 @@ def graph_plans_h_RNA(): | ... | @@ -2487,6 +2554,112 @@ def graph_plans_h_RNA(): |
2487 | plt.savefig("GMM des angles plans (hire-RNA) (100 structures).png") | 2554 | plt.savefig("GMM des angles plans (hire-RNA) (100 structures).png") |
2488 | plt.close() | 2555 | plt.close() |
2489 | 2556 | ||
2557 | +''' | ||
2558 | +Functions for making measurements on pairings | ||
2559 | +''' | ||
2560 | + | ||
def dictio(ld):
    '''
    Build a dictionary mapping each structure to its list of chains.

    key   = pdb identifier of the structure (text before the first '_')
    value = list of chain identifiers (third '_'-separated field), taken only
            from the names shorter than 10 characters (unmapped to a Rfam
            family), in the order they appear in ``ld``

    Structures for which no such short name exists are left out.

    ld : iterable of file names such as "1abc_1_A"

    Raises IndexError if a name shorter than 10 characters has fewer than
    three '_'-separated fields.
    '''
    # The result dictionary is created here; it was previously used without
    # ever being initialised inside the function.
    chaines_par_pdb = {}
    for f in ld:
        champs = f.split('_')
        # register every structure in order of first appearance so that the
        # key order does not depend on where its short names occur
        liste_chaines = chaines_par_pdb.setdefault(champs[0], [])
        if len(f) < 10:  # unmapped to a Rfam family
            liste_chaines.append(champs[2])
    # drop the structures that have no chain to report
    return {pdb_id: chaines for pdb_id, chaines in chaines_par_pdb.items() if chaines}
2583 | + | ||
def dist_pointes(res, pair):
    '''
    Measure the distance between the tips of two paired nucleotides
    (B1 / B1 or B1 / B2 or B2 / B2).

    The tip of a residue is given by pos_b2() when its name is 'A' or 'G'
    and by pos_b1() when its name is 'C' or 'U'.

    res, pair : residue objects exposing get_resname()

    Returns the value of distance(tip of res, tip of pair).

    Raises ValueError if a residue name is not one of 'A', 'G', 'C', 'U'
    (such residues previously ended in an UnboundLocalError).
    '''
    def pointe(residu):
        # select the bead at the tip of the base according to the residue name
        nom = residu.get_resname()
        if nom in ('A', 'G'):
            return pos_b2(residu)
        if nom in ('C', 'U'):
            return pos_b1(residu)
        raise ValueError("dist_pointes: unsupported residue name " + repr(nom))

    atom_res = pointe(res)
    atom_pair = pointe(pair)
    return distance(atom_res, atom_pair)
2607 | + | ||
def angle_c1_b1(res,pair):
    '''
    Measure the plane angles formed by the vectors C1' -> B1 of two paired
    nucleotides.

    For each residue the coordinates of the first atom whose full name
    contains "C4'", of the first atom whose full name contains "C1'", and the
    position returned by pos_b1() are wrapped in Vector objects.

    res, pair : residue objects exposing get_resname() and iteration over
                atoms (atoms expose get_fullname() and get_coord())

    Returns the list [a, b, c, d] of angles converted with the factor
    180/pi:
        a = C4'(res)  - C1'(res)  - B1(res)
        b = C1'(res)  - B1(res)   - B1(pair)
        c = B1(res)   - B1(pair)  - C1'(pair)
        d = B1(pair)  - C1'(pair) - C4'(pair)

    Raises ValueError if a residue name is not one of 'A', 'G', 'C', 'U'
    (such residues previously ended in an UnboundLocalError), and IndexError
    if a residue has no atom matching "C4'" or "C1'".
    '''
    def c4_c1_b1(residu):
        # gather the three points needed for one residue
        nom = residu.get_resname()
        if nom not in ('A', 'G', 'C', 'U'):
            raise ValueError("angle_c1_b1: unsupported residue name " + repr(nom))
        atom_c4 = [ atom.get_coord() for atom in residu if "C4'" in atom.get_fullname() ]
        atom_c1p = [ atom.get_coord() for atom in residu if "C1'" in atom.get_fullname() ]
        atom_b1 = pos_b1(residu)
        return Vector(atom_c4[0]), Vector(atom_c1p[0]), Vector(atom_b1)

    c4_res, c1_res, b1_res = c4_c1_b1(res)
    c4_pair, c1_pair, b1_pair = c4_c1_b1(pair)

    # we calculate the 4 plane angles including these vectors
    facteur = 180/np.pi
    a = calc_angle(c4_res, c1_res, b1_res)*facteur
    b = calc_angle(c1_res, b1_res, b1_pair)*facteur
    c = calc_angle(b1_res, b1_pair, c1_pair)*facteur
    d = calc_angle(b1_pair, c1_pair, c4_pair)*facteur

    return [a, b, c, d]
2636 | + | ||
def graphe(type_LW, angle_1, angle_2, angle_3, angle_4, distance, nb):
    '''
    Draw and save the summary figure for one type of pairing.

    The figure has two stacked panels:
      - top: the curves drawn by GMM_tot() for the four plane angles
      - bottom: the histogram and GMM of the tip distances, drawn by
        GMM_histo_without_saving()
    It is saved as a PNG whose name contains type_LW and nb, then closed.

    type_LW            : label of the pairing type (string)
    angle_1 .. angle_4 : values of the four plane angles
    distance           : values of the tip distances
    nb                 : number of values, shown in the titles and file name
    '''
    fig = plt.figure(figsize=(10, 10))
    fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0, hspace=0.5)

    # top panel: one curve per plane angle
    plt.subplot(2, 1, 1)
    series_angles = (
        (angle_1, "C4'-C1'-B1", 'cyan'),
        (angle_2, "C1'-B1-B1pair", 'magenta'),
        (angle_3, "B1-B1pair-C1'pair", "yellow"),
        (angle_4, "B1pair-C1'pair-C4'pair", 'olive'),
    )
    for valeurs, etiquette, couleur in series_angles:
        GMM_tot(valeurs, etiquette, nb, couleur)
    plt.xlabel("Angle(degré)")
    titre_angles = "GMM des angles plans pour les appariements " + type_LW + " (" + str(nb) + " valeurs)"
    plt.title(titre_angles, fontsize=10)

    # bottom panel: histogram and GMM of the distances
    plt.subplot(2, 1, 2)
    nom_distance = "Distance pointes " + type_LW
    GMM_histo_without_saving(distance, nom_distance, "Distance (Angström)", "Densité", nb)

    nom_fichier = "Mesures appariements " + type_LW + " (" + str(nb) + " valeurs).png"
    plt.savefig(nom_fichier)
    plt.close()
2490 | 2663 | ||
2491 | 2664 | ||
2492 | 2665 | ||
... | @@ -2623,12 +2796,13 @@ if __name__ == "__main__": | ... | @@ -2623,12 +2796,13 @@ if __name__ == "__main__": |
2623 | #dist_atoms_hire_RNA(os.listdir(path_to_3D_data + "rna_only")[0]) | 2796 | #dist_atoms_hire_RNA(os.listdir(path_to_3D_data + "rna_only")[0]) |
2624 | #concatenate('/results/distances/', os.listdir(runDir+'/results/distances/'), 'dist_atoms.csv') | 2797 | #concatenate('/results/distances/', os.listdir(runDir+'/results/distances/'), 'dist_atoms.csv') |
2625 | 2798 | ||
2626 | - #exit() | 2799 | + |
2800 | + exit() | ||
2627 | f_prec=os.listdir(path_to_3D_data + "rna_only")[0] | 2801 | f_prec=os.listdir(path_to_3D_data + "rna_only")[0] |
2628 | for f in os.listdir(path_to_3D_data + "rna_only")[:100]: | 2802 | for f in os.listdir(path_to_3D_data + "rna_only")[:100]: |
2629 | #joblist.append(Job(function=dist_atoms, args=(f,))) | 2803 | #joblist.append(Job(function=dist_atoms, args=(f,))) |
2630 | #joblist.append(Job(function=dist_atoms_hire_RNA, args=(f,))) | 2804 | #joblist.append(Job(function=dist_atoms_hire_RNA, args=(f,))) |
2631 | - #joblist.append(Job(function=angles_torsion_hire_RNA, args=(f,))) | 2805 | + joblist.append(Job(function=angles_torsion_hire_RNA, args=(f,))) |
2632 | joblist.append(Job(function=angles_plans_hire_RNA, args=(f,))) | 2806 | joblist.append(Job(function=angles_plans_hire_RNA, args=(f,))) |
2633 | 2807 | ||
2634 | 2808 | ||
... | @@ -2676,7 +2850,11 @@ if __name__ == "__main__": | ... | @@ -2676,7 +2850,11 @@ if __name__ == "__main__": |
2676 | graph_angles_torsion() | 2850 | graph_angles_torsion() |
2677 | concatenate('/results/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv') | 2851 | concatenate('/results/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv') |
2678 | graph_dist_atoms_h_RNA() | 2852 | graph_dist_atoms_h_RNA() |
2853 | + | ||
2679 | concatenate('/results/HiRE-RNA/torsions/', 'angles_torsion_hire_RNA.csv') | 2854 | concatenate('/results/HiRE-RNA/torsions/', 'angles_torsion_hire_RNA.csv') |
2680 | - graph_torsion_hire_RNA() | 2855 | + concatenate('/results/HiRE-RNA/angles/', 'angles_plans_hire_RNA.csv') |
2856 | + graph_torsion_h_RNA() | ||
2857 | + graph_plans_h_RNA() | ||
2858 | + | ||
2681 | graph_eta_theta() | 2859 | graph_eta_theta() |
2682 | ''' | 2860 | ''' | ... | ... |
-
Please register or login to post a comment