Aglaé TABOT

New functions: angles_plans_hire_RNA, histogram, GMM_histo

...@@ -1899,6 +1899,140 @@ def angles_torsion_hire_RNA(f): ...@@ -1899,6 +1899,140 @@ def angles_torsion_hire_RNA(f):
1899 idxQueue.put(thr_idx) # replace the thread index in the queue 1899 idxQueue.put(thr_idx) # replace the thread index in the queue
1900 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") 1900 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
1901 1901
def angles_plans_hire_RNA(f):
    '''
    Measures the plane angles formed by consecutive P and C1' atoms
    Saves the results in a dataframe, written as a CSV file in runDir/results/plane_angles_hRNA/

    Only the first chain of the first model is read. For every kept residue, two angles
    (in degrees) are stored, the "previous" atoms being those of the last kept residue:
        P-C1'-P°     : angle P(previous), C1'(previous), P(current)
        C1'-P°-C1'°  : angle C1'(previous), P(current), C1'(current)
    An angle is left to NaN when one of the atoms it needs was not found.

    f : name of the mmCIF file, looked up in path_to_3D_data + "rna_only/"
    '''
    name = str.split(f, '.')[0]
    liste_angles_plans = []
    last_p = []
    last_c1p = []

    global idxQueue
    thr_idx = idxQueue.get()

    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} angles_plans_hire_RNA({f})")

    os.makedirs(runDir+"/results/plane_angles_hRNA/", exist_ok=True)

    parser = MMCIFParser()
    # Read from the same data directory the caller lists the files from, not a hard-coded absolute path
    s = parser.get_structure(name, os.path.abspath(path_to_3D_data + "rna_only/" + f))
    chain = next(s[0].get_chains())
    residues = list(chain.get_residues())
    pbar = tqdm(total=len(residues), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} angles_plans_hire_RNA", unit="residu", leave=False)
    pbar.update(0)
    for res in chain:
        p_c1p_psuiv = np.nan
        c1p_psuiv_c1psuiv = np.nan
        if res.get_resname() not in ['ATP', 'CCC', 'A3P', 'A23', 'GDP', 'RIA']:
            atom_p = [atom.get_coord() for atom in res if atom.get_name() == "P"]
            atom_c1p = [atom.get_coord() for atom in res if "C1'" in atom.get_fullname()]

            # Both angles need the previous C1' and the current P
            if last_c1p and atom_p:
                c1p_prec = Vector(last_c1p[0])
                p = Vector(atom_p[0])
                if last_p:
                    p_c1p_psuiv = calc_angle(Vector(last_p[0]), c1p_prec, p)*(180/np.pi)
                if atom_c1p:
                    c1p_psuiv_c1psuiv = calc_angle(c1p_prec, p, Vector(atom_c1p[0]))*(180/np.pi)

            last_p = atom_p
            last_c1p = atom_c1p
            liste_angles_plans.append([res.get_resname(), p_c1p_psuiv, c1p_psuiv_c1psuiv])
        pbar.update(1)
    df = pd.DataFrame(liste_angles_plans, columns=["Residu", "P-C1'-P°", "C1'-P°-C1'°"])
    pbar.close()

    df.to_csv(runDir + '/results/plane_angles_hRNA/' + 'angles_plans_hire_RNA '+name+'.csv')
    idxQueue.put(thr_idx) # replace the thread index in the queue
    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
1959 +
def histogram(data, name_data, x, y, nb):
    '''
    Draws a density-normalized histogram of data with 50 bins, using pyplot

    data : values to bin
    x : label of the x axis
    y : label of the y axis
    name_data, nb : not used by this function
    '''
    hist_style = dict(bins=50, density=True, color="green", edgecolor='black', linewidth=1.2)
    plt.hist(data, **hist_style)
    plt.ylabel(y)
    plt.xlabel(x)
1968 +
def GMM_histo(data, name_data, x, y, nb_fichiers) :
    '''
    Plot Gaussian-Mixture-Model on histograms

    Draws the histogram of data, fits 1-dimensional Gaussian mixtures with 1 to 8 components,
    selects a number of components from the lower bound on the log-likelihood, then overlays
    each weighted Gaussian component on the histogram.
    The weights, means and standard deviations of the components are saved (as strings)
    in the JSON file "<name_data> <nb_fichiers> .json".

    data : values of the measure
    name_data : name of the measure, used in the plot title and in the JSON file name
    x : label of the x axis
    y : label of the y axis
    nb_fichiers : number of structures, used in the plot title and in the JSON file name
    '''
    histogram(data, name_data, x, y, nb_fichiers)  # plot the histogram

    n_max = 8  # number of possible values for n_components
    n_components_range = np.arange(n_max)+1
    maxlogv = []
    obs = np.array(data).reshape(-1, 1)
    # construction of models and calculation of criteria
    nb_components = 1
    nb_log_max = n_components_range[0]
    log_max = 0
    # chooses the number of components based on the maximum likelihood value (maxlogv)
    for n_comp in n_components_range:
        gmm = GaussianMixture(n_components=n_comp).fit(obs)
        maxlogv.append(gmm.lower_bound_)
        if gmm.lower_bound_ == max(maxlogv):  # takes the maximum
            nb_components = n_comp
            # if there is convergence, keep the first maximum found
            if abs(gmm.lower_bound_-log_max) < 0.02:  # threshold=0.02
                nb_components = nb_log_max
                break
        log_max = max(maxlogv)
        nb_log_max = n_comp

    # plot with the appropriate number of components
    g = GaussianMixture(n_components=nb_components)
    g.fit(obs)
    weights = g.weights_
    means = g.means_
    covariances = g.covariances_

    # abscissa of the fitted curves; kept separate from the axis label x
    grid = np.linspace(obs.min(), obs.max(), 1000)
    colors = ['red', 'blue', 'gold', 'cyan', 'magenta', 'white', 'black', 'green']
    # prepare the dictionary to save the parameters
    summary_data = {}
    summary_data["measure"] = name_data
    summary_data["weights"] = []
    summary_data["means"] = []
    summary_data["std"] = []
    # plot
    for i in range(nb_components):
        mean = means[i]
        # the data has a single feature, so each covariance holds one value:
        # extract it explicitly instead of relying on implicit array-to-scalar conversion
        sigma = math.sqrt(np.ravel(covariances[i])[0])
        weight = weights[i]
        plt.plot(grid, weight*stats.norm.pdf(grid, mean, sigma), c=colors[i])
        summary_data["means"].append(str(mean))
        summary_data["std"].append(str(sigma))
        summary_data["weights"].append(str(weight))
    plt.title("Histogramme " +name_data+ " avec GMM pour " +str(nb_components)+ " composantes (" + str(nb_fichiers)+" structures)")

    # save in a json
    with open(name_data +" "+str(nb_fichiers)+ " .json", 'w', encoding='utf-8') as f:
        json.dump(summary_data, f, indent=4)
2034 +
2035 +
1902 2036
1903 if __name__ == "__main__": 2037 if __name__ == "__main__":
1904 2038
...@@ -2037,9 +2171,10 @@ if __name__ == "__main__": ...@@ -2037,9 +2171,10 @@ if __name__ == "__main__":
2037 #exit() 2171 #exit()
2038 f_prec=os.listdir(path_to_3D_data + "rna_only")[0] 2172 f_prec=os.listdir(path_to_3D_data + "rna_only")[0]
2039 for f in os.listdir(path_to_3D_data + "rna_only")[:100]: 2173 for f in os.listdir(path_to_3D_data + "rna_only")[:100]:
2040 - joblist.append(Job(function=dist_atoms, args=(f,))) 2174 + #joblist.append(Job(function=dist_atoms, args=(f,)))
2041 - joblist.append(Job(function=dist_atoms_hire_RNA, args=(f,))) 2175 + #joblist.append(Job(function=dist_atoms_hire_RNA, args=(f,)))
2042 - joblist.append(Job(function=angles_torsion_hire_RNA, args=(f,))) 2176 + #joblist.append(Job(function=angles_torsion_hire_RNA, args=(f,)))
2177 + joblist.append(Job(function=angles_plans_hire_RNA, args=(f,)))
2043 2178
2044 2179
2045 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) 2180 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
......