Showing
2 changed files
with
258 additions
and
37 deletions
... | @@ -159,14 +159,14 @@ class SelectivePortionSelector(object): | ... | @@ -159,14 +159,14 @@ class SelectivePortionSelector(object): |
159 | _select=Select() | 159 | _select=Select() |
160 | 160 | ||
161 | def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False): | 161 | def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False): |
162 | + # reuse and modification of the source code of Biopython | ||
163 | + # to have the 2 columns of numbering of residues numbered with the index_chain of DSSR | ||
162 | if isinstance(out_file, str): | 164 | if isinstance(out_file, str): |
163 | fp = open(out_file, "w") | 165 | fp = open(out_file, "w") |
164 | close_file = True | 166 | close_file = True |
165 | else: | 167 | else: |
166 | fp = out_file | 168 | fp = out_file |
167 | close_file = False | 169 | close_file = False |
168 | - #fp = open(out_file, "w") | ||
169 | - #close_file=True | ||
170 | atom_dict = defaultdict(list) | 170 | atom_dict = defaultdict(list) |
171 | 171 | ||
172 | for model in ioobj.structure.get_list(): | 172 | for model in ioobj.structure.get_list(): |
... | @@ -188,7 +188,7 @@ def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False): | ... | @@ -188,7 +188,7 @@ def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False): |
188 | chain_id = chain.get_id() | 188 | chain_id = chain.get_id() |
189 | if chain_id == " ": | 189 | if chain_id == " ": |
190 | chain_id = "." | 190 | chain_id = "." |
191 | - # This is used to write label_seq_id and increments from 1, | 191 | + # This is used to write label_seq_id, |
192 | # remaining blank for hetero residues | 192 | # remaining blank for hetero residues |
193 | 193 | ||
194 | prev_residue_type = "" | 194 | prev_residue_type = "" |
... | @@ -402,8 +402,9 @@ class Chain: | ... | @@ -402,8 +402,9 @@ class Chain: |
402 | resseq=int(resseq) | 402 | resseq=int(resseq) |
403 | index_chain=nums.at[i, "index_chain"] | 403 | index_chain=nums.at[i, "index_chain"] |
404 | nt=nums.at[i, "nt_name"] | 404 | nt=nums.at[i, "nt_name"] |
405 | - | 405 | + |
406 | - if nt == 'A' or nt == 'G' or nt == 'C' or nt == 'U' or nt in ['DG', 'DU', 'DC', 'DA', 'DI', 'DT' ] or nt == 'N' or nt == 'I' : | 406 | + # particular case 6n5s_1_A, residue 201 in the original cif file (resname = G and HETATM = H_G) |
407 | + if nt == 'A' or (nt == 'G' and (self.chain_label != '6n5s_1_A' or resseq != 201)) or nt == 'C' or nt == 'U' or nt in ['DG', 'DU', 'DC', 'DA', 'DI', 'DT' ] or nt == 'N' or nt == 'I' : | ||
407 | res=chain[(' ', resseq, icode_res)] | 408 | res=chain[(' ', resseq, icode_res)] |
408 | else : #modified nucleotides (e.g. chain 5l4o_1_A) | 409 | else : #modified nucleotides (e.g. chain 5l4o_1_A) |
409 | het='H_' + nt | 410 | het='H_' + nt |
... | @@ -421,7 +422,15 @@ class Chain: | ... | @@ -421,7 +422,15 @@ class Chain: |
421 | res_atoms=res.get_atoms() | 422 | res_atoms=res.get_atoms() |
422 | new_residu_t=pdb.Residue.Residue(res_id, res_name, res.get_segid()) | 423 | new_residu_t=pdb.Residue.Residue(res_id, res_name, res.get_segid()) |
423 | for atom in list(res.get_atoms()): | 424 | for atom in list(res.get_atoms()): |
424 | - if atom.get_name() in ['PA', 'O1A', 'O2A', 'O3A']: | 425 | + # rename the remaining phosphate group to P, OP1, OP2, OP3 |
426 | + if atom.get_name() in ['PA', 'O1A', 'O2A', 'O3A'] and res_name != 'RIA': | ||
427 | + | ||
428 | + # RIA is a residue made up of 2 riboses and 2 phosphates, | ||
429 | + # so it has an O2A atom between the C2A and C1 'atoms, | ||
430 | + # and it also has an OP2 atom attached to one of its phosphates | ||
431 | + # (see chains 6fyx_1_1, 6zu9_1_1, 6fyy_1_1, 6gsm_1_1 , 3jaq_1_1 and 1yfg_1_A) | ||
432 | + # we do not modify the atom names of RIA residue | ||
433 | + | ||
425 | if atom.get_name() == 'PA': | 434 | if atom.get_name() == 'PA': |
426 | atom_name = 'P' | 435 | atom_name = 'P' |
427 | if atom.get_name() == 'O1A': | 436 | if atom.get_name() == 'O1A': |
... | @@ -1511,11 +1520,7 @@ class Pipeline: | ... | @@ -1511,11 +1520,7 @@ class Pipeline: |
1511 | if self.HOMOLOGY and not os.path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"): | 1520 | if self.HOMOLOGY and not os.path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"): |
1512 | # for the portions mapped to Rfam | 1521 | # for the portions mapped to Rfam |
1513 | os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") | 1522 | os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") |
1514 | - ''' | 1523 | + |
1515 | - if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"): | ||
1516 | - # extract chains of pure RNA | ||
1517 | - os.makedirs(path_to_3D_data + "rna_only") | ||
1518 | - ''' | ||
1519 | if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"): | 1524 | if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"): |
1520 | # extract chains of pure RNA | 1525 | # extract chains of pure RNA |
1521 | os.makedirs(path_to_3D_data + "rna_only") | 1526 | os.makedirs(path_to_3D_data + "rna_only") | ... | ... |
... | @@ -37,6 +37,7 @@ from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, no | ... | @@ -37,6 +37,7 @@ from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, no |
37 | from sklearn.mixture import GaussianMixture | 37 | from sklearn.mixture import GaussianMixture |
38 | import warnings | 38 | import warnings |
39 | from pandas.core.common import SettingWithCopyWarning | 39 | from pandas.core.common import SettingWithCopyWarning |
40 | +from joblib import Parallel, delayed | ||
40 | 41 | ||
41 | 42 | ||
42 | np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8) | 43 | np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8) |
... | @@ -1251,6 +1252,25 @@ def nt_3d_centers(cif_file, consider_all_atoms): | ... | @@ -1251,6 +1252,25 @@ def nt_3d_centers(cif_file, consider_all_atoms): |
1251 | result.append(res) | 1252 | result.append(res) |
1252 | return(result) | 1253 | return(result) |
1253 | 1254 | ||
1255 | +def liste_repres(fpath): | ||
1256 | + repres=[] | ||
1257 | + df=pd.read_csv(os.path.abspath(fpath)) | ||
1258 | + for i in range(df.shape[0]): | ||
1259 | + up_name=df["representative"][i] | ||
1260 | + if '+' in up_name: | ||
1261 | + up_name=up_name.split('+') | ||
1262 | + for i in range(len(up_name)): | ||
1263 | + chain=up_name[i].split('|') | ||
1264 | + chain=chain[0].lower()+'_'+chain[1]+'_'+chain[2] | ||
1265 | + repres.append(chain+'.cif') | ||
1266 | + else : | ||
1267 | + up_name=up_name.split('|') | ||
1268 | + low_name=up_name[0].lower()+'_'+up_name[1]+'_'+up_name[2] | ||
1269 | + repres.append(low_name+'.cif') | ||
1270 | + | ||
1271 | + return repres | ||
1272 | + | ||
1273 | + | ||
1254 | def get_euclidian_distance(L1, L2): | 1274 | def get_euclidian_distance(L1, L2): |
1255 | """ | 1275 | """ |
1256 | Returns the distance between two points (coordinates in lists) | 1276 | Returns the distance between two points (coordinates in lists) |
... | @@ -1504,14 +1524,16 @@ def measure_from_structure(f): | ... | @@ -1504,14 +1524,16 @@ def measure_from_structure(f): |
1504 | warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning) | 1524 | warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning) |
1505 | parser=MMCIFParser() | 1525 | parser=MMCIFParser() |
1506 | s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f)) | 1526 | s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f)) |
1507 | - | 1527 | + |
1508 | - measures_aa(name, s, thr_idx) | 1528 | + #pyle_measures(name, s, thr_idx) |
1529 | + #measures_aa(name, s, thr_idx) | ||
1509 | if DO_HIRE_RNA_MEASURES: | 1530 | if DO_HIRE_RNA_MEASURES: |
1510 | measures_hrna(name, s, thr_idx) | 1531 | measures_hrna(name, s, thr_idx) |
1511 | measures_hrna_basepairs(name, s, thr_idx) | 1532 | measures_hrna_basepairs(name, s, thr_idx) |
1512 | if DO_WADLEY_ANALYSIS: | 1533 | if DO_WADLEY_ANALYSIS: |
1513 | - measures_wadley(name, s, thr_idx) | 1534 | + #measures_wadley(name, s, thr_idx) |
1514 | - | 1535 | + pyle_measures(name, s, thr_idx) |
1536 | + | ||
1515 | idxQueue.put(thr_idx) # replace the thread index in the queue | 1537 | idxQueue.put(thr_idx) # replace the thread index in the queue |
1516 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | 1538 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") |
1517 | 1539 | ||
... | @@ -1931,6 +1953,128 @@ def measures_hrna_basepairs_chain(name, chain, df, thr_idx): | ... | @@ -1931,6 +1953,128 @@ def measures_hrna_basepairs_chain(name, chain, df, thr_idx): |
1931 | return results | 1953 | return results |
1932 | 1954 | ||
1933 | @trace_unhandled_exceptions | 1955 | @trace_unhandled_exceptions |
1956 | +def pyle_measures(name, s, thr_idx): | ||
1957 | + | ||
1958 | + if (path.isfile(runDir + '/results/geometry/Pyle/distances/distances_pyle_'+name+'.csv')): | ||
1959 | + return | ||
1960 | + | ||
1961 | + liste_dist=[] | ||
1962 | + #classes=[] | ||
1963 | + #for i in range(0, 150, 5): | ||
1964 | + #classes.append([i, i+5]) | ||
1965 | + #classes.append([150, 300]) | ||
1966 | + #occur_p_p=len(classes)*[0] | ||
1967 | + #occur_p_c1=len(classes)*[0] | ||
1968 | + #occur_p_c4=len(classes)*[0] | ||
1969 | + #occur_c1_c1=len(classes)*[0] | ||
1970 | + #occur_c4_c4=len(classes)*[0] | ||
1971 | + #nb_occurs=[] | ||
1972 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} pyle_measures({name})") | ||
1973 | + | ||
1974 | + chain = next(s[0].get_chains()) | ||
1975 | + #residues=list(chain.get_residues()) | ||
1976 | + for res1 in tqdm(chain, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} pyle_measures", unit="res", leave=False): | ||
1977 | + #res1=chain[i] | ||
1978 | + if res1.get_resname() in ["A", "C", "G", "U"]: | ||
1979 | + resnum1=list(res1.get_id())[1] | ||
1980 | + atom_p_1 = [ atom.get_coord() for atom in res1 if atom.get_name() == "P"] | ||
1981 | + atom_c1p_1 = [ atom.get_coord() for atom in res1 if "C1'" in atom.get_fullname() ] | ||
1982 | + atom_c4p_1 = [ atom.get_coord() for atom in res1 if "C4'" in atom.get_fullname() ] | ||
1983 | + for res2 in chain: | ||
1984 | + resnum2=list(res2.get_id())[1] | ||
1985 | + if resnum2-resnum1 < 4 : | ||
1986 | + continue | ||
1987 | + p_p=np.nan | ||
1988 | + p_c4p=np.nan | ||
1989 | + p_c1p=np.nan | ||
1990 | + c4p_c4p=np.nan | ||
1991 | + c1p_c1p=np.nan | ||
1992 | + #res2=chain[j] | ||
1993 | + if res2.get_resname() in ["A", "C", "G", "U"]: | ||
1994 | + | ||
1995 | + atom_p_2 = [ atom.get_coord() for atom in res2 if atom.get_name() == "P"] | ||
1996 | + atom_c1p_2 = [ atom.get_coord() for atom in res2 if "C1'" in atom.get_fullname() ] | ||
1997 | + atom_c4p_2 = [ atom.get_coord() for atom in res2 if "C4'" in atom.get_fullname() ] | ||
1998 | + | ||
1999 | + p_p = get_euclidian_distance(atom_p_1, atom_p_2) | ||
2000 | + p_c4p= get_euclidian_distance(atom_p_1, atom_c4p_2) | ||
2001 | + p_c1p= get_euclidian_distance(atom_p_1, atom_c1p_2) | ||
2002 | + c4p_c4p= get_euclidian_distance(atom_c4p_1, atom_c4p_2) | ||
2003 | + c1p_c1p= get_euclidian_distance(atom_c1p_1, atom_c1p_2) | ||
2004 | + | ||
2005 | + liste_dist.append([res1.get_resname(), int(resnum1), res2.get_resname(), int(resnum2), p_p, p_c4p, p_c1p, c4p_c4p, c1p_c1p]) | ||
2006 | + ''' | ||
2007 | + for x in range(len(classes)): | ||
2008 | + if classes[x][0] <= p_p <= classes[x][1]: | ||
2009 | + occur_p_p[x]=occur_p_p[x]+1 | ||
2010 | + if classes[x][0] <= p_c4p <= classes[x][1]: | ||
2011 | + occur_p_c4[x]=occur_p_c4[x]+1 | ||
2012 | + if classes[x][0] <= p_c1p <= classes[x][1]: | ||
2013 | + occur_p_c1[x]=occur_p_c1[x]+1 | ||
2014 | + if classes[x][0] <= c4p_c4p <= classes[x][1]: | ||
2015 | + occur_c4_c4[x]=occur_c4_c4[x]+1 | ||
2016 | + if classes[x][0] <= c1p_c1p <= classes[x][1]: | ||
2017 | + occur_c1_c1[x]=occur_c1_c1[x]+1 | ||
2018 | + ''' | ||
2019 | + #for x in range(len(classes)): | ||
2020 | + # for i in range(len(liste_dist)): | ||
2021 | + # if classes[x][0] <= liste_dist[i][4] <= classes[x][1]: | ||
2022 | + # occur_p_p[x]=occur_p_p[x]+1 | ||
2023 | + # if classes[x][0] <= liste_dist[i][5] <= classes[x][1]: | ||
2024 | + # occur_p_c4[x]=occur_p_c4[x]+1 | ||
2025 | + # if classes[x][0] <= liste_dist[i][6] <= classes[x][1]: | ||
2026 | + # occur_p_c1[x]=occur_p_c1[x]+1 | ||
2027 | + # if classes[x][0] <= liste_dist[i][7] <= classes[x][1]: | ||
2028 | + # occur_c4_c4[x]=occur_c4_c4[x]+1 | ||
2029 | + # if classes[x][0] <= liste_dist[i][8] <= classes[x][1]: | ||
2030 | + # occur_c1_c1[x]=occur_c1_c1[x]+1 | ||
2031 | + #nb_occurs.append([classes[x], occur_p_p[x], occur_p_c1[x], occur_p_c4[x], occur_c1_c1[x], occur_c4_c4[x]]) | ||
2032 | + #df = pd.DataFrame(nb_occurs, columns=["classe", "P-P", "P-C1'", "P-C4'", "C1'-C1'", "C4'-C4'"]) | ||
2033 | + # return df | ||
2034 | + # nb_occurs.append([classes, occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4]) | ||
2035 | + # print(nb_occurs) | ||
2036 | + # return nb_occurs | ||
2037 | + | ||
2038 | + | ||
2039 | + df = pd.DataFrame(liste_dist, columns=["res1", "resnum1", "res2", "resnum2", "P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"]) | ||
2040 | + df.to_csv(runDir + "/results/geometry/Pyle/distances/distances_pyle_" + name + ".csv") | ||
2041 | + | ||
2042 | +@trace_unhandled_exceptions | ||
2043 | +def count_occur_pyle_dist(fpath): | ||
2044 | + | ||
2045 | + global idxQueue | ||
2046 | + thr_idx = idxQueue.get() | ||
2047 | + setproctitle(f"Worker {thr_idx+1} : Extract occurences of {fpath}") | ||
2048 | + | ||
2049 | + liste=os.listdir(fpath) | ||
2050 | + pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing ", leave=False) | ||
2051 | + df = pd.read_csv(os.path.abspath(fpath + liste.pop())) | ||
2052 | + occur_p_p=list(df["P-P"]) | ||
2053 | + occur_p_c1=list(df["P-C1'"]) | ||
2054 | + occur_p_c4=list(df["P-C4'"]) | ||
2055 | + occur_c1_c1=list(df["C1'-C1'"]) | ||
2056 | + occur_c4_c4=list(df["C4'-C4'"]) | ||
2057 | + nb_occurs=[] | ||
2058 | + for f in range(len(liste)): | ||
2059 | + df = pd.read_csv(os.path.abspath(fpath + liste.pop())) | ||
2060 | + # print(liste[f]) | ||
2061 | + for k in range(df.shape[0]): | ||
2062 | + occur_p_p[k]=occur_p_p[k]+df["P-P"][k] | ||
2063 | + occur_p_c1[k]=occur_p_c1[k]+df["P-C1'"][k] | ||
2064 | + occur_p_c4[k]=occur_p_c4[k]+df["P-C4'"][k] | ||
2065 | + occur_c1_c1[k]=occur_c1_c1[k]+df["C1'-C1'"][k] | ||
2066 | + occur_c4_c4[k]=occur_c4_c4[k]+df["C4'-C4'"][k] | ||
2067 | + pbar.update(1) | ||
2068 | + nb_occurs=[occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4] | ||
2069 | + # return(list(df["classe"]), occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4) | ||
2070 | + df = pd.DataFrame(nb_occurs, columns=list(df["classe"])) | ||
2071 | + | ||
2072 | + df.to_csv(runDir + "/results/geometry/Pyle/classes_dist/occurences_dist.csv") | ||
2073 | + idxQueue.put(thr_idx) # replace the thread index in the queue | ||
2074 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
2075 | + | ||
2076 | + | ||
2077 | +@trace_unhandled_exceptions | ||
1934 | def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True) : | 2078 | def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True) : |
1935 | """ | 2079 | """ |
1936 | Plot Gaussian-Mixture-Model (with or without histograms) | 2080 | Plot Gaussian-Mixture-Model (with or without histograms) |
... | @@ -2012,7 +2156,8 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True) | ... | @@ -2012,7 +2156,8 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True) |
2012 | else: | 2156 | else: |
2013 | D = obs.ravel() | 2157 | D = obs.ravel() |
2014 | xmin = D.min() | 2158 | xmin = D.min() |
2015 | - xmax = min(10.0, D.max()) | 2159 | + #xmax = min(10.0, D.max()) |
2160 | + xmax = D.max() | ||
2016 | x = np.linspace(xmin,xmax,1000) | 2161 | x = np.linspace(xmin,xmax,1000) |
2017 | colors=['red', 'blue', 'gold', 'cyan', 'magenta', 'white', 'black', 'green'] | 2162 | colors=['red', 'blue', 'gold', 'cyan', 'magenta', 'white', 'black', 'green'] |
2018 | 2163 | ||
... | @@ -2372,7 +2517,7 @@ def gmm_wadley(): | ... | @@ -2372,7 +2517,7 @@ def gmm_wadley(): |
2372 | plt.savefig("GMM_flat_angles_pyle_model.png") | 2517 | plt.savefig("GMM_flat_angles_pyle_model.png") |
2373 | plt.close() | 2518 | plt.close() |
2374 | 2519 | ||
2375 | - # Torsion anfles | 2520 | + # Torsion angles |
2376 | eta=[] | 2521 | eta=[] |
2377 | theta=[] | 2522 | theta=[] |
2378 | eta_prime=[] | 2523 | eta_prime=[] |
... | @@ -2420,6 +2565,58 @@ def gmm_wadley(): | ... | @@ -2420,6 +2565,58 @@ def gmm_wadley(): |
2420 | os.chdir(runDir) | 2565 | os.chdir(runDir) |
2421 | setproctitle("GMM (Pyle model) finished") | 2566 | setproctitle("GMM (Pyle model) finished") |
2422 | 2567 | ||
2568 | +def gmm_pyle_type(ntpair, data): | ||
2569 | + | ||
2570 | + setproctitle(f"GMM (Pyle {ntpair} )") | ||
2571 | + | ||
2572 | + os.makedirs(runDir + "/results/figures/GMM/Pyle/distances/", exist_ok=True) | ||
2573 | + os.chdir(runDir + "/results/figures/GMM/Pyle/distances/") | ||
2574 | + | ||
2575 | + p_p=list(data["P-P"][~ np.isnan(data["P-P"])]) | ||
2576 | + p_c4p=list(data["P-C4'"][~ np.isnan(data["P-C4'"])]) | ||
2577 | + p_c1p=list(data["P-C1'"][~ np.isnan(data["P-C1'"])]) | ||
2578 | + c4p_c4p=list(data["C4'-C4'"][~ np.isnan(data["C4'-C4'"])]) | ||
2579 | + c1p_c1p=list(data["C1'-C1'"][~ np.isnan(data["C1'-C1'"])]) | ||
2580 | + print(len(p_p)) | ||
2581 | + # res2=list(data["resnum2"]) | ||
2582 | + # res1=list(data["resnum1"]) | ||
2583 | + # diff=[] | ||
2584 | + # for i in range(len(res1)): | ||
2585 | + # diff.append(res2[i]-res1[i]) | ||
2586 | + # print(diff[:100]) | ||
2587 | + | ||
2588 | + GMM_histo(p_p, f"Distance P-P between {ntpair} tips for {str(len(p_p))} values", toric=False, hist=False, col="cyan") | ||
2589 | + GMM_histo(p_c4p, f"Distance P-C4' between {ntpair} tips", toric=False, hist=False, col="tomato") | ||
2590 | + GMM_histo(p_c1p, f"Distance P-C1' between {ntpair} tips", toric=False, hist=False, col="goldenrod") | ||
2591 | + GMM_histo(c4p_c4p, f"Distance C4'-C4' between {ntpair} tips", toric=False, hist=False, col="magenta") | ||
2592 | + GMM_histo(c1p_c1p, f"Distance C1'-C1' between {ntpair} tips", toric=False, hist=False, col="black") | ||
2593 | + # GMM_histo(diff, f"Gap between {ntpair} tips", toric=False, hist=False, col="tomato") | ||
2594 | + plt.xlabel("Distance (Angströms)") | ||
2595 | + | ||
2596 | + # plt.xlabel("Number of residues") | ||
2597 | + plt.ylabel("Distance (Angströms)") | ||
2598 | + plt.title(f"GMM of distances for {ntpair} ", fontsize=10) | ||
2599 | + | ||
2600 | + # plt.savefig(f"Longueurs_Pyle_{ntpair}.png" ) | ||
2601 | + plt.savefig(f"Distances_Pyle_{ntpair}.png" ) | ||
2602 | + plt.close() | ||
2603 | + setproctitle(f"GMM (Pyle {ntpair} distances) finished") | ||
2604 | + | ||
2605 | +def gmm_pyle(): | ||
2606 | + | ||
2607 | + setproctitle("GMM (Pyle model)") | ||
2608 | + | ||
2609 | + df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/distances/distances.csv")) | ||
2610 | + | ||
2611 | + # dist = ["P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"] | ||
2612 | + data=df | ||
2613 | + if len(data): | ||
2614 | + for b1 in ['A','C','G','U']: | ||
2615 | + for b2 in ['A','C','G','U']: | ||
2616 | + thisbases = data[(data.res1 == b1)&(data.res2 == b2)] | ||
2617 | + if len(thisbases): | ||
2618 | + gmm_pyle_type(b1+b2, thisbases) | ||
2619 | + | ||
2423 | @trace_unhandled_exceptions | 2620 | @trace_unhandled_exceptions |
2424 | def gmm_hrna(): | 2621 | def gmm_hrna(): |
2425 | """ | 2622 | """ |
... | @@ -2669,7 +2866,11 @@ def merge_jsons(): | ... | @@ -2669,7 +2866,11 @@ def merge_jsons(): |
2669 | os.remove(f) | 2866 | os.remove(f) |
2670 | except FileNotFoundError: | 2867 | except FileNotFoundError: |
2671 | pass | 2868 | pass |
2672 | - | 2869 | + |
2870 | +@trace_unhandled_exceptions | ||
2871 | +def loop(f): | ||
2872 | + return pd.read_csv(f) | ||
2873 | + | ||
2673 | @trace_unhandled_exceptions | 2874 | @trace_unhandled_exceptions |
2674 | def concat_dataframes(fpath, outfilename): | 2875 | def concat_dataframes(fpath, outfilename): |
2675 | """ | 2876 | """ |
... | @@ -2682,14 +2883,23 @@ def concat_dataframes(fpath, outfilename): | ... | @@ -2682,14 +2883,23 @@ def concat_dataframes(fpath, outfilename): |
2682 | 2883 | ||
2683 | liste = os.listdir(fpath) | 2884 | liste = os.listdir(fpath) |
2684 | pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing "+outfilename, leave=False) | 2885 | pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing "+outfilename, leave=False) |
2685 | - | 2886 | + df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop()), engine="python") |
2686 | - df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop())) | 2887 | + #df=Parallel(n_jobs=-1, verbose=20)(delayed(loop)(os.path.abspath(fpath+liste[f])) for f in range (len(liste))) |
2888 | + #except : | ||
2889 | + # print(liste[f]) | ||
2890 | + | ||
2687 | pbar.update(1) | 2891 | pbar.update(1) |
2688 | for f in range(len(liste)): | 2892 | for f in range(len(liste)): |
2689 | - df = pd.read_csv(os.path.abspath(fpath + liste.pop())) | 2893 | + # try : |
2894 | + df = pd.read_csv(os.path.abspath(fpath + liste.pop()), engine='python') | ||
2895 | + # except : | ||
2896 | + # print(liste[f]) | ||
2897 | + # continue | ||
2690 | df_tot = pd.concat([df_tot, df], ignore_index=True) | 2898 | df_tot = pd.concat([df_tot, df], ignore_index=True) |
2691 | pbar.update(1) | 2899 | pbar.update(1) |
2692 | - | 2900 | + #df = pd.concat(df, ignore_index=True) |
2901 | + #pbar.update(1) | ||
2902 | + #df.to_csv(fpath + outfilename) | ||
2693 | df_tot.to_csv(fpath + outfilename) | 2903 | df_tot.to_csv(fpath + outfilename) |
2694 | idxQueue.put(thr_idx) # replace the thread index in the queue | 2904 | idxQueue.put(thr_idx) # replace the thread index in the queue |
2695 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | 2905 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") |
... | @@ -2789,6 +2999,7 @@ if __name__ == "__main__": | ... | @@ -2789,6 +2999,7 @@ if __name__ == "__main__": |
2789 | elif opt=='--wadley': | 2999 | elif opt=='--wadley': |
2790 | DO_WADLEY_ANALYSIS = True | 3000 | DO_WADLEY_ANALYSIS = True |
2791 | os.makedirs(runDir+"/results/geometry/Pyle/distances/", exist_ok=True) | 3001 | os.makedirs(runDir+"/results/geometry/Pyle/distances/", exist_ok=True) |
3002 | + os.makedirs(runDir+"/results/geometry/Pyle/classes_dist/", exist_ok=True) | ||
2792 | os.makedirs(runDir+"/results/geometry/Pyle/angles/", exist_ok=True) | 3003 | os.makedirs(runDir+"/results/geometry/Pyle/angles/", exist_ok=True) |
2793 | os.makedirs(runDir+"/results/geometry/Pyle/pseudotorsions/", exist_ok=True) | 3004 | os.makedirs(runDir+"/results/geometry/Pyle/pseudotorsions/", exist_ok=True) |
2794 | os.makedirs(runDir+"/results/figures/GMM/Pyle/distances/", exist_ok=True) | 3005 | os.makedirs(runDir+"/results/figures/GMM/Pyle/distances/", exist_ok=True) |
... | @@ -2853,9 +3064,9 @@ if __name__ == "__main__": | ... | @@ -2853,9 +3064,9 @@ if __name__ == "__main__": |
2853 | joblist = [] | 3064 | joblist = [] |
2854 | 3065 | ||
2855 | # Do eta/theta plots | 3066 | # Do eta/theta plots |
2856 | - if n_unmapped_chains and DO_WADLEY_ANALYSIS: | 3067 | + #if n_unmapped_chains and DO_WADLEY_ANALYSIS: |
2857 | - joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) | 3068 | + # joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) |
2858 | - joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) | 3069 | + # joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) |
2859 | 3070 | ||
2860 | # Do distance matrices for each family excl. LSU/SSU (will be processed later) | 3071 | # Do distance matrices for each family excl. LSU/SSU (will be processed later) |
2861 | if DO_AVG_DISTANCE_MATRIX: | 3072 | if DO_AVG_DISTANCE_MATRIX: |
... | @@ -2881,21 +3092,24 @@ if __name__ == "__main__": | ... | @@ -2881,21 +3092,24 @@ if __name__ == "__main__": |
2881 | 3092 | ||
2882 | 3093 | ||
2883 | # Do geometric measures on all chains | 3094 | # Do geometric measures on all chains |
3095 | + #print(liste_repres('/home/data/RNA/3D/latest_nr_list_4.0A.csv')) | ||
2884 | 3096 | ||
2885 | - # if n_unmapped_chains: | 3097 | + # print(measure_from_structure(os.listdir(path_to_3D_data + "rna_only")[0])) |
3098 | + if n_unmapped_chains: | ||
2886 | # os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True) | 3099 | # os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True) |
2887 | - # liste_struct = os.listdir(path_to_3D_data + "rna_only") | 3100 | + # liste_struct = os.listdir(path_to_3D_data + "rna_only") |
3101 | + liste_struct=liste_repres('/home/data/RNA/3D/latest_nr_list_4.0A.csv') | ||
2888 | # if '4zdo_1_E.cif' in liste_struct: | 3102 | # if '4zdo_1_E.cif' in liste_struct: |
2889 | # liste_struct.remove('4zdo_1_E.cif') # weird cases to remove for now | 3103 | # liste_struct.remove('4zdo_1_E.cif') # weird cases to remove for now |
2890 | # if '4zdp_1_E.cif' in liste_struct: | 3104 | # if '4zdp_1_E.cif' in liste_struct: |
2891 | # liste_struct.remove('4zdp_1_E.cif') | 3105 | # liste_struct.remove('4zdp_1_E.cif') |
2892 | - # for f in liste_struct: | 3106 | + for f in liste_struct: |
2893 | - # if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]): | 3107 | + if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]): |
2894 | - # joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances | 3108 | + joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances |
2895 | 3109 | ||
2896 | 3110 | ||
2897 | - # process_jobs(joblist) | 3111 | + process_jobs(joblist) |
2898 | - | 3112 | + #count_occur_pyle_dist(runDir + '/results/geometry/Pyle/classes_dist/') |
2899 | # Now process the memory-heavy tasks family by family | 3113 | # Now process the memory-heavy tasks family by family |
2900 | if DO_AVG_DISTANCE_MATRIX: | 3114 | if DO_AVG_DISTANCE_MATRIX: |
2901 | for f in LSU_set: | 3115 | for f in LSU_set: |
... | @@ -2913,12 +3127,13 @@ if __name__ == "__main__": | ... | @@ -2913,12 +3127,13 @@ if __name__ == "__main__": |
2913 | # per_chain_stats() # per chain base frequencies en basepair types | 3127 | # per_chain_stats() # per chain base frequencies en basepair types |
2914 | # seq_idty() # identity matrices from pre-computed .npy matrices | 3128 | # seq_idty() # identity matrices from pre-computed .npy matrices |
2915 | # stats_pairs() | 3129 | # stats_pairs() |
2916 | - | 3130 | + concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances.csv') |
2917 | if n_unmapped_chains: | 3131 | if n_unmapped_chains: |
2918 | # general_stats() | 3132 | # general_stats() |
2919 | os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True) | 3133 | os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True) |
2920 | os.makedirs(runDir+"/results/geometry/json/", exist_ok=True) | 3134 | os.makedirs(runDir+"/results/geometry/json/", exist_ok=True) |
2921 | - # joblist = [] | 3135 | + joblist = [] |
3136 | + # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/Pyle/distances/', 'distances.csv'))) | ||
2922 | # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv'))) | 3137 | # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv'))) |
2923 | # if DO_HIRE_RNA_MEASURES: | 3138 | # if DO_HIRE_RNA_MEASURES: |
2924 | # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv'))) | 3139 | # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv'))) |
... | @@ -2935,9 +3150,10 @@ if __name__ == "__main__": | ... | @@ -2935,9 +3150,10 @@ if __name__ == "__main__": |
2935 | if DO_HIRE_RNA_MEASURES: | 3150 | if DO_HIRE_RNA_MEASURES: |
2936 | # joblist.append(Job(function=gmm_hrna, args=())) | 3151 | # joblist.append(Job(function=gmm_hrna, args=())) |
2937 | joblist.append(Job(function=gmm_hrna_basepairs, args=())) | 3152 | joblist.append(Job(function=gmm_hrna_basepairs, args=())) |
2938 | - if DO_WADLEY_ANALYSIS: | 3153 | + # if DO_WADLEY_ANALYSIS: |
2939 | - joblist.append(Job(function=gmm_wadley, args=())) | 3154 | + # joblist.append(Job(function=gmm_wadley, args=())) |
3155 | + # joblist.append(Job(function=gmm_pyle, args=())) | ||
2940 | if len(joblist): | 3156 | if len(joblist): |
2941 | process_jobs(joblist) | 3157 | process_jobs(joblist) |
2942 | - merge_jsons() | 3158 | + #merge_jsons() |
2943 | 3159 | ... | ... |
-
mentioned in commit e4282475
-
Please register or login to post a comment