Louis BECQUEY

Merge branch 'stage_aglae'

...@@ -159,14 +159,14 @@ class SelectivePortionSelector(object): ...@@ -159,14 +159,14 @@ class SelectivePortionSelector(object):
159 _select=Select() 159 _select=Select()
160 160
161 def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False): 161 def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False):
162 + # Reuse and modify Biopython source code
163 + # so that the two residue-numbering columns are numbered with the index_chain from DSSR
162 if isinstance(out_file, str): 164 if isinstance(out_file, str):
163 fp = open(out_file, "w") 165 fp = open(out_file, "w")
164 close_file = True 166 close_file = True
165 else: 167 else:
166 fp = out_file 168 fp = out_file
167 close_file = False 169 close_file = False
168 - #fp = open(out_file, "w")
169 - #close_file=True
170 atom_dict = defaultdict(list) 170 atom_dict = defaultdict(list)
171 171
172 for model in ioobj.structure.get_list(): 172 for model in ioobj.structure.get_list():
...@@ -188,7 +188,7 @@ def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False): ...@@ -188,7 +188,7 @@ def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False):
188 chain_id = chain.get_id() 188 chain_id = chain.get_id()
189 if chain_id == " ": 189 if chain_id == " ":
190 chain_id = "." 190 chain_id = "."
191 - # This is used to write label_seq_id and increments from 1, 191 + # This is used to write label_seq_id,
192 # remaining blank for hetero residues 192 # remaining blank for hetero residues
193 193
194 prev_residue_type = "" 194 prev_residue_type = ""
...@@ -402,8 +402,9 @@ class Chain: ...@@ -402,8 +402,9 @@ class Chain:
402 resseq=int(resseq) 402 resseq=int(resseq)
403 index_chain=nums.at[i, "index_chain"] 403 index_chain=nums.at[i, "index_chain"]
404 nt=nums.at[i, "nt_name"] 404 nt=nums.at[i, "nt_name"]
405 - 405 +
406 - if nt == 'A' or nt == 'G' or nt == 'C' or nt == 'U' or nt in ['DG', 'DU', 'DC', 'DA', 'DI', 'DT' ] or nt == 'N' or nt == 'I' : 406 + # particular case 6n5s_1_A, residue 201 in the original cif file (resname = G and HETATM = H_G)
407 + if nt == 'A' or (nt == 'G' and (self.chain_label != '6n5s_1_A' or resseq != 201)) or nt == 'C' or nt == 'U' or nt in ['DG', 'DU', 'DC', 'DA', 'DI', 'DT' ] or nt == 'N' or nt == 'I' :
407 res=chain[(' ', resseq, icode_res)] 408 res=chain[(' ', resseq, icode_res)]
408 else : #modified nucleotides (e.g. chain 5l4o_1_A) 409 else : #modified nucleotides (e.g. chain 5l4o_1_A)
409 het='H_' + nt 410 het='H_' + nt
...@@ -421,7 +422,15 @@ class Chain: ...@@ -421,7 +422,15 @@ class Chain:
421 res_atoms=res.get_atoms() 422 res_atoms=res.get_atoms()
422 new_residu_t=pdb.Residue.Residue(res_id, res_name, res.get_segid()) 423 new_residu_t=pdb.Residue.Residue(res_id, res_name, res.get_segid())
423 for atom in list(res.get_atoms()): 424 for atom in list(res.get_atoms()):
424 - if atom.get_name() in ['PA', 'O1A', 'O2A', 'O3A']: 425 + # rename the remaining phosphate group to P, OP1, OP2, OP3
426 + if atom.get_name() in ['PA', 'O1A', 'O2A', 'O3A'] and res_name != 'RIA':
427 +
428 + # RIA is a residue made up of 2 riboses and 2 phosphates,
429 + # so it has an O2A atom between the C2A and C1' atoms,
430 + # and it also has an OP2 atom attached to one of its phosphates
431 + # (see chains 6fyx_1_1, 6zu9_1_1, 6fyy_1_1, 6gsm_1_1, 3jaq_1_1 and 1yfg_1_A)
432 + # we do not modify the atom names of RIA residue
433 +
425 if atom.get_name() == 'PA': 434 if atom.get_name() == 'PA':
426 atom_name = 'P' 435 atom_name = 'P'
427 if atom.get_name() == 'O1A': 436 if atom.get_name() == 'O1A':
...@@ -1511,11 +1520,7 @@ class Pipeline: ...@@ -1511,11 +1520,7 @@ class Pipeline:
1511 if self.HOMOLOGY and not os.path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"): 1520 if self.HOMOLOGY and not os.path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
1512 # for the portions mapped to Rfam 1521 # for the portions mapped to Rfam
1513 os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") 1522 os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam")
1514 - ''' 1523 +
1515 - if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"):
1516 - # extract chains of pure RNA
1517 - os.makedirs(path_to_3D_data + "rna_only")
1518 - '''
1519 if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"): 1524 if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"):
1520 # extract chains of pure RNA 1525 # extract chains of pure RNA
1521 os.makedirs(path_to_3D_data + "rna_only") 1526 os.makedirs(path_to_3D_data + "rna_only")
......
...@@ -37,6 +37,7 @@ from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, no ...@@ -37,6 +37,7 @@ from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, no
37 from sklearn.mixture import GaussianMixture 37 from sklearn.mixture import GaussianMixture
38 import warnings 38 import warnings
39 from pandas.core.common import SettingWithCopyWarning 39 from pandas.core.common import SettingWithCopyWarning
40 +from joblib import Parallel, delayed
40 41
41 42
42 np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8) 43 np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8)
...@@ -1251,6 +1252,25 @@ def nt_3d_centers(cif_file, consider_all_atoms): ...@@ -1251,6 +1252,25 @@ def nt_3d_centers(cif_file, consider_all_atoms):
1251 result.append(res) 1252 result.append(res)
1252 return(result) 1253 return(result)
1253 1254
1255 +def liste_repres(fpath):
1256 + repres=[]
1257 + df=pd.read_csv(os.path.abspath(fpath))
1258 + for i in range(df.shape[0]):
1259 + up_name=df["representative"][i]
1260 + if '+' in up_name:
1261 + up_name=up_name.split('+')
1262 + for i in range(len(up_name)):
1263 + chain=up_name[i].split('|')
1264 + chain=chain[0].lower()+'_'+chain[1]+'_'+chain[2]
1265 + repres.append(chain+'.cif')
1266 + else :
1267 + up_name=up_name.split('|')
1268 + low_name=up_name[0].lower()+'_'+up_name[1]+'_'+up_name[2]
1269 + repres.append(low_name+'.cif')
1270 +
1271 + return repres
1272 +
1273 +
1254 def get_euclidian_distance(L1, L2): 1274 def get_euclidian_distance(L1, L2):
1255 """ 1275 """
1256 Returns the distance between two points (coordinates in lists) 1276 Returns the distance between two points (coordinates in lists)
...@@ -1504,14 +1524,16 @@ def measure_from_structure(f): ...@@ -1504,14 +1524,16 @@ def measure_from_structure(f):
1504 warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning) 1524 warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
1505 parser=MMCIFParser() 1525 parser=MMCIFParser()
1506 s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f)) 1526 s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f))
1507 - 1527 +
1508 - measures_aa(name, s, thr_idx) 1528 + #pyle_measures(name, s, thr_idx)
1529 + #measures_aa(name, s, thr_idx)
1509 if DO_HIRE_RNA_MEASURES: 1530 if DO_HIRE_RNA_MEASURES:
1510 measures_hrna(name, s, thr_idx) 1531 measures_hrna(name, s, thr_idx)
1511 measures_hrna_basepairs(name, s, thr_idx) 1532 measures_hrna_basepairs(name, s, thr_idx)
1512 if DO_WADLEY_ANALYSIS: 1533 if DO_WADLEY_ANALYSIS:
1513 - measures_wadley(name, s, thr_idx) 1534 + #measures_wadley(name, s, thr_idx)
1514 - 1535 + pyle_measures(name, s, thr_idx)
1536 +
1515 idxQueue.put(thr_idx) # replace the thread index in the queue 1537 idxQueue.put(thr_idx) # replace the thread index in the queue
1516 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") 1538 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
1517 1539
...@@ -1931,6 +1953,128 @@ def measures_hrna_basepairs_chain(name, chain, df, thr_idx): ...@@ -1931,6 +1953,128 @@ def measures_hrna_basepairs_chain(name, chain, df, thr_idx):
1931 return results 1953 return results
1932 1954
1933 @trace_unhandled_exceptions 1955 @trace_unhandled_exceptions
1956 +def pyle_measures(name, s, thr_idx):
1957 +
1958 + if (path.isfile(runDir + '/results/geometry/Pyle/distances/distances_pyle_'+name+'.csv')):
1959 + return
1960 +
1961 + liste_dist=[]
1962 + #classes=[]
1963 + #for i in range(0, 150, 5):
1964 + #classes.append([i, i+5])
1965 + #classes.append([150, 300])
1966 + #occur_p_p=len(classes)*[0]
1967 + #occur_p_c1=len(classes)*[0]
1968 + #occur_p_c4=len(classes)*[0]
1969 + #occur_c1_c1=len(classes)*[0]
1970 + #occur_c4_c4=len(classes)*[0]
1971 + #nb_occurs=[]
1972 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} pyle_measures({name})")
1973 +
1974 + chain = next(s[0].get_chains())
1975 + #residues=list(chain.get_residues())
1976 + for res1 in tqdm(chain, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} pyle_measures", unit="res", leave=False):
1977 + #res1=chain[i]
1978 + if res1.get_resname() in ["A", "C", "G", "U"]:
1979 + resnum1=list(res1.get_id())[1]
1980 + atom_p_1 = [ atom.get_coord() for atom in res1 if atom.get_name() == "P"]
1981 + atom_c1p_1 = [ atom.get_coord() for atom in res1 if "C1'" in atom.get_fullname() ]
1982 + atom_c4p_1 = [ atom.get_coord() for atom in res1 if "C4'" in atom.get_fullname() ]
1983 + for res2 in chain:
1984 + resnum2=list(res2.get_id())[1]
1985 + if resnum2-resnum1 < 4 :
1986 + continue
1987 + p_p=np.nan
1988 + p_c4p=np.nan
1989 + p_c1p=np.nan
1990 + c4p_c4p=np.nan
1991 + c1p_c1p=np.nan
1992 + #res2=chain[j]
1993 + if res2.get_resname() in ["A", "C", "G", "U"]:
1994 +
1995 + atom_p_2 = [ atom.get_coord() for atom in res2 if atom.get_name() == "P"]
1996 + atom_c1p_2 = [ atom.get_coord() for atom in res2 if "C1'" in atom.get_fullname() ]
1997 + atom_c4p_2 = [ atom.get_coord() for atom in res2 if "C4'" in atom.get_fullname() ]
1998 +
1999 + p_p = get_euclidian_distance(atom_p_1, atom_p_2)
2000 + p_c4p= get_euclidian_distance(atom_p_1, atom_c4p_2)
2001 + p_c1p= get_euclidian_distance(atom_p_1, atom_c1p_2)
2002 + c4p_c4p= get_euclidian_distance(atom_c4p_1, atom_c4p_2)
2003 + c1p_c1p= get_euclidian_distance(atom_c1p_1, atom_c1p_2)
2004 +
2005 + liste_dist.append([res1.get_resname(), int(resnum1), res2.get_resname(), int(resnum2), p_p, p_c4p, p_c1p, c4p_c4p, c1p_c1p])
2006 + '''
2007 + for x in range(len(classes)):
2008 + if classes[x][0] <= p_p <= classes[x][1]:
2009 + occur_p_p[x]=occur_p_p[x]+1
2010 + if classes[x][0] <= p_c4p <= classes[x][1]:
2011 + occur_p_c4[x]=occur_p_c4[x]+1
2012 + if classes[x][0] <= p_c1p <= classes[x][1]:
2013 + occur_p_c1[x]=occur_p_c1[x]+1
2014 + if classes[x][0] <= c4p_c4p <= classes[x][1]:
2015 + occur_c4_c4[x]=occur_c4_c4[x]+1
2016 + if classes[x][0] <= c1p_c1p <= classes[x][1]:
2017 + occur_c1_c1[x]=occur_c1_c1[x]+1
2018 + '''
2019 + #for x in range(len(classes)):
2020 + # for i in range(len(liste_dist)):
2021 + # if classes[x][0] <= liste_dist[i][4] <= classes[x][1]:
2022 + # occur_p_p[x]=occur_p_p[x]+1
2023 + # if classes[x][0] <= liste_dist[i][5] <= classes[x][1]:
2024 + # occur_p_c4[x]=occur_p_c4[x]+1
2025 + # if classes[x][0] <= liste_dist[i][6] <= classes[x][1]:
2026 + # occur_p_c1[x]=occur_p_c1[x]+1
2027 + # if classes[x][0] <= liste_dist[i][7] <= classes[x][1]:
2028 + # occur_c4_c4[x]=occur_c4_c4[x]+1
2029 + # if classes[x][0] <= liste_dist[i][8] <= classes[x][1]:
2030 + # occur_c1_c1[x]=occur_c1_c1[x]+1
2031 + #nb_occurs.append([classes[x], occur_p_p[x], occur_p_c1[x], occur_p_c4[x], occur_c1_c1[x], occur_c4_c4[x]])
2032 + #df = pd.DataFrame(nb_occurs, columns=["classe", "P-P", "P-C1'", "P-C4'", "C1'-C1'", "C4'-C4'"])
2033 + # return df
2034 + # nb_occurs.append([classes, occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4])
2035 + # print(nb_occurs)
2036 + # return nb_occurs
2037 +
2038 +
2039 + df = pd.DataFrame(liste_dist, columns=["res1", "resnum1", "res2", "resnum2", "P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"])
2040 + df.to_csv(runDir + "/results/geometry/Pyle/distances/distances_pyle_" + name + ".csv")
2041 +
2042 +@trace_unhandled_exceptions
2043 +def count_occur_pyle_dist(fpath):
2044 +
2045 + global idxQueue
2046 + thr_idx = idxQueue.get()
2047 + setproctitle(f"Worker {thr_idx+1} : Extract occurences of {fpath}")
2048 +
2049 + liste=os.listdir(fpath)
2050 + pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing ", leave=False)
2051 + df = pd.read_csv(os.path.abspath(fpath + liste.pop()))
2052 + occur_p_p=list(df["P-P"])
2053 + occur_p_c1=list(df["P-C1'"])
2054 + occur_p_c4=list(df["P-C4'"])
2055 + occur_c1_c1=list(df["C1'-C1'"])
2056 + occur_c4_c4=list(df["C4'-C4'"])
2057 + nb_occurs=[]
2058 + for f in range(len(liste)):
2059 + df = pd.read_csv(os.path.abspath(fpath + liste.pop()))
2060 + # print(liste[f])
2061 + for k in range(df.shape[0]):
2062 + occur_p_p[k]=occur_p_p[k]+df["P-P"][k]
2063 + occur_p_c1[k]=occur_p_c1[k]+df["P-C1'"][k]
2064 + occur_p_c4[k]=occur_p_c4[k]+df["P-C4'"][k]
2065 + occur_c1_c1[k]=occur_c1_c1[k]+df["C1'-C1'"][k]
2066 + occur_c4_c4[k]=occur_c4_c4[k]+df["C4'-C4'"][k]
2067 + pbar.update(1)
2068 + nb_occurs=[occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4]
2069 + # return(list(df["classe"]), occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4)
2070 + df = pd.DataFrame(nb_occurs, columns=list(df["classe"]))
2071 +
2072 + df.to_csv(runDir + "/results/geometry/Pyle/classes_dist/occurences_dist.csv")
2073 + idxQueue.put(thr_idx) # replace the thread index in the queue
2074 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
2075 +
2076 +
2077 +@trace_unhandled_exceptions
1934 def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True) : 2078 def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True) :
1935 """ 2079 """
1936 Plot Gaussian-Mixture-Model (with or without histograms) 2080 Plot Gaussian-Mixture-Model (with or without histograms)
...@@ -2012,7 +2156,8 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True) ...@@ -2012,7 +2156,8 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True)
2012 else: 2156 else:
2013 D = obs.ravel() 2157 D = obs.ravel()
2014 xmin = D.min() 2158 xmin = D.min()
2015 - xmax = min(10.0, D.max()) 2159 + #xmax = min(10.0, D.max())
2160 + xmax = D.max()
2016 x = np.linspace(xmin,xmax,1000) 2161 x = np.linspace(xmin,xmax,1000)
2017 colors=['red', 'blue', 'gold', 'cyan', 'magenta', 'white', 'black', 'green'] 2162 colors=['red', 'blue', 'gold', 'cyan', 'magenta', 'white', 'black', 'green']
2018 2163
...@@ -2372,7 +2517,7 @@ def gmm_wadley(): ...@@ -2372,7 +2517,7 @@ def gmm_wadley():
2372 plt.savefig("GMM_flat_angles_pyle_model.png") 2517 plt.savefig("GMM_flat_angles_pyle_model.png")
2373 plt.close() 2518 plt.close()
2374 2519
2375 - # Torsion anfles 2520 + # Torsion angles
2376 eta=[] 2521 eta=[]
2377 theta=[] 2522 theta=[]
2378 eta_prime=[] 2523 eta_prime=[]
...@@ -2420,6 +2565,58 @@ def gmm_wadley(): ...@@ -2420,6 +2565,58 @@ def gmm_wadley():
2420 os.chdir(runDir) 2565 os.chdir(runDir)
2421 setproctitle("GMM (Pyle model) finished") 2566 setproctitle("GMM (Pyle model) finished")
2422 2567
2568 +def gmm_pyle_type(ntpair, data):
2569 +
2570 + setproctitle(f"GMM (Pyle {ntpair} )")
2571 +
2572 + os.makedirs(runDir + "/results/figures/GMM/Pyle/distances/", exist_ok=True)
2573 + os.chdir(runDir + "/results/figures/GMM/Pyle/distances/")
2574 +
2575 + p_p=list(data["P-P"][~ np.isnan(data["P-P"])])
2576 + p_c4p=list(data["P-C4'"][~ np.isnan(data["P-C4'"])])
2577 + p_c1p=list(data["P-C1'"][~ np.isnan(data["P-C1'"])])
2578 + c4p_c4p=list(data["C4'-C4'"][~ np.isnan(data["C4'-C4'"])])
2579 + c1p_c1p=list(data["C1'-C1'"][~ np.isnan(data["C1'-C1'"])])
2580 + print(len(p_p))
2581 + # res2=list(data["resnum2"])
2582 + # res1=list(data["resnum1"])
2583 + # diff=[]
2584 + # for i in range(len(res1)):
2585 + # diff.append(res2[i]-res1[i])
2586 + # print(diff[:100])
2587 +
2588 + GMM_histo(p_p, f"Distance P-P between {ntpair} tips for {str(len(p_p))} values", toric=False, hist=False, col="cyan")
2589 + GMM_histo(p_c4p, f"Distance P-C4' between {ntpair} tips", toric=False, hist=False, col="tomato")
2590 + GMM_histo(p_c1p, f"Distance P-C1' between {ntpair} tips", toric=False, hist=False, col="goldenrod")
2591 + GMM_histo(c4p_c4p, f"Distance C4'-C4' between {ntpair} tips", toric=False, hist=False, col="magenta")
2592 + GMM_histo(c1p_c1p, f"Distance C1'-C1' between {ntpair} tips", toric=False, hist=False, col="black")
2593 + # GMM_histo(diff, f"Gap between {ntpair} tips", toric=False, hist=False, col="tomato")
2594 + plt.xlabel("Distance (Angströms)")
2595 +
2596 + # plt.xlabel("Number of residues")
2597 + plt.ylabel("Distance (Angströms)")
2598 + plt.title(f"GMM of distances for {ntpair} ", fontsize=10)
2599 +
2600 + # plt.savefig(f"Longueurs_Pyle_{ntpair}.png" )
2601 + plt.savefig(f"Distances_Pyle_{ntpair}.png" )
2602 + plt.close()
2603 + setproctitle(f"GMM (Pyle {ntpair} distances) finished")
2604 +
2605 +def gmm_pyle():
2606 +
2607 + setproctitle("GMM (Pyle model)")
2608 +
2609 + df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/distances/distances.csv"))
2610 +
2611 + # dist = ["P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"]
2612 + data=df
2613 + if len(data):
2614 + for b1 in ['A','C','G','U']:
2615 + for b2 in ['A','C','G','U']:
2616 + thisbases = data[(data.res1 == b1)&(data.res2 == b2)]
2617 + if len(thisbases):
2618 + gmm_pyle_type(b1+b2, thisbases)
2619 +
2423 @trace_unhandled_exceptions 2620 @trace_unhandled_exceptions
2424 def gmm_hrna(): 2621 def gmm_hrna():
2425 """ 2622 """
...@@ -2669,7 +2866,11 @@ def merge_jsons(): ...@@ -2669,7 +2866,11 @@ def merge_jsons():
2669 os.remove(f) 2866 os.remove(f)
2670 except FileNotFoundError: 2867 except FileNotFoundError:
2671 pass 2868 pass
2672 - 2869 +
2870 +@trace_unhandled_exceptions
2871 +def loop(f):
2872 + return pd.read_csv(f)
2873 +
2673 @trace_unhandled_exceptions 2874 @trace_unhandled_exceptions
2674 def concat_dataframes(fpath, outfilename): 2875 def concat_dataframes(fpath, outfilename):
2675 """ 2876 """
...@@ -2682,14 +2883,23 @@ def concat_dataframes(fpath, outfilename): ...@@ -2682,14 +2883,23 @@ def concat_dataframes(fpath, outfilename):
2682 2883
2683 liste = os.listdir(fpath) 2884 liste = os.listdir(fpath)
2684 pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing "+outfilename, leave=False) 2885 pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing "+outfilename, leave=False)
2685 - 2886 + df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop()), engine="python")
2686 - df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop())) 2887 + #df=Parallel(n_jobs=-1, verbose=20)(delayed(loop)(os.path.abspath(fpath+liste[f])) for f in range (len(liste)))
2888 + #except :
2889 + # print(liste[f])
2890 +
2687 pbar.update(1) 2891 pbar.update(1)
2688 for f in range(len(liste)): 2892 for f in range(len(liste)):
2689 - df = pd.read_csv(os.path.abspath(fpath + liste.pop())) 2893 + # try :
2894 + df = pd.read_csv(os.path.abspath(fpath + liste.pop()), engine='python')
2895 + # except :
2896 + # print(liste[f])
2897 + # continue
2690 df_tot = pd.concat([df_tot, df], ignore_index=True) 2898 df_tot = pd.concat([df_tot, df], ignore_index=True)
2691 pbar.update(1) 2899 pbar.update(1)
2692 - 2900 + #df = pd.concat(df, ignore_index=True)
2901 + #pbar.update(1)
2902 + #df.to_csv(fpath + outfilename)
2693 df_tot.to_csv(fpath + outfilename) 2903 df_tot.to_csv(fpath + outfilename)
2694 idxQueue.put(thr_idx) # replace the thread index in the queue 2904 idxQueue.put(thr_idx) # replace the thread index in the queue
2695 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") 2905 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
...@@ -2789,6 +2999,7 @@ if __name__ == "__main__": ...@@ -2789,6 +2999,7 @@ if __name__ == "__main__":
2789 elif opt=='--wadley': 2999 elif opt=='--wadley':
2790 DO_WADLEY_ANALYSIS = True 3000 DO_WADLEY_ANALYSIS = True
2791 os.makedirs(runDir+"/results/geometry/Pyle/distances/", exist_ok=True) 3001 os.makedirs(runDir+"/results/geometry/Pyle/distances/", exist_ok=True)
3002 + os.makedirs(runDir+"/results/geometry/Pyle/classes_dist/", exist_ok=True)
2792 os.makedirs(runDir+"/results/geometry/Pyle/angles/", exist_ok=True) 3003 os.makedirs(runDir+"/results/geometry/Pyle/angles/", exist_ok=True)
2793 os.makedirs(runDir+"/results/geometry/Pyle/pseudotorsions/", exist_ok=True) 3004 os.makedirs(runDir+"/results/geometry/Pyle/pseudotorsions/", exist_ok=True)
2794 os.makedirs(runDir+"/results/figures/GMM/Pyle/distances/", exist_ok=True) 3005 os.makedirs(runDir+"/results/figures/GMM/Pyle/distances/", exist_ok=True)
...@@ -2853,9 +3064,9 @@ if __name__ == "__main__": ...@@ -2853,9 +3064,9 @@ if __name__ == "__main__":
2853 joblist = [] 3064 joblist = []
2854 3065
2855 # Do eta/theta plots 3066 # Do eta/theta plots
2856 - if n_unmapped_chains and DO_WADLEY_ANALYSIS: 3067 + #if n_unmapped_chains and DO_WADLEY_ANALYSIS:
2857 - joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) 3068 + # joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
2858 - joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) 3069 + # joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
2859 3070
2860 # Do distance matrices for each family excl. LSU/SSU (will be processed later) 3071 # Do distance matrices for each family excl. LSU/SSU (will be processed later)
2861 if DO_AVG_DISTANCE_MATRIX: 3072 if DO_AVG_DISTANCE_MATRIX:
...@@ -2881,21 +3092,24 @@ if __name__ == "__main__": ...@@ -2881,21 +3092,24 @@ if __name__ == "__main__":
2881 3092
2882 3093
2883 # Do geometric measures on all chains 3094 # Do geometric measures on all chains
3095 + #print(liste_repres('/home/data/RNA/3D/latest_nr_list_4.0A.csv'))
2884 3096
2885 - # if n_unmapped_chains: 3097 + # print(measure_from_structure(os.listdir(path_to_3D_data + "rna_only")[0]))
3098 + if n_unmapped_chains:
2886 # os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True) 3099 # os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True)
2887 - # liste_struct = os.listdir(path_to_3D_data + "rna_only") 3100 + # liste_struct = os.listdir(path_to_3D_data + "rna_only")
3101 + liste_struct=liste_repres('/home/data/RNA/3D/latest_nr_list_4.0A.csv')
2888 # if '4zdo_1_E.cif' in liste_struct: 3102 # if '4zdo_1_E.cif' in liste_struct:
2889 # liste_struct.remove('4zdo_1_E.cif') # weird cases to remove for now 3103 # liste_struct.remove('4zdo_1_E.cif') # weird cases to remove for now
2890 # if '4zdp_1_E.cif' in liste_struct: 3104 # if '4zdp_1_E.cif' in liste_struct:
2891 # liste_struct.remove('4zdp_1_E.cif') 3105 # liste_struct.remove('4zdp_1_E.cif')
2892 - # for f in liste_struct: 3106 + for f in liste_struct:
2893 - # if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]): 3107 + if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]):
2894 - # joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances 3108 + joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances
2895 3109
2896 3110
2897 - # process_jobs(joblist) 3111 + process_jobs(joblist)
2898 - 3112 + #count_occur_pyle_dist(runDir + '/results/geometry/Pyle/classes_dist/')
2899 # Now process the memory-heavy tasks family by family 3113 # Now process the memory-heavy tasks family by family
2900 if DO_AVG_DISTANCE_MATRIX: 3114 if DO_AVG_DISTANCE_MATRIX:
2901 for f in LSU_set: 3115 for f in LSU_set:
...@@ -2913,12 +3127,13 @@ if __name__ == "__main__": ...@@ -2913,12 +3127,13 @@ if __name__ == "__main__":
2913 # per_chain_stats() # per chain base frequencies en basepair types 3127 # per_chain_stats() # per chain base frequencies en basepair types
2914 # seq_idty() # identity matrices from pre-computed .npy matrices 3128 # seq_idty() # identity matrices from pre-computed .npy matrices
2915 # stats_pairs() 3129 # stats_pairs()
2916 - 3130 + concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances.csv')
2917 if n_unmapped_chains: 3131 if n_unmapped_chains:
2918 # general_stats() 3132 # general_stats()
2919 os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True) 3133 os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True)
2920 os.makedirs(runDir+"/results/geometry/json/", exist_ok=True) 3134 os.makedirs(runDir+"/results/geometry/json/", exist_ok=True)
2921 - # joblist = [] 3135 + joblist = []
3136 + # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/Pyle/distances/', 'distances.csv')))
2922 # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv'))) 3137 # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv')))
2923 # if DO_HIRE_RNA_MEASURES: 3138 # if DO_HIRE_RNA_MEASURES:
2924 # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv'))) 3139 # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv')))
...@@ -2935,9 +3150,10 @@ if __name__ == "__main__": ...@@ -2935,9 +3150,10 @@ if __name__ == "__main__":
2935 if DO_HIRE_RNA_MEASURES: 3150 if DO_HIRE_RNA_MEASURES:
2936 # joblist.append(Job(function=gmm_hrna, args=())) 3151 # joblist.append(Job(function=gmm_hrna, args=()))
2937 joblist.append(Job(function=gmm_hrna_basepairs, args=())) 3152 joblist.append(Job(function=gmm_hrna_basepairs, args=()))
2938 - if DO_WADLEY_ANALYSIS: 3153 + # if DO_WADLEY_ANALYSIS:
2939 - joblist.append(Job(function=gmm_wadley, args=())) 3154 + # joblist.append(Job(function=gmm_wadley, args=()))
3155 + # joblist.append(Job(function=gmm_pyle, args=()))
2940 if len(joblist): 3156 if len(joblist):
2941 process_jobs(joblist) 3157 process_jobs(joblist)
2942 - merge_jsons() 3158 + #merge_jsons()
2943 3159
......