Louis BECQUEY

Merge branch 'stage_aglae'

...@@ -159,14 +159,14 @@ class SelectivePortionSelector(object): ...@@ -159,14 +159,14 @@ class SelectivePortionSelector(object):
159 _select=Select() 159 _select=Select()
160 160
161 def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False): 161 def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False):
162 + # Reuse and modify Biopython source code
163 + # so that the two residue-numbering columns are numbered with the index_chain from DSSR
162 if isinstance(out_file, str): 164 if isinstance(out_file, str):
163 fp = open(out_file, "w") 165 fp = open(out_file, "w")
164 close_file = True 166 close_file = True
165 else: 167 else:
166 fp = out_file 168 fp = out_file
167 close_file = False 169 close_file = False
168 - #fp = open(out_file, "w")
169 - #close_file=True
170 atom_dict = defaultdict(list) 170 atom_dict = defaultdict(list)
171 171
172 for model in ioobj.structure.get_list(): 172 for model in ioobj.structure.get_list():
...@@ -188,7 +188,7 @@ def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False): ...@@ -188,7 +188,7 @@ def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False):
188 chain_id = chain.get_id() 188 chain_id = chain.get_id()
189 if chain_id == " ": 189 if chain_id == " ":
190 chain_id = "." 190 chain_id = "."
191 - # This is used to write label_seq_id and increments from 1, 191 + # This is used to write label_seq_id,
192 # remaining blank for hetero residues 192 # remaining blank for hetero residues
193 193
194 prev_residue_type = "" 194 prev_residue_type = ""
...@@ -402,8 +402,9 @@ class Chain: ...@@ -402,8 +402,9 @@ class Chain:
402 resseq=int(resseq) 402 resseq=int(resseq)
403 index_chain=nums.at[i, "index_chain"] 403 index_chain=nums.at[i, "index_chain"]
404 nt=nums.at[i, "nt_name"] 404 nt=nums.at[i, "nt_name"]
405 - 405 +
406 - if nt == 'A' or nt == 'G' or nt == 'C' or nt == 'U' or nt in ['DG', 'DU', 'DC', 'DA', 'DI', 'DT' ] or nt == 'N' or nt == 'I' : 406 + # particular case 6n5s_1_A, residue 201 in the original cif file (resname = G and HETATM = H_G)
407 + if nt == 'A' or (nt == 'G' and (self.chain_label != '6n5s_1_A' or resseq != 201)) or nt == 'C' or nt == 'U' or nt in ['DG', 'DU', 'DC', 'DA', 'DI', 'DT' ] or nt == 'N' or nt == 'I' :
407 res=chain[(' ', resseq, icode_res)] 408 res=chain[(' ', resseq, icode_res)]
408 else : #modified nucleotides (e.g. chain 5l4o_1_A) 409 else : #modified nucleotides (e.g. chain 5l4o_1_A)
409 het='H_' + nt 410 het='H_' + nt
...@@ -421,7 +422,15 @@ class Chain: ...@@ -421,7 +422,15 @@ class Chain:
421 res_atoms=res.get_atoms() 422 res_atoms=res.get_atoms()
422 new_residu_t=pdb.Residue.Residue(res_id, res_name, res.get_segid()) 423 new_residu_t=pdb.Residue.Residue(res_id, res_name, res.get_segid())
423 for atom in list(res.get_atoms()): 424 for atom in list(res.get_atoms()):
424 - if atom.get_name() in ['PA', 'O1A', 'O2A', 'O3A']: 425 + # rename the remaining phosphate group to P, OP1, OP2, OP3
426 + if atom.get_name() in ['PA', 'O1A', 'O2A', 'O3A'] and res_name != 'RIA':
427 +
428 + # RIA is a residue made up of 2 riboses and 2 phosphates,
429 + # so it has an O2A atom between the C2A and C1' atoms,
430 + # and it also has an OP2 atom attached to one of its phosphates
431 + # (see chains 6fyx_1_1, 6zu9_1_1, 6fyy_1_1, 6gsm_1_1, 3jaq_1_1 and 1yfg_1_A)
432 + # we do not modify the atom names of RIA residue
433 +
425 if atom.get_name() == 'PA': 434 if atom.get_name() == 'PA':
426 atom_name = 'P' 435 atom_name = 'P'
427 if atom.get_name() == 'O1A': 436 if atom.get_name() == 'O1A':
...@@ -1511,11 +1520,7 @@ class Pipeline: ...@@ -1511,11 +1520,7 @@ class Pipeline:
1511 if self.HOMOLOGY and not os.path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"): 1520 if self.HOMOLOGY and not os.path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
1512 # for the portions mapped to Rfam 1521 # for the portions mapped to Rfam
1513 os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") 1522 os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam")
1514 - ''' 1523 +
1515 - if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"):
1516 - # extract chains of pure RNA
1517 - os.makedirs(path_to_3D_data + "rna_only")
1518 - '''
1519 if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"): 1524 if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"):
1520 # extract chains of pure RNA 1525 # extract chains of pure RNA
1521 os.makedirs(path_to_3D_data + "rna_only") 1526 os.makedirs(path_to_3D_data + "rna_only")
......
...@@ -37,6 +37,7 @@ from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, no ...@@ -37,6 +37,7 @@ from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, no
37 from sklearn.mixture import GaussianMixture 37 from sklearn.mixture import GaussianMixture
38 import warnings 38 import warnings
39 from pandas.core.common import SettingWithCopyWarning 39 from pandas.core.common import SettingWithCopyWarning
40 +from joblib import Parallel, delayed
40 41
41 42
42 np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8) 43 np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8)
...@@ -1251,6 +1252,25 @@ def nt_3d_centers(cif_file, consider_all_atoms): ...@@ -1251,6 +1252,25 @@ def nt_3d_centers(cif_file, consider_all_atoms):
1251 result.append(res) 1252 result.append(res)
1252 return(result) 1253 return(result)
1253 1254
1255 +def liste_repres(fpath):
1256 + repres=[]
1257 + df=pd.read_csv(os.path.abspath(fpath))
1258 + for i in range(df.shape[0]):
1259 + up_name=df["representative"][i]
1260 + if '+' in up_name:
1261 + up_name=up_name.split('+')
1262 + for i in range(len(up_name)):
1263 + chain=up_name[i].split('|')
1264 + chain=chain[0].lower()+'_'+chain[1]+'_'+chain[2]
1265 + repres.append(chain+'.cif')
1266 + else :
1267 + up_name=up_name.split('|')
1268 + low_name=up_name[0].lower()+'_'+up_name[1]+'_'+up_name[2]
1269 + repres.append(low_name+'.cif')
1270 +
1271 + return repres
1272 +
1273 +
1254 def get_euclidian_distance(L1, L2): 1274 def get_euclidian_distance(L1, L2):
1255 """ 1275 """
1256 Returns the distance between two points (coordinates in lists) 1276 Returns the distance between two points (coordinates in lists)
...@@ -1504,14 +1524,16 @@ def measure_from_structure(f): ...@@ -1504,14 +1524,16 @@ def measure_from_structure(f):
1504 warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning) 1524 warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
1505 parser=MMCIFParser() 1525 parser=MMCIFParser()
1506 s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f)) 1526 s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f))
1507 - 1527 +
1508 - measures_aa(name, s, thr_idx) 1528 + #pyle_measures(name, s, thr_idx)
1529 + #measures_aa(name, s, thr_idx)
1509 if DO_HIRE_RNA_MEASURES: 1530 if DO_HIRE_RNA_MEASURES:
1510 measures_hrna(name, s, thr_idx) 1531 measures_hrna(name, s, thr_idx)
1511 measures_hrna_basepairs(name, s, thr_idx) 1532 measures_hrna_basepairs(name, s, thr_idx)
1512 if DO_WADLEY_ANALYSIS: 1533 if DO_WADLEY_ANALYSIS:
1513 - measures_wadley(name, s, thr_idx) 1534 + #measures_wadley(name, s, thr_idx)
1514 - 1535 + pyle_measures(name, s, thr_idx)
1536 +
1515 idxQueue.put(thr_idx) # replace the thread index in the queue 1537 idxQueue.put(thr_idx) # replace the thread index in the queue
1516 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") 1538 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
1517 1539
...@@ -1931,6 +1953,128 @@ def measures_hrna_basepairs_chain(name, chain, df, thr_idx): ...@@ -1931,6 +1953,128 @@ def measures_hrna_basepairs_chain(name, chain, df, thr_idx):
1931 return results 1953 return results
1932 1954
1933 @trace_unhandled_exceptions 1955 @trace_unhandled_exceptions
1956 +def pyle_measures(name, s, thr_idx):
1957 +
1958 + if (path.isfile(runDir + '/results/geometry/Pyle/distances/distances_pyle_'+name+'.csv')):
1959 + return
1960 +
1961 + liste_dist=[]
1962 + #classes=[]
1963 + #for i in range(0, 150, 5):
1964 + #classes.append([i, i+5])
1965 + #classes.append([150, 300])
1966 + #occur_p_p=len(classes)*[0]
1967 + #occur_p_c1=len(classes)*[0]
1968 + #occur_p_c4=len(classes)*[0]
1969 + #occur_c1_c1=len(classes)*[0]
1970 + #occur_c4_c4=len(classes)*[0]
1971 + #nb_occurs=[]
1972 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} pyle_measures({name})")
1973 +
1974 + chain = next(s[0].get_chains())
1975 + #residues=list(chain.get_residues())
1976 + for res1 in tqdm(chain, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} pyle_measures", unit="res", leave=False):
1977 + #res1=chain[i]
1978 + if res1.get_resname() in ["A", "C", "G", "U"]:
1979 + resnum1=list(res1.get_id())[1]
1980 + atom_p_1 = [ atom.get_coord() for atom in res1 if atom.get_name() == "P"]
1981 + atom_c1p_1 = [ atom.get_coord() for atom in res1 if "C1'" in atom.get_fullname() ]
1982 + atom_c4p_1 = [ atom.get_coord() for atom in res1 if "C4'" in atom.get_fullname() ]
1983 + for res2 in chain:
1984 + resnum2=list(res2.get_id())[1]
1985 + if resnum2-resnum1 < 4 :
1986 + continue
1987 + p_p=np.nan
1988 + p_c4p=np.nan
1989 + p_c1p=np.nan
1990 + c4p_c4p=np.nan
1991 + c1p_c1p=np.nan
1992 + #res2=chain[j]
1993 + if res2.get_resname() in ["A", "C", "G", "U"]:
1994 +
1995 + atom_p_2 = [ atom.get_coord() for atom in res2 if atom.get_name() == "P"]
1996 + atom_c1p_2 = [ atom.get_coord() for atom in res2 if "C1'" in atom.get_fullname() ]
1997 + atom_c4p_2 = [ atom.get_coord() for atom in res2 if "C4'" in atom.get_fullname() ]
1998 +
1999 + p_p = get_euclidian_distance(atom_p_1, atom_p_2)
2000 + p_c4p= get_euclidian_distance(atom_p_1, atom_c4p_2)
2001 + p_c1p= get_euclidian_distance(atom_p_1, atom_c1p_2)
2002 + c4p_c4p= get_euclidian_distance(atom_c4p_1, atom_c4p_2)
2003 + c1p_c1p= get_euclidian_distance(atom_c1p_1, atom_c1p_2)
2004 +
2005 + liste_dist.append([res1.get_resname(), int(resnum1), res2.get_resname(), int(resnum2), p_p, p_c4p, p_c1p, c4p_c4p, c1p_c1p])
2006 + '''
2007 + for x in range(len(classes)):
2008 + if classes[x][0] <= p_p <= classes[x][1]:
2009 + occur_p_p[x]=occur_p_p[x]+1
2010 + if classes[x][0] <= p_c4p <= classes[x][1]:
2011 + occur_p_c4[x]=occur_p_c4[x]+1
2012 + if classes[x][0] <= p_c1p <= classes[x][1]:
2013 + occur_p_c1[x]=occur_p_c1[x]+1
2014 + if classes[x][0] <= c4p_c4p <= classes[x][1]:
2015 + occur_c4_c4[x]=occur_c4_c4[x]+1
2016 + if classes[x][0] <= c1p_c1p <= classes[x][1]:
2017 + occur_c1_c1[x]=occur_c1_c1[x]+1
2018 + '''
2019 + #for x in range(len(classes)):
2020 + # for i in range(len(liste_dist)):
2021 + # if classes[x][0] <= liste_dist[i][4] <= classes[x][1]:
2022 + # occur_p_p[x]=occur_p_p[x]+1
2023 + # if classes[x][0] <= liste_dist[i][5] <= classes[x][1]:
2024 + # occur_p_c4[x]=occur_p_c4[x]+1
2025 + # if classes[x][0] <= liste_dist[i][6] <= classes[x][1]:
2026 + # occur_p_c1[x]=occur_p_c1[x]+1
2027 + # if classes[x][0] <= liste_dist[i][7] <= classes[x][1]:
2028 + # occur_c4_c4[x]=occur_c4_c4[x]+1
2029 + # if classes[x][0] <= liste_dist[i][8] <= classes[x][1]:
2030 + # occur_c1_c1[x]=occur_c1_c1[x]+1
2031 + #nb_occurs.append([classes[x], occur_p_p[x], occur_p_c1[x], occur_p_c4[x], occur_c1_c1[x], occur_c4_c4[x]])
2032 + #df = pd.DataFrame(nb_occurs, columns=["classe", "P-P", "P-C1'", "P-C4'", "C1'-C1'", "C4'-C4'"])
2033 + # return df
2034 + # nb_occurs.append([classes, occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4])
2035 + # print(nb_occurs)
2036 + # return nb_occurs
2037 +
2038 +
2039 + df = pd.DataFrame(liste_dist, columns=["res1", "resnum1", "res2", "resnum2", "P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"])
2040 + df.to_csv(runDir + "/results/geometry/Pyle/distances/distances_pyle_" + name + ".csv")
2041 +
2042 +@trace_unhandled_exceptions
2043 +def count_occur_pyle_dist(fpath):
2044 +
2045 + global idxQueue
2046 + thr_idx = idxQueue.get()
2047 + setproctitle(f"Worker {thr_idx+1} : Extract occurences of {fpath}")
2048 +
2049 + liste=os.listdir(fpath)
2050 + pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing ", leave=False)
2051 + df = pd.read_csv(os.path.abspath(fpath + liste.pop()))
2052 + occur_p_p=list(df["P-P"])
2053 + occur_p_c1=list(df["P-C1'"])
2054 + occur_p_c4=list(df["P-C4'"])
2055 + occur_c1_c1=list(df["C1'-C1'"])
2056 + occur_c4_c4=list(df["C4'-C4'"])
2057 + nb_occurs=[]
2058 + for f in range(len(liste)):
2059 + df = pd.read_csv(os.path.abspath(fpath + liste.pop()))
2060 + # print(liste[f])
2061 + for k in range(df.shape[0]):
2062 + occur_p_p[k]=occur_p_p[k]+df["P-P"][k]
2063 + occur_p_c1[k]=occur_p_c1[k]+df["P-C1'"][k]
2064 + occur_p_c4[k]=occur_p_c4[k]+df["P-C4'"][k]
2065 + occur_c1_c1[k]=occur_c1_c1[k]+df["C1'-C1'"][k]
2066 + occur_c4_c4[k]=occur_c4_c4[k]+df["C4'-C4'"][k]
2067 + pbar.update(1)
2068 + nb_occurs=[occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4]
2069 + # return(list(df["classe"]), occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4)
2070 + df = pd.DataFrame(nb_occurs, columns=list(df["classe"]))
2071 +
2072 + df.to_csv(runDir + "/results/geometry/Pyle/classes_dist/occurences_dist.csv")
2073 + idxQueue.put(thr_idx) # replace the thread index in the queue
2074 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
2075 +
2076 +
2077 +@trace_unhandled_exceptions
1934 def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True) : 2078 def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True) :
1935 """ 2079 """
1936 Plot Gaussian-Mixture-Model (with or without histograms) 2080 Plot Gaussian-Mixture-Model (with or without histograms)
...@@ -2012,7 +2156,8 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True) ...@@ -2012,7 +2156,8 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True)
2012 else: 2156 else:
2013 D = obs.ravel() 2157 D = obs.ravel()
2014 xmin = D.min() 2158 xmin = D.min()
2015 - xmax = min(10.0, D.max()) 2159 + #xmax = min(10.0, D.max())
2160 + xmax = D.max()
2016 x = np.linspace(xmin,xmax,1000) 2161 x = np.linspace(xmin,xmax,1000)
2017 colors=['red', 'blue', 'gold', 'cyan', 'magenta', 'white', 'black', 'green'] 2162 colors=['red', 'blue', 'gold', 'cyan', 'magenta', 'white', 'black', 'green']
2018 2163
...@@ -2372,7 +2517,7 @@ def gmm_wadley(): ...@@ -2372,7 +2517,7 @@ def gmm_wadley():
2372 plt.savefig("GMM_flat_angles_pyle_model.png") 2517 plt.savefig("GMM_flat_angles_pyle_model.png")
2373 plt.close() 2518 plt.close()
2374 2519
2375 - # Torsion anfles 2520 + # Torsion angles
2376 eta=[] 2521 eta=[]
2377 theta=[] 2522 theta=[]
2378 eta_prime=[] 2523 eta_prime=[]
...@@ -2420,6 +2565,58 @@ def gmm_wadley(): ...@@ -2420,6 +2565,58 @@ def gmm_wadley():
2420 os.chdir(runDir) 2565 os.chdir(runDir)
2421 setproctitle("GMM (Pyle model) finished") 2566 setproctitle("GMM (Pyle model) finished")
2422 2567
2568 +def gmm_pyle_type(ntpair, data):
2569 +
2570 + setproctitle(f"GMM (Pyle {ntpair} )")
2571 +
2572 + os.makedirs(runDir + "/results/figures/GMM/Pyle/distances/", exist_ok=True)
2573 + os.chdir(runDir + "/results/figures/GMM/Pyle/distances/")
2574 +
2575 + p_p=list(data["P-P"][~ np.isnan(data["P-P"])])
2576 + p_c4p=list(data["P-C4'"][~ np.isnan(data["P-C4'"])])
2577 + p_c1p=list(data["P-C1'"][~ np.isnan(data["P-C1'"])])
2578 + c4p_c4p=list(data["C4'-C4'"][~ np.isnan(data["C4'-C4'"])])
2579 + c1p_c1p=list(data["C1'-C1'"][~ np.isnan(data["C1'-C1'"])])
2580 + print(len(p_p))
2581 + # res2=list(data["resnum2"])
2582 + # res1=list(data["resnum1"])
2583 + # diff=[]
2584 + # for i in range(len(res1)):
2585 + # diff.append(res2[i]-res1[i])
2586 + # print(diff[:100])
2587 +
2588 + GMM_histo(p_p, f"Distance P-P between {ntpair} tips for {str(len(p_p))} values", toric=False, hist=False, col="cyan")
2589 + GMM_histo(p_c4p, f"Distance P-C4' between {ntpair} tips", toric=False, hist=False, col="tomato")
2590 + GMM_histo(p_c1p, f"Distance P-C1' between {ntpair} tips", toric=False, hist=False, col="goldenrod")
2591 + GMM_histo(c4p_c4p, f"Distance C4'-C4' between {ntpair} tips", toric=False, hist=False, col="magenta")
2592 + GMM_histo(c1p_c1p, f"Distance C1'-C1' between {ntpair} tips", toric=False, hist=False, col="black")
2593 + # GMM_histo(diff, f"Gap between {ntpair} tips", toric=False, hist=False, col="tomato")
2594 + plt.xlabel("Distance (Angströms)")
2595 +
2596 + # plt.xlabel("Number of residues")
2597 + plt.ylabel("Distance (Angströms)")
2598 + plt.title(f"GMM of distances for {ntpair} ", fontsize=10)
2599 +
2600 + # plt.savefig(f"Longueurs_Pyle_{ntpair}.png" )
2601 + plt.savefig(f"Distances_Pyle_{ntpair}.png" )
2602 + plt.close()
2603 + setproctitle(f"GMM (Pyle {ntpair} distances) finished")
2604 +
2605 +def gmm_pyle():
2606 +
2607 + setproctitle("GMM (Pyle model)")
2608 +
2609 + df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/distances/distances.csv"))
2610 +
2611 + # dist = ["P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"]
2612 + data=df
2613 + if len(data):
2614 + for b1 in ['A','C','G','U']:
2615 + for b2 in ['A','C','G','U']:
2616 + thisbases = data[(data.res1 == b1)&(data.res2 == b2)]
2617 + if len(thisbases):
2618 + gmm_pyle_type(b1+b2, thisbases)
2619 +
2423 @trace_unhandled_exceptions 2620 @trace_unhandled_exceptions
2424 def gmm_hrna(): 2621 def gmm_hrna():
2425 """ 2622 """
...@@ -2669,7 +2866,11 @@ def merge_jsons(): ...@@ -2669,7 +2866,11 @@ def merge_jsons():
2669 os.remove(f) 2866 os.remove(f)
2670 except FileNotFoundError: 2867 except FileNotFoundError:
2671 pass 2868 pass
2672 - 2869 +
2870 +@trace_unhandled_exceptions
2871 +def loop(f):
2872 + return pd.read_csv(f)
2873 +
2673 @trace_unhandled_exceptions 2874 @trace_unhandled_exceptions
2674 def concat_dataframes(fpath, outfilename): 2875 def concat_dataframes(fpath, outfilename):
2675 """ 2876 """
...@@ -2682,14 +2883,23 @@ def concat_dataframes(fpath, outfilename): ...@@ -2682,14 +2883,23 @@ def concat_dataframes(fpath, outfilename):
2682 2883
2683 liste = os.listdir(fpath) 2884 liste = os.listdir(fpath)
2684 pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing "+outfilename, leave=False) 2885 pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing "+outfilename, leave=False)
2685 - 2886 + df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop()), engine="python")
2686 - df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop())) 2887 + #df=Parallel(n_jobs=-1, verbose=20)(delayed(loop)(os.path.abspath(fpath+liste[f])) for f in range (len(liste)))
2888 + #except :
2889 + # print(liste[f])
2890 +
2687 pbar.update(1) 2891 pbar.update(1)
2688 for f in range(len(liste)): 2892 for f in range(len(liste)):
2689 - df = pd.read_csv(os.path.abspath(fpath + liste.pop())) 2893 + # try :
2894 + df = pd.read_csv(os.path.abspath(fpath + liste.pop()), engine='python')
2895 + # except :
2896 + # print(liste[f])
2897 + # continue
2690 df_tot = pd.concat([df_tot, df], ignore_index=True) 2898 df_tot = pd.concat([df_tot, df], ignore_index=True)
2691 pbar.update(1) 2899 pbar.update(1)
2692 - 2900 + #df = pd.concat(df, ignore_index=True)
2901 + #pbar.update(1)
2902 + #df.to_csv(fpath + outfilename)
2693 df_tot.to_csv(fpath + outfilename) 2903 df_tot.to_csv(fpath + outfilename)
2694 idxQueue.put(thr_idx) # replace the thread index in the queue 2904 idxQueue.put(thr_idx) # replace the thread index in the queue
2695 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") 2905 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
...@@ -2789,6 +2999,7 @@ if __name__ == "__main__": ...@@ -2789,6 +2999,7 @@ if __name__ == "__main__":
2789 elif opt=='--wadley': 2999 elif opt=='--wadley':
2790 DO_WADLEY_ANALYSIS = True 3000 DO_WADLEY_ANALYSIS = True
2791 os.makedirs(runDir+"/results/geometry/Pyle/distances/", exist_ok=True) 3001 os.makedirs(runDir+"/results/geometry/Pyle/distances/", exist_ok=True)
3002 + os.makedirs(runDir+"/results/geometry/Pyle/classes_dist/", exist_ok=True)
2792 os.makedirs(runDir+"/results/geometry/Pyle/angles/", exist_ok=True) 3003 os.makedirs(runDir+"/results/geometry/Pyle/angles/", exist_ok=True)
2793 os.makedirs(runDir+"/results/geometry/Pyle/pseudotorsions/", exist_ok=True) 3004 os.makedirs(runDir+"/results/geometry/Pyle/pseudotorsions/", exist_ok=True)
2794 os.makedirs(runDir+"/results/figures/GMM/Pyle/distances/", exist_ok=True) 3005 os.makedirs(runDir+"/results/figures/GMM/Pyle/distances/", exist_ok=True)
...@@ -2853,9 +3064,9 @@ if __name__ == "__main__": ...@@ -2853,9 +3064,9 @@ if __name__ == "__main__":
2853 joblist = [] 3064 joblist = []
2854 3065
2855 # Do eta/theta plots 3066 # Do eta/theta plots
2856 - if n_unmapped_chains and DO_WADLEY_ANALYSIS: 3067 + #if n_unmapped_chains and DO_WADLEY_ANALYSIS:
2857 - joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) 3068 + # joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
2858 - joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) 3069 + # joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
2859 3070
2860 # Do distance matrices for each family excl. LSU/SSU (will be processed later) 3071 # Do distance matrices for each family excl. LSU/SSU (will be processed later)
2861 if DO_AVG_DISTANCE_MATRIX: 3072 if DO_AVG_DISTANCE_MATRIX:
...@@ -2881,21 +3092,24 @@ if __name__ == "__main__": ...@@ -2881,21 +3092,24 @@ if __name__ == "__main__":
2881 3092
2882 3093
2883 # Do geometric measures on all chains 3094 # Do geometric measures on all chains
3095 + #print(liste_repres('/home/data/RNA/3D/latest_nr_list_4.0A.csv'))
2884 3096
2885 - # if n_unmapped_chains: 3097 + # print(measure_from_structure(os.listdir(path_to_3D_data + "rna_only")[0]))
3098 + if n_unmapped_chains:
2886 # os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True) 3099 # os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True)
2887 - # liste_struct = os.listdir(path_to_3D_data + "rna_only") 3100 + # liste_struct = os.listdir(path_to_3D_data + "rna_only")
3101 + liste_struct=liste_repres('/home/data/RNA/3D/latest_nr_list_4.0A.csv')
2888 # if '4zdo_1_E.cif' in liste_struct: 3102 # if '4zdo_1_E.cif' in liste_struct:
2889 # liste_struct.remove('4zdo_1_E.cif') # weird cases to remove for now 3103 # liste_struct.remove('4zdo_1_E.cif') # weird cases to remove for now
2890 # if '4zdp_1_E.cif' in liste_struct: 3104 # if '4zdp_1_E.cif' in liste_struct:
2891 # liste_struct.remove('4zdp_1_E.cif') 3105 # liste_struct.remove('4zdp_1_E.cif')
2892 - # for f in liste_struct: 3106 + for f in liste_struct:
2893 - # if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]): 3107 + if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]):
2894 - # joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances 3108 + joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances
2895 3109
2896 3110
2897 - # process_jobs(joblist) 3111 + process_jobs(joblist)
2898 - 3112 + #count_occur_pyle_dist(runDir + '/results/geometry/Pyle/classes_dist/')
2899 # Now process the memory-heavy tasks family by family 3113 # Now process the memory-heavy tasks family by family
2900 if DO_AVG_DISTANCE_MATRIX: 3114 if DO_AVG_DISTANCE_MATRIX:
2901 for f in LSU_set: 3115 for f in LSU_set:
...@@ -2913,12 +3127,13 @@ if __name__ == "__main__": ...@@ -2913,12 +3127,13 @@ if __name__ == "__main__":
2913 # per_chain_stats() # per chain base frequencies en basepair types 3127 # per_chain_stats() # per chain base frequencies en basepair types
2914 # seq_idty() # identity matrices from pre-computed .npy matrices 3128 # seq_idty() # identity matrices from pre-computed .npy matrices
2915 # stats_pairs() 3129 # stats_pairs()
2916 - 3130 + concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances.csv')
2917 if n_unmapped_chains: 3131 if n_unmapped_chains:
2918 # general_stats() 3132 # general_stats()
2919 os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True) 3133 os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True)
2920 os.makedirs(runDir+"/results/geometry/json/", exist_ok=True) 3134 os.makedirs(runDir+"/results/geometry/json/", exist_ok=True)
2921 - # joblist = [] 3135 + joblist = []
3136 + # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/Pyle/distances/', 'distances.csv')))
2922 # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv'))) 3137 # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv')))
2923 # if DO_HIRE_RNA_MEASURES: 3138 # if DO_HIRE_RNA_MEASURES:
2924 # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv'))) 3139 # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv')))
...@@ -2935,9 +3150,10 @@ if __name__ == "__main__": ...@@ -2935,9 +3150,10 @@ if __name__ == "__main__":
2935 if DO_HIRE_RNA_MEASURES: 3150 if DO_HIRE_RNA_MEASURES:
2936 # joblist.append(Job(function=gmm_hrna, args=())) 3151 # joblist.append(Job(function=gmm_hrna, args=()))
2937 joblist.append(Job(function=gmm_hrna_basepairs, args=())) 3152 joblist.append(Job(function=gmm_hrna_basepairs, args=()))
2938 - if DO_WADLEY_ANALYSIS: 3153 + # if DO_WADLEY_ANALYSIS:
2939 - joblist.append(Job(function=gmm_wadley, args=())) 3154 + # joblist.append(Job(function=gmm_wadley, args=()))
3155 + # joblist.append(Job(function=gmm_pyle, args=()))
2940 if len(joblist): 3156 if len(joblist):
2941 process_jobs(joblist) 3157 process_jobs(joblist)
2942 - merge_jsons() 3158 + #merge_jsons()
2943 3159
......