Aglaé TABOT

Modified the function concatenate and added the function dist_atoms_hire_RNA
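For context, here is a minimal standalone sketch (not part of the commit) of the generalized `concatenate` helper introduced in the diff below: it stacks every per-structure CSV found under a given results sub-directory into one combined file. The `runDir` default and the comments are assumptions drawn from the diff, not committed code.

```python
import os
import pandas as pd

runDir = os.getcwd()  # assumption: the run directory, as used throughout the script

def concatenate(chemin, liste, filename):
    # Sketch of the generalized helper from the diff: the 'liste' argument
    # is immediately overwritten by the directory listing of runDir + chemin.
    liste = os.listdir(runDir + chemin)
    df_tot = pd.read_csv(os.path.abspath(runDir + chemin + liste[0]))
    for f in liste[1:]:
        df = pd.read_csv(os.path.abspath(runDir + chemin + f))
        df_tot = pd.concat([df_tot, df], ignore_index=True)
    df_tot.to_csv(runDir + chemin + filename)

# Usage mirroring the (commented-out) call in the diff:
# concatenate('/results/distances/', os.listdir(runDir + '/results/distances/'), 'dist_atoms.csv')
```

Note that, as committed, the `liste` argument is recomputed inside the function, so callers effectively only choose `chemin` and `filename`.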

@@ -1185,11 +1185,6 @@ def distance(coord1, coord2):
     """
     Returns the distance between two points using their coordinates (x, y, z)
     """
-    '''
-    if (coord1 == [] or coord2 == []) :
-        return None
-    else :
-    '''
     return np.sqrt((coord1[0]-coord2[0])**2 + (coord1[1]-coord2[1])**2 + (coord1[2]-coord2[2])**2)
 
 def pos_b1(res) :
@@ -1278,6 +1273,9 @@ def pos_b2(res):
     return coordb2
 
 def dist_atoms(f):
+    '''
+    Measures the distance between atoms linked by covalent bonds
+    '''
 
     name=str.split(f,'.')[0]
 
@@ -1603,16 +1601,100 @@ def dist_atoms(f):
     df.to_csv(runDir+"/results/distances/" +'dist_atoms '+name+'.csv')
 
 
-def concatenate_dist():
-    liste_dist=os.listdir(runDir+"/results/distances")
-    df_0=pd.read_csv(os.path.abspath(runDir + "/results/distances/" +liste_dist[0]))
-    del(liste_dist[0])
+def concatenate(chemin, liste, filename):
+    '''
+    Concatenates the dataframes of liste containing measures
+    and creates a new dataframe gathering all
+    '''
+    liste=os.listdir(runDir+chemin)
+    df_0=pd.read_csv(os.path.abspath(runDir + chemin + liste[0]))
+    del(liste[0])
     df_tot=df_0
-    for f in liste_dist:
-        df=pd.read_csv(os.path.abspath(runDir + "/results/distances/" + f))
+    for f in liste:
+        df=pd.read_csv(os.path.abspath(runDir + chemin + f))
         df_tot=pd.concat([df_tot, df], ignore_index=True)
 
-    df_tot.to_csv(runDir+'/results/distances/' +'dist_atomes.csv')
+    df_tot.to_csv(runDir + chemin + filename)
+
+def dist_atoms_hire_RNA (f) :
+    '''
+    Measures the distance between the atoms of the HiRE-RNA model linked by covalent bonds
+    '''
+    name=str.split(f,'.')[0]
+    liste_dist=[]
+    last_c4p=[]
+    parser=MMCIFParser()
+    s = parser.get_structure(name, os.path.abspath("/home/data/RNA/3D/rna_only/" + f))
+    chain = next(s[0].get_chains())
+    os.makedirs(runDir+"/results/distances_hRNA/", exist_ok=True)
+    for res in chain :
+        p_o5p=None
+        o5p_c5p=None
+        c5p_c4p=None
+        c4p_c1p=None
+        c1p_b1=None
+        b1_b2=None
+        last_c4p_p=np.nan
+
+        if res.get_resname() not in ['ATP', 'CCC', 'A3P', 'A23', 'GDP', 'RIA'] : # several phosphate groups, ignore
+            atom_p = [ atom.get_coord() for atom in res if atom.get_name() == "P"]
+            atom_o5p= [ atom.get_coord() for atom in res if "O5'" in atom.get_fullname() ]
+            atom_c5p = [ atom.get_coord() for atom in res if "C5'" in atom.get_fullname() ]
+            atom_c4p = [ atom.get_coord() for atom in res if "C4'" in atom.get_fullname() ]
+            atom_c1p = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ]
+            atom_b1=pos_b1(res) # position b1 to be calculated, depending on the case
+            atom_b2=pos_b2(res) # position b2 to be calculated only for those with 2 cycles
+
+
+            if len(last_c4p)<1 or len(atom_p)<1 or f!= f_prec: # link with the previous residue in the chain
+                last_c4p_p=last_c4p_p
+            else :
+                if distance(last_c4p[0], atom_p[0])>5:
+                    last_c4p_p=last_c4p_p
+                else:
+                    last_c4p_p=distance(last_c4p[0], atom_p[0])
+
+            if len(atom_p)<1 or len(atom_o5p)<1 :
+                p_o5p=p_o5p
+            else :
+                p_o5p=distance(atom_p[0], atom_o5p[0])
+
+            if len(atom_c5p)<1 or len(atom_o5p)<1 :
+                o5p_c5p=o5p_c5p
+            else :
+                o5p_c5p=distance(atom_o5p[0], atom_c5p[0])
+
+            if len(atom_c5p)<1 or len(atom_c4p)<1 :
+                c5p_c4p=c5p_c4p
+            else :
+                c5p_c4p=distance(atom_c5p[0], atom_c4p[0])
+
+            if len(atom_c4p)<1 or len(atom_c1p)<1 :
+                c4p_c1p=c4p_c1p
+            else :
+                c4p_c1p=distance(atom_c4p[0], atom_c1p[0])
+
+            if len(atom_c1p)<1 or len(atom_b1)<1 :
+                c1p_b1=c1p_b1
+            else :
+
+                c1p_b1=distance(atom_c1p[0], atom_b1)
+
+            if len(atom_b1)<1 or len(atom_b2)<1 :
+                b1_b2=b1_b2
+            else :
+
+                b1_b2=distance(atom_b1, atom_b2)
+
+            last_c4p=atom_c4p
+            f_prec=f
+
+
+        liste_dist.append([res.get_resname(), last_c4p_p, p_o5p, o5p_c5p, c5p_c4p, c4p_c1p, c1p_b1, b1_b2])
+    df=pd.DataFrame(liste_dist, columns=["Residu", "C4'-P", "P-O5'", "O5'-C5'", "C5'-C4'", "C4'-C1'", "C1'-B1", "B1-B2"])
+
+    df.to_csv(runDir + '/results/distances_hRNA/' + 'dist_atoms_hire_RNA '+name+'.csv')
+
 
 
 
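The subtlest part of `dist_atoms_hire_RNA` above is the inter-residue C4'(i-1)-P(i) link: it is only recorded when the previous C4' comes from the same structure file and the two beads lie within 5 Å, otherwise NaN is kept. Below is a hedged restatement of that gate as a standalone helper; `linked_c4p_p` and the `same_file` flag are names chosen here for illustration and are not part of the commit.

```python
import numpy as np

def distance(coord1, coord2):
    # Euclidean distance between two (x, y, z) points, as in the distance() helper of this file
    return np.sqrt(sum((a - b) ** 2 for a, b in zip(coord1, coord2)))

def linked_c4p_p(last_c4p, atom_p, same_file, cutoff=5.0):
    # Return the C4'(i-1)-P(i) distance only when a covalent link is plausible:
    # same source file, both beads present, and distance at or below the cutoff.
    if not same_file or len(last_c4p) < 1 or len(atom_p) < 1:
        return np.nan
    d = distance(last_c4p[0], atom_p[0])
    return d if d <= cutoff else np.nan
```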
@@ -1672,7 +1754,7 @@ if __name__ == "__main__":
 
 
     # Load mappings. famlist will contain only families with structures at this resolution threshold.
-
+    '''
     print("Loading mappings list...")
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
         conn.execute('pragma journal_mode=wal')
@@ -1696,7 +1778,7 @@ if __name__ == "__main__":
     print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.")
     if len(ignored):
         print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
-
+    '''
     if DELETE_OLD_DATA:
         for f in famlist:
             subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"])
@@ -1714,7 +1796,7 @@ if __name__ == "__main__":
 
     # Define the tasks
     joblist = []
-
+    '''
     if n_unmapped_chains and DO_WADLEY_ANALYSIS:
         joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
         joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
@@ -1736,14 +1818,25 @@ if __name__ == "__main__":
             joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
             if f not in ignored:
                 joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database
-
+    '''
     #dist_atoms(os.listdir(path_to_3D_data + "rna_only")[0])
 
-
+    '''
     f_prec=os.listdir(path_to_3D_data + "rna_only")[0]
     #exit()
     for f in os.listdir(path_to_3D_data + "rna_only")[:100]:
         joblist.append(Job(function=dist_atoms, args=(f,)))
+    '''
+
+    #dist_atoms_hire_RNA(os.listdir(path_to_3D_data + "rna_only")[0])
+    #concatenate('/results/distances/', os.listdir(runDir+'/results/distances/'), 'dist_atoms.csv')
+    #exit()
+    f_prec=os.listdir(path_to_3D_data + "rna_only")[0]
+    for f in os.listdir(path_to_3D_data + "rna_only")[:100]:
+        joblist.append(Job(function=dist_atoms, args=(f,)))
+        joblist.append(Job(function=dist_atoms_hire_RNA, args=(f,)))
+
+
     p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
     pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True)
 
@@ -1774,12 +1867,12 @@ if __name__ == "__main__":
     print()
     print()
 
-    #concatenate_dist()
-    # finish the work after the parallel portions
 
+    # finish the work after the parallel portions
+    '''
     per_chain_stats()
     seq_idty()
     stats_pairs()
     if n_unmapped_chains:
         general_stats()
-
\ No newline at end of file
+    '''
\ No newline at end of file