Modified the function concatenate and added the function dist_atoms_hire_RNA

Aglaé TABOT
Commit 655573b348e0423ff75b2b8578d918fc5e0ceba9 655573b3 1 parent ca4600da
Showing 1 changed file with 113 additions and 20 deletions
statistics.py
--- a/statistics.py
View file @655573b
+++ b/statistics.py
View file @655573b
@@ -1185,11 +1185,6 @@ def distance(coord1, coord2):
     """
     Returns the distance between two points using their coordinates (x, y, z)
     """
-     '''
-     if (coord1 == [] or coord2 == []) : 
-         return None
-     else : 
-     '''
     return np.sqrt((coord1[0]-coord2[0])**2 + (coord1[1]-coord2[1])**2 + (coord1[2]-coord2[2])**2)
 
 def pos_b1(res) :
@@ -1278,6 +1273,9 @@ def pos_b2(res):
     return coordb2
 
 def dist_atoms(f):
+     '''
+     Measures the distance between atoms linked by covalent bonds
+     '''
     
     name=str.split(f,'.')[0]
 
@@ -1603,16 +1601,100 @@ def dist_atoms(f):
     df.to_csv(runDir+"/results/distances/" +'dist_atoms '+name+'.csv')
 
 
- def concatenate_dist():
-     liste_dist=os.listdir(runDir+"/results/distances")
-     df_0=pd.read_csv(os.path.abspath(runDir + "/results/distances/" +liste_dist[0]))
-     del(liste_dist[0])
+ def concatenate(chemin, liste, filename):
+     '''
+     Concatenates the dataframes of liste containing measures 
+     and creates a new dataframe gathering all
+     '''
+     liste=os.listdir(runDir+chemin)
+     df_0=pd.read_csv(os.path.abspath(runDir + chemin + liste[0]))
+     del(liste[0])
     df_tot=df_0
-     for f in liste_dist:
-         df=pd.read_csv(os.path.abspath(runDir + "/results/distances/" + f))
+     for f in liste:
+         df=pd.read_csv(os.path.abspath(runDir + chemin + f))
         df_tot=pd.concat([df_tot, df], ignore_index=True)
     
-     df_tot.to_csv(runDir+'/results/distances/' +'dist_atomes.csv')
+     df_tot.to_csv(runDir + chemin + filename)
+ 
+ def dist_atoms_hire_RNA (f) :
+     '''
+     Measures the distance between the atoms of the HiRE-RNA model linked by covalent bonds
+     '''
+     name=str.split(f,'.')[0]
+     liste_dist=[]
+     last_c4p=[]
+     parser=MMCIFParser()
+     s = parser.get_structure(name, os.path.abspath("/home/data/RNA/3D/rna_only/" + f))
+     chain = next(s[0].get_chains())
+     os.makedirs(runDir+"/results/distances_hRNA/", exist_ok=True)
+     for res in chain :
+         p_o5p=None
+         o5p_c5p=None
+         c5p_c4p=None
+         c4p_c1p=None
+         c1p_b1=None
+         b1_b2=None
+         last_c4p_p=np.nan
+         
+         if res.get_resname() not in ['ATP', 'CCC', 'A3P', 'A23', 'GDP', 'RIA'] : #several phosphate groups, ignore
+             atom_p = [ atom.get_coord() for atom in res if atom.get_name() ==  "P"]
+             atom_o5p= [ atom.get_coord() for atom in res if "O5'" in atom.get_fullname() ]
+             atom_c5p = [ atom.get_coord() for atom in res if "C5'" in atom.get_fullname() ]
+             atom_c4p = [ atom.get_coord() for atom in res if "C4'" in atom.get_fullname() ]
+             atom_c1p = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ]
+             atom_b1=pos_b1(res)#position b1 to be calculated, depending on the case
+             atom_b2=pos_b2(res)#position b2 to be calculated only for those with 2 cycles
+             
+ 
+             if len(last_c4p)<1 or len(atom_p)<1 or f!= f_prec:#link with the previous residue in the chain
+                 last_c4p_p=last_c4p_p
+             else :
+                 if distance(last_c4p[0], atom_p[0])>5:
+                     last_c4p_p=last_c4p_p
+                 else:
+                     last_c4p_p=distance(last_c4p[0], atom_p[0])
+ 
+             if len(atom_p)<1 or len(atom_o5p)<1 :
+                 p_o5p=p_o5p
+             else :
+                 p_o5p=distance(atom_p[0], atom_o5p[0])
+             
+             if len(atom_c5p)<1 or len(atom_o5p)<1 :
+                 o5p_c5p=o5p_c5p
+             else :
+                 o5p_c5p=distance(atom_o5p[0], atom_c5p[0])
+ 
+             if len(atom_c5p)<1 or len(atom_c4p)<1 :
+                 c5p_c4p=c5p_c4p
+             else :
+                 c5p_c4p=distance(atom_c5p[0], atom_c4p[0])
+ 
+             if len(atom_c4p)<1 or len(atom_c1p)<1 :
+                 c4p_c1p=c4p_c1p
+             else :
+                 c4p_c1p=distance(atom_c4p[0], atom_c1p[0])
+ 
+             if len(atom_c1p)<1 or len(atom_b1)<1 :
+                 c1p_b1=c1p_b1
+             else :
+                
+                 c1p_b1=distance(atom_c1p[0], atom_b1)
+ 
+             if len(atom_b1)<1 or len(atom_b2)<1 :
+                 b1_b2=b1_b2
+             else :
+ 
+                 b1_b2=distance(atom_b1, atom_b2)
+             
+             last_c4p=atom_c4p
+             f_prec=f
+         
+ 
+             liste_dist.append([res.get_resname(), last_c4p_p, p_o5p, o5p_c5p, c5p_c4p, c4p_c1p, c1p_b1, b1_b2])
+     df=pd.DataFrame(liste_dist, columns=["Residu", "C4'-P", "P-O5'", "O5'-C5'", "C5'-C4'", "C4'-C1'", "C1'-B1", "B1-B2"])
+     
+     df.to_csv(runDir + '/results/distances_hRNA/' + 'dist_atoms_hire_RNA '+name+'.csv')
+     
 
 
 
@@ -1672,7 +1754,7 @@ if __name__ == "__main__":
     
 
     # Load mappings. famlist will contain only families with structures at this resolution threshold.
-     
+     '''
     print("Loading mappings list...")
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
         conn.execute('pragma journal_mode=wal')
@@ -1696,7 +1778,7 @@ if __name__ == "__main__":
     print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.")
     if len(ignored):
         print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
-     
+     '''
     if DELETE_OLD_DATA:
         for f in famlist:
             subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"])
@@ -1714,7 +1796,7 @@ if __name__ == "__main__":
 
     # Define the tasks
     joblist = []
-     
+     '''
     if n_unmapped_chains and DO_WADLEY_ANALYSIS:
         joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
         joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
@@ -1736,14 +1818,25 @@ if __name__ == "__main__":
         joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
         if f not in ignored:
             joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database
-     
+     '''
     #dist_atoms(os.listdir(path_to_3D_data + "rna_only")[0])
     
-     
+     '''
     f_prec=os.listdir(path_to_3D_data + "rna_only")[0]
     #exit()
     for f in os.listdir(path_to_3D_data + "rna_only")[:100]:
         joblist.append(Job(function=dist_atoms, args=(f,)))
+     '''
+     
+     #dist_atoms_hire_RNA(os.listdir(path_to_3D_data + "rna_only")[0])
+     #concatenate('/results/distances/', os.listdir(runDir+'/results/distances/'), 'dist_atoms.csv')
+     #exit()
+     f_prec=os.listdir(path_to_3D_data + "rna_only")[0]
+     for f in os.listdir(path_to_3D_data + "rna_only")[:100]:
+         joblist.append(Job(function=dist_atoms, args=(f,)))
+         joblist.append(Job(function=dist_atoms_hire_RNA, args=(f,)))
+     
+     
     p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
     pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True)
 
@@ -1774,12 +1867,12 @@ if __name__ == "__main__":
     print()
     print()
 
-     #concatenate_dist()
-     # finish the work after the parallel portions
     
+     # finish the work after the parallel portions
+     '''
     per_chain_stats()
     seq_idty()
     stats_pairs()
     if n_unmapped_chains:
         general_stats()
-     
\ No newline at end of file
+     '''
\ No newline at end of file