Aglaé TABOT

Modified the function concatenate and added the function dist_atoms_hire_RNA
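For context, here is a minimal standalone sketch (not part of the commit) of the generalized `concatenate` helper introduced in the diff below: it stacks every per-structure CSV found under a given results sub-directory into one combined file. The `runDir` default and the comments are assumptions drawn from the diff, not committed code.

```python
import os
import pandas as pd

runDir = os.getcwd()  # assumption: the run directory, as used throughout the script

def concatenate(chemin, liste, filename):
    # Sketch of the generalized helper from the diff: the 'liste' argument
    # is immediately overwritten by the directory listing of runDir + chemin.
    liste = os.listdir(runDir + chemin)
    df_tot = pd.read_csv(os.path.abspath(runDir + chemin + liste[0]))
    for f in liste[1:]:
        df = pd.read_csv(os.path.abspath(runDir + chemin + f))
        df_tot = pd.concat([df_tot, df], ignore_index=True)
    df_tot.to_csv(runDir + chemin + filename)

# Usage mirroring the (commented-out) call in the diff:
# concatenate('/results/distances/', os.listdir(runDir + '/results/distances/'), 'dist_atoms.csv')
```

Note that, as committed, the `liste` argument is recomputed inside the function, so callers effectively only choose `chemin` and `filename`.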

@@ -1185,11 +1185,6 @@ def distance(coord1, coord2):
     """
     Returns the distance between two points using their coordinates (x, y, z)
     """
-    '''
-    if (coord1 == [] or coord2 == []) :
-        return None
-    else :
-    '''
     return np.sqrt((coord1[0]-coord2[0])**2 + (coord1[1]-coord2[1])**2 + (coord1[2]-coord2[2])**2)
 
 def pos_b1(res) :
@@ -1278,6 +1273,9 @@ def pos_b2(res):
     return coordb2
 
 def dist_atoms(f):
+    '''
+    Measures the distance between atoms linked by covalent bonds
+    '''
 
     name=str.split(f,'.')[0]
 
@@ -1603,16 +1601,100 @@ def dist_atoms(f):
     df.to_csv(runDir+"/results/distances/" +'dist_atoms '+name+'.csv')
 
 
-def concatenate_dist():
-    liste_dist=os.listdir(runDir+"/results/distances")
-    df_0=pd.read_csv(os.path.abspath(runDir + "/results/distances/" +liste_dist[0]))
-    del(liste_dist[0])
+def concatenate(chemin, liste, filename):
+    '''
+    Concatenates the dataframes of liste containing measures
+    and creates a new dataframe gathering all
+    '''
+    liste=os.listdir(runDir+chemin)
+    df_0=pd.read_csv(os.path.abspath(runDir + chemin + liste[0]))
+    del(liste[0])
     df_tot=df_0
-    for f in liste_dist:
-        df=pd.read_csv(os.path.abspath(runDir + "/results/distances/" + f))
+    for f in liste:
+        df=pd.read_csv(os.path.abspath(runDir + chemin + f))
         df_tot=pd.concat([df_tot, df], ignore_index=True)
 
-    df_tot.to_csv(runDir+'/results/distances/' +'dist_atomes.csv')
+    df_tot.to_csv(runDir + chemin + filename)
+
+def dist_atoms_hire_RNA (f) :
+    '''
+    Measures the distance between the atoms of the HiRE-RNA model linked by covalent bonds
+    '''
+    name=str.split(f,'.')[0]
+    liste_dist=[]
+    last_c4p=[]
+    parser=MMCIFParser()
+    s = parser.get_structure(name, os.path.abspath("/home/data/RNA/3D/rna_only/" + f))
+    chain = next(s[0].get_chains())
+    os.makedirs(runDir+"/results/distances_hRNA/", exist_ok=True)
+    for res in chain :
+        p_o5p=None
+        o5p_c5p=None
+        c5p_c4p=None
+        c4p_c1p=None
+        c1p_b1=None
+        b1_b2=None
+        last_c4p_p=np.nan
+
+        if res.get_resname() not in ['ATP', 'CCC', 'A3P', 'A23', 'GDP', 'RIA'] : # several phosphate groups, ignore
+            atom_p = [ atom.get_coord() for atom in res if atom.get_name() == "P"]
+            atom_o5p= [ atom.get_coord() for atom in res if "O5'" in atom.get_fullname() ]
+            atom_c5p = [ atom.get_coord() for atom in res if "C5'" in atom.get_fullname() ]
+            atom_c4p = [ atom.get_coord() for atom in res if "C4'" in atom.get_fullname() ]
+            atom_c1p = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ]
+            atom_b1=pos_b1(res) # position b1 to be calculated, depending on the case
+            atom_b2=pos_b2(res) # position b2 to be calculated only for those with 2 cycles
+
+
+            if len(last_c4p)<1 or len(atom_p)<1 or f!= f_prec: # link with the previous residue in the chain
+                last_c4p_p=last_c4p_p
+            else :
+                if distance(last_c4p[0], atom_p[0])>5:
+                    last_c4p_p=last_c4p_p
+                else:
+                    last_c4p_p=distance(last_c4p[0], atom_p[0])
+
+            if len(atom_p)<1 or len(atom_o5p)<1 :
+                p_o5p=p_o5p
+            else :
+                p_o5p=distance(atom_p[0], atom_o5p[0])
+
+            if len(atom_c5p)<1 or len(atom_o5p)<1 :
+                o5p_c5p=o5p_c5p
+            else :
+                o5p_c5p=distance(atom_o5p[0], atom_c5p[0])
+
+            if len(atom_c5p)<1 or len(atom_c4p)<1 :
+                c5p_c4p=c5p_c4p
+            else :
+                c5p_c4p=distance(atom_c5p[0], atom_c4p[0])
+
+            if len(atom_c4p)<1 or len(atom_c1p)<1 :
+                c4p_c1p=c4p_c1p
+            else :
+                c4p_c1p=distance(atom_c4p[0], atom_c1p[0])
+
+            if len(atom_c1p)<1 or len(atom_b1)<1 :
+                c1p_b1=c1p_b1
+            else :
+
+                c1p_b1=distance(atom_c1p[0], atom_b1)
+
+            if len(atom_b1)<1 or len(atom_b2)<1 :
+                b1_b2=b1_b2
+            else :
+
+                b1_b2=distance(atom_b1, atom_b2)
+
+            last_c4p=atom_c4p
+            f_prec=f
+
+
+        liste_dist.append([res.get_resname(), last_c4p_p, p_o5p, o5p_c5p, c5p_c4p, c4p_c1p, c1p_b1, b1_b2])
+    df=pd.DataFrame(liste_dist, columns=["Residu", "C4'-P", "P-O5'", "O5'-C5'", "C5'-C4'", "C4'-C1'", "C1'-B1", "B1-B2"])
+
+    df.to_csv(runDir + '/results/distances_hRNA/' + 'dist_atoms_hire_RNA '+name+'.csv')
+
 
 
 
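The subtlest part of `dist_atoms_hire_RNA` above is the inter-residue C4'(i-1)-P(i) link: it is only recorded when the previous C4' comes from the same structure file and the two beads lie within 5 Å, otherwise NaN is kept. Below is a hedged restatement of that gate as a standalone helper; `linked_c4p_p` and the `same_file` flag are names chosen here for illustration and are not part of the commit.

```python
import numpy as np

def distance(coord1, coord2):
    # Euclidean distance between two (x, y, z) points, as in the distance() helper of this file
    return np.sqrt(sum((a - b) ** 2 for a, b in zip(coord1, coord2)))

def linked_c4p_p(last_c4p, atom_p, same_file, cutoff=5.0):
    # Return the C4'(i-1)-P(i) distance only when a covalent link is plausible:
    # same source file, both beads present, and distance at or below the cutoff.
    if not same_file or len(last_c4p) < 1 or len(atom_p) < 1:
        return np.nan
    d = distance(last_c4p[0], atom_p[0])
    return d if d <= cutoff else np.nan
```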
@@ -1672,7 +1754,7 @@ if __name__ == "__main__":
 
 
     # Load mappings. famlist will contain only families with structures at this resolution threshold.
-
+    '''
     print("Loading mappings list...")
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
         conn.execute('pragma journal_mode=wal')
@@ -1696,7 +1778,7 @@ if __name__ == "__main__":
     print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.")
     if len(ignored):
         print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
-
+    '''
     if DELETE_OLD_DATA:
         for f in famlist:
             subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"])
@@ -1714,7 +1796,7 @@ if __name__ == "__main__":
 
     # Define the tasks
     joblist = []
-
+    '''
     if n_unmapped_chains and DO_WADLEY_ANALYSIS:
         joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
         joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
@@ -1736,14 +1818,25 @@ if __name__ == "__main__":
             joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
             if f not in ignored:
                 joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database
-
+    '''
     #dist_atoms(os.listdir(path_to_3D_data + "rna_only")[0])
 
-
+    '''
     f_prec=os.listdir(path_to_3D_data + "rna_only")[0]
     #exit()
     for f in os.listdir(path_to_3D_data + "rna_only")[:100]:
         joblist.append(Job(function=dist_atoms, args=(f,)))
+    '''
+
+    #dist_atoms_hire_RNA(os.listdir(path_to_3D_data + "rna_only")[0])
+    #concatenate('/results/distances/', os.listdir(runDir+'/results/distances/'), 'dist_atoms.csv')
+    #exit()
+    f_prec=os.listdir(path_to_3D_data + "rna_only")[0]
+    for f in os.listdir(path_to_3D_data + "rna_only")[:100]:
+        joblist.append(Job(function=dist_atoms, args=(f,)))
+        joblist.append(Job(function=dist_atoms_hire_RNA, args=(f,)))
+
+
     p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
     pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True)
 
@@ -1774,12 +1867,12 @@ if __name__ == "__main__":
     print()
     print()
 
-    #concatenate_dist()
-    # finish the work after the parallel portions
 
+    # finish the work after the parallel portions
+    '''
     per_chain_stats()
     seq_idty()
     stats_pairs()
     if n_unmapped_chains:
         general_stats()
-
\ No newline at end of file
+    '''
\ No newline at end of file