Modified the function concatenate and added the function dist_atoms_hire_RNA
Showing
1 changed file
with
113 additions
and
20 deletions
... | @@ -1185,11 +1185,6 @@ def distance(coord1, coord2): | ... | @@ -1185,11 +1185,6 @@ def distance(coord1, coord2): |
1185 | """ | 1185 | """ |
1186 | Returns the distance between two points using their coordinates (x, y, z) | 1186 | Returns the distance between two points using their coordinates (x, y, z) |
1187 | """ | 1187 | """ |
1188 | - ''' | ||
1189 | - if (coord1 == [] or coord2 == []) : | ||
1190 | - return None | ||
1191 | - else : | ||
1192 | - ''' | ||
1193 | return np.sqrt((coord1[0]-coord2[0])**2 + (coord1[1]-coord2[1])**2 + (coord1[2]-coord2[2])**2) | 1188 | return np.sqrt((coord1[0]-coord2[0])**2 + (coord1[1]-coord2[1])**2 + (coord1[2]-coord2[2])**2) |
1194 | 1189 | ||
1195 | def pos_b1(res) : | 1190 | def pos_b1(res) : |
... | @@ -1278,6 +1273,9 @@ def pos_b2(res): | ... | @@ -1278,6 +1273,9 @@ def pos_b2(res): |
1278 | return coordb2 | 1273 | return coordb2 |
1279 | 1274 | ||
1280 | def dist_atoms(f): | 1275 | def dist_atoms(f): |
1276 | + ''' | ||
1277 | + Measures the distance between atoms linked by covalent bonds | ||
1278 | + ''' | ||
1281 | 1279 | ||
1282 | name=str.split(f,'.')[0] | 1280 | name=str.split(f,'.')[0] |
1283 | 1281 | ||
... | @@ -1603,16 +1601,100 @@ def dist_atoms(f): | ... | @@ -1603,16 +1601,100 @@ def dist_atoms(f): |
1603 | df.to_csv(runDir+"/results/distances/" +'dist_atoms '+name+'.csv') | 1601 | df.to_csv(runDir+"/results/distances/" +'dist_atoms '+name+'.csv') |
1604 | 1602 | ||
1605 | 1603 | ||
1606 | -def concatenate_dist(): | 1604 | +def concatenate(chemin, liste, filename): |
1607 | - liste_dist=os.listdir(runDir+"/results/distances") | 1605 | + ''' |
1608 | - df_0=pd.read_csv(os.path.abspath(runDir + "/results/distances/" +liste_dist[0])) | 1606 | + Concatenates the dataframes of liste containing measures |
1609 | - del(liste_dist[0]) | 1607 | + and creates a new dataframe gathering all |
1608 | + ''' | ||
1609 | + liste=os.listdir(runDir+chemin) | ||
1610 | + df_0=pd.read_csv(os.path.abspath(runDir + chemin + liste[0])) | ||
1611 | + del(liste[0]) | ||
1610 | df_tot=df_0 | 1612 | df_tot=df_0 |
1611 | - for f in liste_dist: | 1613 | + for f in liste: |
1612 | - df=pd.read_csv(os.path.abspath(runDir + "/results/distances/" + f)) | 1614 | + df=pd.read_csv(os.path.abspath(runDir + chemin + f)) |
1613 | df_tot=pd.concat([df_tot, df], ignore_index=True) | 1615 | df_tot=pd.concat([df_tot, df], ignore_index=True) |
1614 | 1616 | ||
1615 | - df_tot.to_csv(runDir+'/results/distances/' +'dist_atomes.csv') | 1617 | + df_tot.to_csv(runDir + chemin + filename) |
1618 | + | ||
1619 | +def dist_atoms_hire_RNA (f) : | ||
1620 | + ''' | ||
1621 | + Measures the distance between the atoms of the HiRE-RNA model linked by covalent bonds | ||
1622 | + ''' | ||
1623 | + name=str.split(f,'.')[0] | ||
1624 | + liste_dist=[] | ||
1625 | + last_c4p=[] | ||
1626 | + parser=MMCIFParser() | ||
1627 | + s = parser.get_structure(name, os.path.abspath("/home/data/RNA/3D/rna_only/" + f)) | ||
1628 | + chain = next(s[0].get_chains()) | ||
1629 | + os.makedirs(runDir+"/results/distances_hRNA/", exist_ok=True) | ||
1630 | + for res in chain : | ||
1631 | + p_o5p=None | ||
1632 | + o5p_c5p=None | ||
1633 | + c5p_c4p=None | ||
1634 | + c4p_c1p=None | ||
1635 | + c1p_b1=None | ||
1636 | + b1_b2=None | ||
1637 | + last_c4p_p=np.nan | ||
1638 | + | ||
1639 | + if res.get_resname() not in ['ATP', 'CCC', 'A3P', 'A23', 'GDP', 'RIA'] : #several phosphate groups, ignore | ||
1640 | + atom_p = [ atom.get_coord() for atom in res if atom.get_name() == "P"] | ||
1641 | + atom_o5p= [ atom.get_coord() for atom in res if "O5'" in atom.get_fullname() ] | ||
1642 | + atom_c5p = [ atom.get_coord() for atom in res if "C5'" in atom.get_fullname() ] | ||
1643 | + atom_c4p = [ atom.get_coord() for atom in res if "C4'" in atom.get_fullname() ] | ||
1644 | + atom_c1p = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ] | ||
1645 | + atom_b1=pos_b1(res)#position b1 to be calculated, depending on the case | ||
1646 | + atom_b2=pos_b2(res)#position b2 to be calculated only for those with 2 cycles | ||
1647 | + | ||
1648 | + | ||
1649 | + if len(last_c4p)<1 or len(atom_p)<1 or f!= f_prec:#link with the previous residue in the chain | ||
1650 | + last_c4p_p=last_c4p_p | ||
1651 | + else : | ||
1652 | + if distance(last_c4p[0], atom_p[0])>5: | ||
1653 | + last_c4p_p=last_c4p_p | ||
1654 | + else: | ||
1655 | + last_c4p_p=distance(last_c4p[0], atom_p[0]) | ||
1656 | + | ||
1657 | + if len(atom_p)<1 or len(atom_o5p)<1 : | ||
1658 | + p_o5p=p_o5p | ||
1659 | + else : | ||
1660 | + p_o5p=distance(atom_p[0], atom_o5p[0]) | ||
1661 | + | ||
1662 | + if len(atom_c5p)<1 or len(atom_o5p)<1 : | ||
1663 | + o5p_c5p=o5p_c5p | ||
1664 | + else : | ||
1665 | + o5p_c5p=distance(atom_o5p[0], atom_c5p[0]) | ||
1666 | + | ||
1667 | + if len(atom_c5p)<1 or len(atom_c4p)<1 : | ||
1668 | + c5p_c4p=c5p_c4p | ||
1669 | + else : | ||
1670 | + c5p_c4p=distance(atom_c5p[0], atom_c4p[0]) | ||
1671 | + | ||
1672 | + if len(atom_c4p)<1 or len(atom_c1p)<1 : | ||
1673 | + c4p_c1p=c4p_c1p | ||
1674 | + else : | ||
1675 | + c4p_c1p=distance(atom_c4p[0], atom_c1p[0]) | ||
1676 | + | ||
1677 | + if len(atom_c1p)<1 or len(atom_b1)<1 : | ||
1678 | + c1p_b1=c1p_b1 | ||
1679 | + else : | ||
1680 | + | ||
1681 | + c1p_b1=distance(atom_c1p[0], atom_b1) | ||
1682 | + | ||
1683 | + if len(atom_b1)<1 or len(atom_b2)<1 : | ||
1684 | + b1_b2=b1_b2 | ||
1685 | + else : | ||
1686 | + | ||
1687 | + b1_b2=distance(atom_b1, atom_b2) | ||
1688 | + | ||
1689 | + last_c4p=atom_c4p | ||
1690 | + f_prec=f | ||
1691 | + | ||
1692 | + | ||
1693 | + liste_dist.append([res.get_resname(), last_c4p_p, p_o5p, o5p_c5p, c5p_c4p, c4p_c1p, c1p_b1, b1_b2]) | ||
1694 | + df=pd.DataFrame(liste_dist, columns=["Residu", "C4'-P", "P-O5'", "O5'-C5'", "C5'-C4'", "C4'-C1'", "C1'-B1", "B1-B2"]) | ||
1695 | + | ||
1696 | + df.to_csv(runDir + '/results/distances_hRNA/' + 'dist_atoms_hire_RNA '+name+'.csv') | ||
1697 | + | ||
1616 | 1698 | ||
1617 | 1699 | ||
1618 | 1700 | ||
... | @@ -1672,7 +1754,7 @@ if __name__ == "__main__": | ... | @@ -1672,7 +1754,7 @@ if __name__ == "__main__": |
1672 | 1754 | ||
1673 | 1755 | ||
1674 | # Load mappings. famlist will contain only families with structures at this resolution threshold. | 1756 | # Load mappings. famlist will contain only families with structures at this resolution threshold. |
1675 | - | 1757 | + ''' |
1676 | print("Loading mappings list...") | 1758 | print("Loading mappings list...") |
1677 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 1759 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
1678 | conn.execute('pragma journal_mode=wal') | 1760 | conn.execute('pragma journal_mode=wal') |
... | @@ -1696,7 +1778,7 @@ if __name__ == "__main__": | ... | @@ -1696,7 +1778,7 @@ if __name__ == "__main__": |
1696 | print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.") | 1778 | print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.") |
1697 | if len(ignored): | 1779 | if len(ignored): |
1698 | print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') | 1780 | print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') |
1699 | - | 1781 | + ''' |
1700 | if DELETE_OLD_DATA: | 1782 | if DELETE_OLD_DATA: |
1701 | for f in famlist: | 1783 | for f in famlist: |
1702 | subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"]) | 1784 | subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"]) |
... | @@ -1714,7 +1796,7 @@ if __name__ == "__main__": | ... | @@ -1714,7 +1796,7 @@ if __name__ == "__main__": |
1714 | 1796 | ||
1715 | # Define the tasks | 1797 | # Define the tasks |
1716 | joblist = [] | 1798 | joblist = [] |
1717 | - | 1799 | + ''' |
1718 | if n_unmapped_chains and DO_WADLEY_ANALYSIS: | 1800 | if n_unmapped_chains and DO_WADLEY_ANALYSIS: |
1719 | joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) | 1801 | joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) |
1720 | joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) | 1802 | joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) |
... | @@ -1736,14 +1818,25 @@ if __name__ == "__main__": | ... | @@ -1736,14 +1818,25 @@ if __name__ == "__main__": |
1736 | joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database | 1818 | joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database |
1737 | if f not in ignored: | 1819 | if f not in ignored: |
1738 | joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database | 1820 | joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database |
1739 | - | 1821 | + ''' |
1740 | #dist_atoms(os.listdir(path_to_3D_data + "rna_only")[0]) | 1822 | #dist_atoms(os.listdir(path_to_3D_data + "rna_only")[0]) |
1741 | 1823 | ||
1742 | - | 1824 | + ''' |
1743 | f_prec=os.listdir(path_to_3D_data + "rna_only")[0] | 1825 | f_prec=os.listdir(path_to_3D_data + "rna_only")[0] |
1744 | #exit() | 1826 | #exit() |
1745 | for f in os.listdir(path_to_3D_data + "rna_only")[:100]: | 1827 | for f in os.listdir(path_to_3D_data + "rna_only")[:100]: |
1746 | joblist.append(Job(function=dist_atoms, args=(f,))) | 1828 | joblist.append(Job(function=dist_atoms, args=(f,))) |
1829 | + ''' | ||
1830 | + | ||
1831 | + #dist_atoms_hire_RNA(os.listdir(path_to_3D_data + "rna_only")[0]) | ||
1832 | + #concatenate('/results/distances/', os.listdir(runDir+'/results/distances/'), 'dist_atoms.csv') | ||
1833 | + #exit() | ||
1834 | + f_prec=os.listdir(path_to_3D_data + "rna_only")[0] | ||
1835 | + for f in os.listdir(path_to_3D_data + "rna_only")[:100]: | ||
1836 | + joblist.append(Job(function=dist_atoms, args=(f,))) | ||
1837 | + joblist.append(Job(function=dist_atoms_hire_RNA, args=(f,))) | ||
1838 | + | ||
1839 | + | ||
1747 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) | 1840 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) |
1748 | pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True) | 1841 | pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True) |
1749 | 1842 | ||
... | @@ -1774,12 +1867,12 @@ if __name__ == "__main__": | ... | @@ -1774,12 +1867,12 @@ if __name__ == "__main__": |
1774 | print() | 1867 | print() |
1775 | print() | 1868 | print() |
1776 | 1869 | ||
1777 | - #concatenate_dist() | ||
1778 | - # finish the work after the parallel portions | ||
1779 | 1870 | ||
1871 | + # finish the work after the parallel portions | ||
1872 | + ''' | ||
1780 | per_chain_stats() | 1873 | per_chain_stats() |
1781 | seq_idty() | 1874 | seq_idty() |
1782 | stats_pairs() | 1875 | stats_pairs() |
1783 | if n_unmapped_chains: | 1876 | if n_unmapped_chains: |
1784 | general_stats() | 1877 | general_stats() |
1785 | - | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
1878 | + ''' | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
-
Please register or login to post a comment