Aglaé TABOT

Adapted measures to renumbered 3D structures


Former-commit-id: e1bb72c6
...@@ -1462,11 +1462,12 @@ def measure_from_structure(f): ...@@ -1462,11 +1462,12 @@ def measure_from_structure(f):
1462 warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning) 1462 warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
1463 warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning) 1463 warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
1464 parser=MMCIFParser() 1464 parser=MMCIFParser()
1465 - s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f)) 1465 + s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "renumbered_rna_only/" + f))
1466 1466
1467 measures_aa(name, s, thr_idx) 1467 measures_aa(name, s, thr_idx)
1468 if DO_HIRE_RNA_MEASURES: 1468 if DO_HIRE_RNA_MEASURES:
1469 measures_hrna(name, s, thr_idx) 1469 measures_hrna(name, s, thr_idx)
1470 + measures_hrna_basepairs(name, s, thr_idx)
1470 if DO_WADLEY_ANALYSIS: 1471 if DO_WADLEY_ANALYSIS:
1471 measures_wadley(name, s, thr_idx) 1472 measures_wadley(name, s, thr_idx)
1472 1473
...@@ -1793,58 +1794,31 @@ def measures_hrna(name, s, thr_idx): ...@@ -1793,58 +1794,31 @@ def measures_hrna(name, s, thr_idx):
1793 df.to_csv(runDir + '/results/geometry/HiRE-RNA/torsions/angles_torsion_hire_RNA '+name+'.csv') 1794 df.to_csv(runDir + '/results/geometry/HiRE-RNA/torsions/angles_torsion_hire_RNA '+name+'.csv')
1794 1795
1795 @trace_unhandled_exceptions 1796 @trace_unhandled_exceptions
1796 -def measure_hrna_basepairs(cle): 1797 +def measures_hrna_basepairs(name, s, thr_idx):
1797 """ 1798 """
1798 - Open a complete RNAcifs/ file, and run measure_hrna_basepairs_chain() on every chain 1799 + Open a renumbered_rna_only/ file, and run measures_hrna_basepairs_chain() on every chain
1799 """ 1800 """
1800 1801
1801 - global idxQueue 1802 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measures_hrna_basepairs({name})")
1802 - thr_idx = idxQueue.get()
1803 - setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measure_hrna_basepairs({cle})")
1804 1803
1805 - # Open the structure
1806 - with warnings.catch_warnings():
1807 - # Ignore the PDB problems. This mostly warns that some chain is discontinuous.
1808 - warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
1809 - warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
1810 - parser=MMCIFParser()
1811 - s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "RNAcifs/" + cle + ".cif"))
1812 -
1813 l=[] 1804 l=[]
1814 - for model in s: 1805 + chain = next(s[0].get_chains())
1815 - for valeur in chain_list[cle]:
1816 1806
1817 - # do not recompute something already computed 1807 + # do not recompute something already computed
1818 - if path.isfile(runDir + "/results/geometry/HiRE-RNA/basepairs/basepairs "+cle+'_'+valeur+".csv"): 1808 + if path.isfile(runDir + "/results/geometry/HiRE-RNA/basepairs/basepairs "+name+".csv"):
1819 - continue 1809 + return
1820 1810
1821 - if len(valeur) > 2: # if several RNA chains in the same structure 1811 + df=pd.read_csv(os.path.abspath(path_to_3D_data +"datapoints/" + name))
1822 - df_tot=[] 1812 +
1823 - for id_chain in tqdm(valeur, desc=f"Worker {thr_idx+1}: Chains in {cle}", unit="chains", leave=False): 1813 + if df['index_chain'][0]==1:#ignore files with numbering errors
1824 - for data in ld: 1814 + l = measures_hrna_basepairs_chain(chain, df, thr_idx)
1825 - if (len(data)<10): #unmapped 1815 +
1826 - chaine=str.split(data, '_') 1816 + df_calc=pd.DataFrame(l, columns=["Chaine", "type LW", "Resseq", "Num paired", "Distance", "C4'-C1'-B1", "C1'-B1-B1pair", "B1-B1pair-C1'pair", "B1pair-C1'pair-C4'pair"])
1827 - if chaine[0]==cle and chaine[2]==id_chain: 1817 + df_calc.to_csv(runDir + "/results/geometry/HiRE-RNA/basepairs/"+'basepairs '+name+'.csv')
1828 - df=pd.read_csv(os.path.abspath(path_to_3D_data +"datapoints/" + data))
1829 - if df['index_chain'][0]==1:#ignore files with numbering errors
1830 - l = measure_hrna_basepairs_chain(model[id_chain], df, thr_idx)
1831 - else : # only one RNA chain
1832 - for data in ld:
1833 - if (len(data)<10): #unmapped
1834 - chaine=str.split(data, '_')
1835 - if chaine[0]==cle and chaine[2]==valeur:
1836 - df=pd.read_csv(os.path.abspath(path_to_3D_data + "datapoints/" + data))
1837 - if df['index_chain'][0]==1:
1838 - l = measure_hrna_basepairs_chain(model[valeur], df, thr_idx)
1839 -
1840 - df_calc=pd.DataFrame(l, columns=["Chaine", "type LW", "Resseq", "Num paired", "Distance", "C4'-C1'-B1", "C1'-B1-B1pair", "B1-B1pair-C1'pair", "B1pair-C1'pair-C4'pair"])
1841 - df_calc.to_csv(runDir + "/results/geometry/HiRE-RNA/basepairs/"+'basepairs '+cle+'_'+valeur+'.csv')
1842 1818
1843 - idxQueue.put(thr_idx) # replace the thread index in the queue
1844 - setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
1845 1819
1846 @trace_unhandled_exceptions 1820 @trace_unhandled_exceptions
1847 -def measure_hrna_basepairs_chain(chain, df, thr_idx): 1821 +def measures_hrna_basepairs_chain(chain, df, thr_idx):
1848 """ 1822 """
1849 Cleanup of the dataset 1823 Cleanup of the dataset
1850 measurements of distances and angles between paired nucleotides in the chain 1824 measurements of distances and angles between paired nucleotides in the chain
...@@ -1888,74 +1862,30 @@ def measure_hrna_basepairs_chain(chain, df, thr_idx): ...@@ -1888,74 +1862,30 @@ def measure_hrna_basepairs_chain(chain, df, thr_idx):
1888 indexNames=pairs[pairs['paired_int'] == 0].index 1862 indexNames=pairs[pairs['paired_int'] == 0].index
1889 pairs.drop(indexNames, inplace=True)#deletion of lines with a 0 in paired_int (matching to another RNA chain) 1863 pairs.drop(indexNames, inplace=True)#deletion of lines with a 0 in paired_int (matching to another RNA chain)
1890 1864
1891 - for i in pairs.index: 1865 + for i in tqdm(pairs.index, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {chain} measures_hrna_basepairs_chain", unit="res", leave=False):
1892 """ 1866 """
1893 calculations for each row of the pairs dataset 1867 calculations for each row of the pairs dataset
1894 """ 1868 """
1895 - resseq=pairs.at[i, 'old_nt_resnum'] #number of the residue in the chain
1896 - #code to delete letters in old_nt_resnum
1897 - icode_res=' '
1898 - if type(resseq) is str:
1899 - if resseq[0] != '-' :
1900 - while resseq.isdigit() is False:
1901 - l=len(resseq)
1902 - if icode_res==' ':
1903 - icode_res=resseq[l-1]
1904 - else :
1905 - icode_res=resseq[l-1]+icode_res
1906 - resseq=resseq[:l-1]
1907 - resseq=int(resseq)
1908 index=pairs.at[i, 'index_chain'] 1869 index=pairs.at[i, 'index_chain']
1909 type_LW=pairs.at[i, 'pair_type_LW_bis'] #pairing type 1870 type_LW=pairs.at[i, 'pair_type_LW_bis'] #pairing type
1910 num_paired=pairs.at[i, 'paired_int'] #number (index_chain) of the paired nucleotide 1871 num_paired=pairs.at[i, 'paired_int'] #number (index_chain) of the paired nucleotide
1911 1872
1912 -
1913 if type(num_paired) is int or type(num_paired) is np.int64: 1873 if type(num_paired) is int or type(num_paired) is np.int64:
1914 - l=pairs[pairs['index_chain']==num_paired].index.to_list()
1915 -
1916 - resnum=pairs.at[l[0], 'old_nt_resnum']
1917 - icode_pair=' '
1918 - if type(resnum) is str:
1919 - if resnum[0] != '-' :
1920 - while resnum.isdigit() is False:
1921 - l=len(resnum)
1922 - if icode_pair==' ':
1923 - icode_pair=resnum[l-1]
1924 - else :
1925 - icode_pair=resnum[l-1]+icode_pair
1926 - resnum=resnum[:l-1]
1927 -
1928 - resnum=int(resnum)
1929 try : 1874 try :
1930 - d = basepair_apex_distance(chain[(' ',resseq, icode_res)], chain[(' ', resnum, icode_pair)]) # calculation of the distance between the tips of the paired nucleotides 1875 + d = basepair_apex_distance(chain[(' ',index, ' ')], chain[(' ', num_paired, ' ')])
1931 - angle = basepair_flat_angle(chain[(' ', resseq, icode_res)], chain[(' ', resnum, icode_pair)]) 1876 + angle = basepair_flat_angle(chain[(' ', index, ' ')], chain[(' ', num_paired, ' ')])
1932 if d != 0.0: 1877 if d != 0.0:
1933 - liste_dist.append([chain, type_LW, resseq, resnum, d, angle[0], angle[1], angle[2], angle[3]]) 1878 + liste_dist.append([chain, type_LW, index, num_paired, d, angle[0], angle[1], angle[2], angle[3]])
1934 except : 1879 except :
1935 pass 1880 pass
1936 else : 1881 else :
1937 for j in range(len(num_paired)): #if several pairings, process them one by one 1882 for j in range(len(num_paired)): #if several pairings, process them one by one
1938 if num_paired[j] != 0 : 1883 if num_paired[j] != 0 :
1939 - l=pairs[pairs['index_chain']==num_paired[j]].index.to_list()
1940 -
1941 - resnum=pairs.at[l[0], 'old_nt_resnum']
1942 -
1943 - icode_pair=' '
1944 - if type(resnum) is str:
1945 - if resnum[0] != '-' :
1946 - while resnum.isdigit() is False:
1947 - l=len(resnum)
1948 - if icode_pair==' ':
1949 - icode_pair=resnum[l-1]
1950 - else :
1951 - icode_pair=resnum[l-1]+icode_pair
1952 - resnum=resnum[:l-1]
1953 - resnum=int(resnum)
1954 try : 1884 try :
1955 - d = basepair_apex_distance(chain[(' ', resseq, icode_res)], chain[(' ', resnum, icode_pair)]) 1885 + d = basepair_apex_distance(chain[(' ', index, ' ')], chain[(' ', num_paired[j], ' ')])
1956 - angle = basepair_flat_angle(chain[(' ', resseq, icode_res)], chain[(' ', resnum, icode_pair)]) 1886 + angle = basepair_flat_angle(chain[(' ', index, ' ')], chain[(' ', num_paired[j], ' ')])
1957 if d != 0.0: 1887 if d != 0.0:
1958 - liste_dist.append([chain, type_LW[j], resseq, resnum, d, angle[0], angle[1], angle[2], angle[3]]) 1888 + liste_dist.append([chain, type_LW[j], index, num_paired[j], d, angle[0], angle[1], angle[2], angle[3]])
1959 except: 1889 except:
1960 pass 1890 pass
1961 1891
...@@ -2759,7 +2689,7 @@ def gmm_hrna_basepairs(): ...@@ -2759,7 +2689,7 @@ def gmm_hrna_basepairs():
2759 GMM_histo(cHS_dist, "cHS", toric=False, hist=False, couleur='deeppink') 2689 GMM_histo(cHS_dist, "cHS", toric=False, hist=False, couleur='deeppink')
2760 GMM_histo(cSH_dist, "cSH", toric=False, hist=False, couleur='navy') 2690 GMM_histo(cSH_dist, "cSH", toric=False, hist=False, couleur='navy')
2761 plt.xlabel('Distance (Angström)') 2691 plt.xlabel('Distance (Angström)')
2762 - plt.title("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs cis ("+str(nc)+ " valeurs)", fontsize=9) 2692 + plt.title("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs cis ("+str(nc)+ " valeurs)", fontsize=8)
2763 plt.savefig("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs cis (" +str(nc)+ " valeurs).png") 2693 plt.savefig("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs cis (" +str(nc)+ " valeurs).png")
2764 plt.close() 2694 plt.close()
2765 2695
...@@ -2774,36 +2704,13 @@ def gmm_hrna_basepairs(): ...@@ -2774,36 +2704,13 @@ def gmm_hrna_basepairs():
2774 GMM_histo(tHS_dist, "tHS", toric=False, hist=False, couleur='tan') 2704 GMM_histo(tHS_dist, "tHS", toric=False, hist=False, couleur='tan')
2775 GMM_histo(tSH_dist, "tSH", toric=False, hist=False, couleur='lime') 2705 GMM_histo(tSH_dist, "tSH", toric=False, hist=False, couleur='lime')
2776 plt.xlabel('Distance (Angström)') 2706 plt.xlabel('Distance (Angström)')
2777 - plt.title("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs trans ("+str(nt)+ " valeurs)", fontsize=9) 2707 + plt.title("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs trans ("+str(nt)+ " valeurs)", fontsize=8)
2778 plt.savefig("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs trans (" +str(nt)+ " valeurs).png") 2708 plt.savefig("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs trans (" +str(nt)+ " valeurs).png")
2779 plt.close() 2709 plt.close()
2780 2710
2781 os.chdir(runDir) 2711 os.chdir(runDir)
2782 setproctitle(f"GMM (HiRE-RNA basepairs) finished") 2712 setproctitle(f"GMM (HiRE-RNA basepairs) finished")
2783 2713
2784 -def list_chains_in_dir(ld):
2785 - """
2786 - creates a dictionary of chains available in files from the ld list.
2787 - key = pdb identifier of the structure
2788 - value = list of RNA chains
2789 - """
2790 - dictionnaire=dict()
2791 - pdb=set()
2792 - for f in ld:
2793 - pdb_id = str.split(f, '_')[0]
2794 - pdb.add(pdb_id) # we create a list of distinct structures
2795 - for pdb_id in tqdm(pdb, desc="Scanning datapoints/ files content", leave=False): # for all structures found
2796 - liste_chaines = []
2797 - for f in ld:
2798 - if (len(f)<10): # unmapped to a Rfam family
2799 - chaine = str.split(f, '_')
2800 - if chaine[0] == pdb_id:
2801 - id_chain = chaine[2]
2802 - liste_chaines.append(id_chain)
2803 - if liste_chaines != []:
2804 - dictionnaire[pdb_id] = liste_chaines
2805 - return dictionnaire
2806 -
2807 @trace_unhandled_exceptions 2714 @trace_unhandled_exceptions
2808 def concat_dataframes(fpath, outfilename): 2715 def concat_dataframes(fpath, outfilename):
2809 """ 2716 """
...@@ -2989,6 +2896,7 @@ if __name__ == "__main__": ...@@ -2989,6 +2896,7 @@ if __name__ == "__main__":
2989 joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False))) 2896 joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False)))
2990 2897
2991 # Do general family statistics 2898 # Do general family statistics
2899 +
2992 joblist.append(Job(function=stats_len)) # Computes figures about chain lengths 2900 joblist.append(Job(function=stats_len)) # Computes figures about chain lengths
2993 joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families) 2901 joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families)
2994 for f in famlist: 2902 for f in famlist:
...@@ -2996,27 +2904,20 @@ if __name__ == "__main__": ...@@ -2996,27 +2904,20 @@ if __name__ == "__main__":
2996 if f not in ignored: 2904 if f not in ignored:
2997 joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families) 2905 joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families)
2998 2906
2907 +
2999 # Do geometric measures on all chains 2908 # Do geometric measures on all chains
2909 +
3000 if n_unmapped_chains: 2910 if n_unmapped_chains:
3001 os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True) 2911 os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True)
3002 - f_prec = os.listdir(path_to_3D_data + "rna_only")[0] 2912 + liste_struct=os.listdir(path_to_3D_data + "renumbered_rna_only")
3003 - for f in os.listdir(path_to_3D_data + "rna_only"): 2913 + f_prec = os.listdir(path_to_3D_data + "renumbered_rna_only")[0]
2914 + if '4zdo_1_E.cif' in liste_struct:
2915 + liste_struct.remove('4zdo_1_E.cif') # weird cases to remove for now
2916 + if '4zdp_1_E.cif' in liste_struct:
2917 + liste_struct.remove('4zdp_1_E.cif')
2918 + for f in liste_struct:
3004 joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances 2919 joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances
3005 2920
3006 - # Basepair geometries statistics (from RNACifs/ 3D files)
3007 -
3008 - ld = os.listdir(path_to_3D_data +'datapoints')
3009 - if '4zdo_1_E' in ld :
3010 - ld.remove('4zdo_1_E') # weird cases to remove for now
3011 - if '4zdp_1_E' in ld :
3012 - ld.remove('4zdp_1_E')
3013 - chain_list = list_chains_in_dir(ld)
3014 - for c in chain_list.keys():
3015 - joblist.append(Job(function=measure_hrna_basepairs, args=(c,), how_many_in_parallel=nworkers))
3016 -
3017 -
3018 - #exit()
3019 -
3020 2921
3021 process_jobs(joblist) 2922 process_jobs(joblist)
3022 2923
...@@ -3037,6 +2938,7 @@ if __name__ == "__main__": ...@@ -3037,6 +2938,7 @@ if __name__ == "__main__":
3037 per_chain_stats() # per chain base frequencies en basepair types 2938 per_chain_stats() # per chain base frequencies en basepair types
3038 seq_idty() # identity matrices from pre-computed .npy matrices 2939 seq_idty() # identity matrices from pre-computed .npy matrices
3039 stats_pairs() 2940 stats_pairs()
2941 +
3040 if n_unmapped_chains: 2942 if n_unmapped_chains:
3041 general_stats() 2943 general_stats()
3042 os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True) 2944 os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True)
...@@ -3062,4 +2964,4 @@ if __name__ == "__main__": ...@@ -3062,4 +2964,4 @@ if __name__ == "__main__":
3062 joblist.append(Job(function=gmm_wadley, args=())) 2964 joblist.append(Job(function=gmm_wadley, args=()))
3063 if len(joblist): 2965 if len(joblist):
3064 process_jobs(joblist) 2966 process_jobs(joblist)
3065 - 2967 +
......