Adapted measures to renumbered 3D structures
Former-commit-id: e1bb72c6
Showing 1 changed file with 38 additions and 136 deletions
... | @@ -1462,11 +1462,12 @@ def measure_from_structure(f): | ... | @@ -1462,11 +1462,12 @@ def measure_from_structure(f): |
1462 | warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning) | 1462 | warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning) |
1463 | warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning) | 1463 | warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning) |
1464 | parser=MMCIFParser() | 1464 | parser=MMCIFParser() |
1465 | - s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f)) | 1465 | + s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "renumbered_rna_only/" + f)) |
1466 | 1466 | ||
1467 | measures_aa(name, s, thr_idx) | 1467 | measures_aa(name, s, thr_idx) |
1468 | if DO_HIRE_RNA_MEASURES: | 1468 | if DO_HIRE_RNA_MEASURES: |
1469 | measures_hrna(name, s, thr_idx) | 1469 | measures_hrna(name, s, thr_idx) |
1470 | + measures_hrna_basepairs(name, s, thr_idx) | ||
1470 | if DO_WADLEY_ANALYSIS: | 1471 | if DO_WADLEY_ANALYSIS: |
1471 | measures_wadley(name, s, thr_idx) | 1472 | measures_wadley(name, s, thr_idx) |
1472 | 1473 | ||
... | @@ -1793,58 +1794,31 @@ def measures_hrna(name, s, thr_idx): | ... | @@ -1793,58 +1794,31 @@ def measures_hrna(name, s, thr_idx): |
1793 | df.to_csv(runDir + '/results/geometry/HiRE-RNA/torsions/angles_torsion_hire_RNA '+name+'.csv') | 1794 | df.to_csv(runDir + '/results/geometry/HiRE-RNA/torsions/angles_torsion_hire_RNA '+name+'.csv') |
1794 | 1795 | ||
1795 | @trace_unhandled_exceptions | 1796 | @trace_unhandled_exceptions |
1796 | -def measure_hrna_basepairs(cle): | 1797 | +def measures_hrna_basepairs(name, s, thr_idx): |
1797 | """ | 1798 | """ |
1798 | - Open a complete RNAcifs/ file, and run measure_hrna_basepairs_chain() on every chain | 1799 | + Open a renumbered_rna_only/ file, and run measures_hrna_basepairs_chain() on every chain |
1799 | """ | 1800 | """ |
1800 | 1801 | ||
1801 | - global idxQueue | 1802 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measures_hrna_basepairs({name})") |
1802 | - thr_idx = idxQueue.get() | ||
1803 | - setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measure_hrna_basepairs({cle})") | ||
1804 | 1803 | ||
1805 | - # Open the structure | ||
1806 | - with warnings.catch_warnings(): | ||
1807 | - # Ignore the PDB problems. This mostly warns that some chain is discontinuous. | ||
1808 | - warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning) | ||
1809 | - warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning) | ||
1810 | - parser=MMCIFParser() | ||
1811 | - s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "RNAcifs/" + cle + ".cif")) | ||
1812 | - | ||
1813 | l=[] | 1804 | l=[] |
1814 | - for model in s: | 1805 | + chain = next(s[0].get_chains()) |
1815 | - for valeur in chain_list[cle]: | ||
1816 | 1806 | ||
1817 | - # do not recompute something already computed | 1807 | + # do not recompute something already computed |
1818 | - if path.isfile(runDir + "/results/geometry/HiRE-RNA/basepairs/basepairs "+cle+'_'+valeur+".csv"): | 1808 | + if path.isfile(runDir + "/results/geometry/HiRE-RNA/basepairs/basepairs "+name+".csv"): |
1819 | - continue | 1809 | + return |
1820 | 1810 | ||
1821 | - if len(valeur) > 2: # if several RNA chains in the same structure | 1811 | + df=pd.read_csv(os.path.abspath(path_to_3D_data +"datapoints/" + name)) |
1822 | - df_tot=[] | 1812 | + |
1823 | - for id_chain in tqdm(valeur, desc=f"Worker {thr_idx+1}: Chains in {cle}", unit="chains", leave=False): | 1813 | + if df['index_chain'][0]==1:#ignore files with numbering errors |
1824 | - for data in ld: | 1814 | + l = measures_hrna_basepairs_chain(chain, df, thr_idx) |
1825 | - if (len(data)<10): #unmapped | 1815 | + |
1826 | - chaine=str.split(data, '_') | 1816 | + df_calc=pd.DataFrame(l, columns=["Chaine", "type LW", "Resseq", "Num paired", "Distance", "C4'-C1'-B1", "C1'-B1-B1pair", "B1-B1pair-C1'pair", "B1pair-C1'pair-C4'pair"]) |
1827 | - if chaine[0]==cle and chaine[2]==id_chain: | 1817 | + df_calc.to_csv(runDir + "/results/geometry/HiRE-RNA/basepairs/"+'basepairs '+name+'.csv') |
1828 | - df=pd.read_csv(os.path.abspath(path_to_3D_data +"datapoints/" + data)) | ||
1829 | - if df['index_chain'][0]==1:#ignore files with numbering errors | ||
1830 | - l = measure_hrna_basepairs_chain(model[id_chain], df, thr_idx) | ||
1831 | - else : # only one RNA chain | ||
1832 | - for data in ld: | ||
1833 | - if (len(data)<10): #unmapped | ||
1834 | - chaine=str.split(data, '_') | ||
1835 | - if chaine[0]==cle and chaine[2]==valeur: | ||
1836 | - df=pd.read_csv(os.path.abspath(path_to_3D_data + "datapoints/" + data)) | ||
1837 | - if df['index_chain'][0]==1: | ||
1838 | - l = measure_hrna_basepairs_chain(model[valeur], df, thr_idx) | ||
1839 | - | ||
1840 | - df_calc=pd.DataFrame(l, columns=["Chaine", "type LW", "Resseq", "Num paired", "Distance", "C4'-C1'-B1", "C1'-B1-B1pair", "B1-B1pair-C1'pair", "B1pair-C1'pair-C4'pair"]) | ||
1841 | - df_calc.to_csv(runDir + "/results/geometry/HiRE-RNA/basepairs/"+'basepairs '+cle+'_'+valeur+'.csv') | ||
1842 | 1818 | ||
1843 | - idxQueue.put(thr_idx) # replace the thread index in the queue | ||
1844 | - setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
1845 | 1819 | ||
1846 | @trace_unhandled_exceptions | 1820 | @trace_unhandled_exceptions |
1847 | -def measure_hrna_basepairs_chain(chain, df, thr_idx): | 1821 | +def measures_hrna_basepairs_chain(chain, df, thr_idx): |
1848 | """ | 1822 | """ |
1849 | Cleanup of the dataset | 1823 | Cleanup of the dataset |
1850 | measurements of distances and angles between paired nucleotides in the chain | 1824 | measurements of distances and angles between paired nucleotides in the chain |
... | @@ -1888,74 +1862,30 @@ def measure_hrna_basepairs_chain(chain, df, thr_idx): | ... | @@ -1888,74 +1862,30 @@ def measure_hrna_basepairs_chain(chain, df, thr_idx): |
1888 | indexNames=pairs[pairs['paired_int'] == 0].index | 1862 | indexNames=pairs[pairs['paired_int'] == 0].index |
1889 | pairs.drop(indexNames, inplace=True)#deletion of lines with a 0 in paired_int (matching to another RNA chain) | 1863 | pairs.drop(indexNames, inplace=True)#deletion of lines with a 0 in paired_int (matching to another RNA chain) |
1890 | 1864 | ||
1891 | - for i in pairs.index: | 1865 | + for i in tqdm(pairs.index, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {chain} measures_hrna_basepairs_chain", unit="res", leave=False): |
1892 | """ | 1866 | """ |
1893 | calculations for each row of the pairs dataset | 1867 | calculations for each row of the pairs dataset |
1894 | """ | 1868 | """ |
1895 | - resseq=pairs.at[i, 'old_nt_resnum'] #number of the residue in the chain | ||
1896 | - #code to delete letters in old_nt_resnum | ||
1897 | - icode_res=' ' | ||
1898 | - if type(resseq) is str: | ||
1899 | - if resseq[0] != '-' : | ||
1900 | - while resseq.isdigit() is False: | ||
1901 | - l=len(resseq) | ||
1902 | - if icode_res==' ': | ||
1903 | - icode_res=resseq[l-1] | ||
1904 | - else : | ||
1905 | - icode_res=resseq[l-1]+icode_res | ||
1906 | - resseq=resseq[:l-1] | ||
1907 | - resseq=int(resseq) | ||
1908 | index=pairs.at[i, 'index_chain'] | 1869 | index=pairs.at[i, 'index_chain'] |
1909 | type_LW=pairs.at[i, 'pair_type_LW_bis'] #pairing type | 1870 | type_LW=pairs.at[i, 'pair_type_LW_bis'] #pairing type |
1910 | num_paired=pairs.at[i, 'paired_int'] #number (index_chain) of the paired nucleotide | 1871 | num_paired=pairs.at[i, 'paired_int'] #number (index_chain) of the paired nucleotide |
1911 | 1872 | ||
1912 | - | ||
1913 | if type(num_paired) is int or type(num_paired) is np.int64: | 1873 | if type(num_paired) is int or type(num_paired) is np.int64: |
1914 | - l=pairs[pairs['index_chain']==num_paired].index.to_list() | ||
1915 | - | ||
1916 | - resnum=pairs.at[l[0], 'old_nt_resnum'] | ||
1917 | - icode_pair=' ' | ||
1918 | - if type(resnum) is str: | ||
1919 | - if resnum[0] != '-' : | ||
1920 | - while resnum.isdigit() is False: | ||
1921 | - l=len(resnum) | ||
1922 | - if icode_pair==' ': | ||
1923 | - icode_pair=resnum[l-1] | ||
1924 | - else : | ||
1925 | - icode_pair=resnum[l-1]+icode_pair | ||
1926 | - resnum=resnum[:l-1] | ||
1927 | - | ||
1928 | - resnum=int(resnum) | ||
1929 | try : | 1874 | try : |
1930 | - d = basepair_apex_distance(chain[(' ',resseq, icode_res)], chain[(' ', resnum, icode_pair)]) # calculation of the distance between the tips of the paired nucleotides | 1875 | + d = basepair_apex_distance(chain[(' ',index, ' ')], chain[(' ', num_paired, ' ')]) |
1931 | - angle = basepair_flat_angle(chain[(' ', resseq, icode_res)], chain[(' ', resnum, icode_pair)]) | 1876 | + angle = basepair_flat_angle(chain[(' ', index, ' ')], chain[(' ', num_paired, ' ')]) |
1932 | if d != 0.0: | 1877 | if d != 0.0: |
1933 | - liste_dist.append([chain, type_LW, resseq, resnum, d, angle[0], angle[1], angle[2], angle[3]]) | 1878 | + liste_dist.append([chain, type_LW, index, num_paired, d, angle[0], angle[1], angle[2], angle[3]]) |
1934 | except : | 1879 | except : |
1935 | pass | 1880 | pass |
1936 | else : | 1881 | else : |
1937 | for j in range(len(num_paired)): #if several pairings, process them one by one | 1882 | for j in range(len(num_paired)): #if several pairings, process them one by one |
1938 | if num_paired[j] != 0 : | 1883 | if num_paired[j] != 0 : |
1939 | - l=pairs[pairs['index_chain']==num_paired[j]].index.to_list() | ||
1940 | - | ||
1941 | - resnum=pairs.at[l[0], 'old_nt_resnum'] | ||
1942 | - | ||
1943 | - icode_pair=' ' | ||
1944 | - if type(resnum) is str: | ||
1945 | - if resnum[0] != '-' : | ||
1946 | - while resnum.isdigit() is False: | ||
1947 | - l=len(resnum) | ||
1948 | - if icode_pair==' ': | ||
1949 | - icode_pair=resnum[l-1] | ||
1950 | - else : | ||
1951 | - icode_pair=resnum[l-1]+icode_pair | ||
1952 | - resnum=resnum[:l-1] | ||
1953 | - resnum=int(resnum) | ||
1954 | try : | 1884 | try : |
1955 | - d = basepair_apex_distance(chain[(' ', resseq, icode_res)], chain[(' ', resnum, icode_pair)]) | 1885 | + d = basepair_apex_distance(chain[(' ', index, ' ')], chain[(' ', num_paired[j], ' ')]) |
1956 | - angle = basepair_flat_angle(chain[(' ', resseq, icode_res)], chain[(' ', resnum, icode_pair)]) | 1886 | + angle = basepair_flat_angle(chain[(' ', index, ' ')], chain[(' ', num_paired[j], ' ')]) |
1957 | if d != 0.0: | 1887 | if d != 0.0: |
1958 | - liste_dist.append([chain, type_LW[j], resseq, resnum, d, angle[0], angle[1], angle[2], angle[3]]) | 1888 | + liste_dist.append([chain, type_LW[j], index, num_paired[j], d, angle[0], angle[1], angle[2], angle[3]]) |
1959 | except: | 1889 | except: |
1960 | pass | 1890 | pass |
1961 | 1891 | ||
... | @@ -2759,7 +2689,7 @@ def gmm_hrna_basepairs(): | ... | @@ -2759,7 +2689,7 @@ def gmm_hrna_basepairs(): |
2759 | GMM_histo(cHS_dist, "cHS", toric=False, hist=False, couleur='deeppink') | 2689 | GMM_histo(cHS_dist, "cHS", toric=False, hist=False, couleur='deeppink') |
2760 | GMM_histo(cSH_dist, "cSH", toric=False, hist=False, couleur='navy') | 2690 | GMM_histo(cSH_dist, "cSH", toric=False, hist=False, couleur='navy') |
2761 | plt.xlabel('Distance (Angström)') | 2691 | plt.xlabel('Distance (Angström)') |
2762 | - plt.title("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs cis ("+str(nc)+ " valeurs)", fontsize=9) | 2692 | + plt.title("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs cis ("+str(nc)+ " valeurs)", fontsize=8) |
2763 | plt.savefig("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs cis (" +str(nc)+ " valeurs).png") | 2693 | plt.savefig("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs cis (" +str(nc)+ " valeurs).png") |
2764 | plt.close() | 2694 | plt.close() |
2765 | 2695 | ||
... | @@ -2774,36 +2704,13 @@ def gmm_hrna_basepairs(): | ... | @@ -2774,36 +2704,13 @@ def gmm_hrna_basepairs(): |
2774 | GMM_histo(tHS_dist, "tHS", toric=False, hist=False, couleur='tan') | 2704 | GMM_histo(tHS_dist, "tHS", toric=False, hist=False, couleur='tan') |
2775 | GMM_histo(tSH_dist, "tSH", toric=False, hist=False, couleur='lime') | 2705 | GMM_histo(tSH_dist, "tSH", toric=False, hist=False, couleur='lime') |
2776 | plt.xlabel('Distance (Angström)') | 2706 | plt.xlabel('Distance (Angström)') |
2777 | - plt.title("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs trans ("+str(nt)+ " valeurs)", fontsize=9) | 2707 | + plt.title("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs trans ("+str(nt)+ " valeurs)", fontsize=8) |
2778 | plt.savefig("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs trans (" +str(nt)+ " valeurs).png") | 2708 | plt.savefig("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs trans (" +str(nt)+ " valeurs).png") |
2779 | plt.close() | 2709 | plt.close() |
2780 | 2710 | ||
2781 | os.chdir(runDir) | 2711 | os.chdir(runDir) |
2782 | setproctitle(f"GMM (HiRE-RNA basepairs) finished") | 2712 | setproctitle(f"GMM (HiRE-RNA basepairs) finished") |
2783 | 2713 | ||
2784 | -def list_chains_in_dir(ld): | ||
2785 | - """ | ||
2786 | - creates a dictionary of chains available in files from the ld list. | ||
2787 | - key = pdb identifier of the structure | ||
2788 | - value = list of RNA chains | ||
2789 | - """ | ||
2790 | - dictionnaire=dict() | ||
2791 | - pdb=set() | ||
2792 | - for f in ld: | ||
2793 | - pdb_id = str.split(f, '_')[0] | ||
2794 | - pdb.add(pdb_id) # we create a list of distinct structures | ||
2795 | - for pdb_id in tqdm(pdb, desc="Scanning datapoints/ files content", leave=False): # for all structures found | ||
2796 | - liste_chaines = [] | ||
2797 | - for f in ld: | ||
2798 | - if (len(f)<10): # unmapped to a Rfam family | ||
2799 | - chaine = str.split(f, '_') | ||
2800 | - if chaine[0] == pdb_id: | ||
2801 | - id_chain = chaine[2] | ||
2802 | - liste_chaines.append(id_chain) | ||
2803 | - if liste_chaines != []: | ||
2804 | - dictionnaire[pdb_id] = liste_chaines | ||
2805 | - return dictionnaire | ||
2806 | - | ||
2807 | @trace_unhandled_exceptions | 2714 | @trace_unhandled_exceptions |
2808 | def concat_dataframes(fpath, outfilename): | 2715 | def concat_dataframes(fpath, outfilename): |
2809 | """ | 2716 | """ |
... | @@ -2989,6 +2896,7 @@ if __name__ == "__main__": | ... | @@ -2989,6 +2896,7 @@ if __name__ == "__main__": |
2989 | joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False))) | 2896 | joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False))) |
2990 | 2897 | ||
2991 | # Do general family statistics | 2898 | # Do general family statistics |
2899 | + | ||
2992 | joblist.append(Job(function=stats_len)) # Computes figures about chain lengths | 2900 | joblist.append(Job(function=stats_len)) # Computes figures about chain lengths |
2993 | joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families) | 2901 | joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families) |
2994 | for f in famlist: | 2902 | for f in famlist: |
... | @@ -2996,27 +2904,20 @@ if __name__ == "__main__": | ... | @@ -2996,27 +2904,20 @@ if __name__ == "__main__": |
2996 | if f not in ignored: | 2904 | if f not in ignored: |
2997 | joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families) | 2905 | joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families) |
2998 | 2906 | ||
2907 | + | ||
2999 | # Do geometric measures on all chains | 2908 | # Do geometric measures on all chains |
2909 | + | ||
3000 | if n_unmapped_chains: | 2910 | if n_unmapped_chains: |
3001 | os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True) | 2911 | os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True) |
3002 | - f_prec = os.listdir(path_to_3D_data + "rna_only")[0] | 2912 | + liste_struct=os.listdir(path_to_3D_data + "renumbered_rna_only") |
3003 | - for f in os.listdir(path_to_3D_data + "rna_only"): | 2913 | + f_prec = os.listdir(path_to_3D_data + "renumbered_rna_only")[0] |
2914 | + if '4zdo_1_E.cif' in liste_struct: | ||
2915 | + liste_struct.remove('4zdo_1_E.cif') # weird cases to remove for now | ||
2916 | + if '4zdp_1_E.cif' in liste_struct: | ||
2917 | + liste_struct.remove('4zdp_1_E.cif') | ||
2918 | + for f in liste_struct: | ||
3004 | joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances | 2919 | joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances |
3005 | 2920 | ||
3006 | - # Basepair geometries statistics (from RNACifs/ 3D files) | ||
3007 | - | ||
3008 | - ld = os.listdir(path_to_3D_data +'datapoints') | ||
3009 | - if '4zdo_1_E' in ld : | ||
3010 | - ld.remove('4zdo_1_E') # weird cases to remove for now | ||
3011 | - if '4zdp_1_E' in ld : | ||
3012 | - ld.remove('4zdp_1_E') | ||
3013 | - chain_list = list_chains_in_dir(ld) | ||
3014 | - for c in chain_list.keys(): | ||
3015 | - joblist.append(Job(function=measure_hrna_basepairs, args=(c,), how_many_in_parallel=nworkers)) | ||
3016 | - | ||
3017 | - | ||
3018 | - #exit() | ||
3019 | - | ||
3020 | 2921 | ||
3021 | process_jobs(joblist) | 2922 | process_jobs(joblist) |
3022 | 2923 | ||
... | @@ -3037,6 +2938,7 @@ if __name__ == "__main__": | ... | @@ -3037,6 +2938,7 @@ if __name__ == "__main__": |
3037 | per_chain_stats() # per chain base frequencies en basepair types | 2938 | per_chain_stats() # per chain base frequencies en basepair types |
3038 | seq_idty() # identity matrices from pre-computed .npy matrices | 2939 | seq_idty() # identity matrices from pre-computed .npy matrices |
3039 | stats_pairs() | 2940 | stats_pairs() |
2941 | + | ||
3040 | if n_unmapped_chains: | 2942 | if n_unmapped_chains: |
3041 | general_stats() | 2943 | general_stats() |
3042 | os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True) | 2944 | os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True) |
... | @@ -3062,4 +2964,4 @@ if __name__ == "__main__": | ... | @@ -3062,4 +2964,4 @@ if __name__ == "__main__": |
3062 | joblist.append(Job(function=gmm_wadley, args=())) | 2964 | joblist.append(Job(function=gmm_wadley, args=())) |
3063 | if len(joblist): | 2965 | if len(joblist): |
3064 | process_jobs(joblist) | 2966 | process_jobs(joblist) |
3065 | - | 2967 | + | ... | ... |
-
Please register or login to post a comment