Louis BECQUEY

split HRNA bp by LW type

Showing 1 changed file with 404 additions and 433 deletions
...@@ -19,6 +19,7 @@ import matplotlib.patches as mpatches ...@@ -19,6 +19,7 @@ import matplotlib.patches as mpatches
19 import scipy.cluster.hierarchy as sch 19 import scipy.cluster.hierarchy as sch
20 import sklearn 20 import sklearn
21 import json 21 import json
22 +import glob
22 import pickle 23 import pickle
23 import Bio 24 import Bio
24 from scipy.spatial.distance import squareform 25 from scipy.spatial.distance import squareform
...@@ -1377,72 +1378,107 @@ def pos_b2(res): ...@@ -1377,72 +1378,107 @@ def pos_b2(res):
1377 else: 1378 else:
1378 return [] 1379 return []
1379 1380
1380 -def basepair_apex_distance(res, pair): 1381 +@trace_unhandled_exceptions
1381 - """ 1382 +def basepair_measures(res, pair):
1382 - measure of the distance between the tips of the paired nucleotides (B1 / B1 or B1 / B2 or B2 / B2)
1383 - """
1384 - dist=[]
1385 - d=0
1386 - if res.get_resname()=='A' or res.get_resname()=='G' :# different cases if 1 aromatic cycle or 2
1387 - atom_res=pos_b2(res)
1388 - if pair.get_resname()=='A' or pair.get_resname()=='G' :
1389 - atom_pair=pos_b2(pair)
1390 - if pair.get_resname()=='C' or pair.get_resname()=='U' :
1391 - atom_pair=pos_b1(pair)
1392 -
1393 - if res.get_resname()=='C' or res.get_resname()=='U' :
1394 - atom_res=pos_b1(res)
1395 - if pair.get_resname()=='A' or pair.get_resname()=='G' :
1396 - atom_pair=pos_b2(pair)
1397 - if pair.get_resname()=='C' or pair.get_resname()=='U' :
1398 - atom_pair=pos_b1(pair)
1399 -
1400 - dist = get_euclidian_distance(atom_res, atom_pair)
1401 -
1402 - return dist
1403 -
1404 -def basepair_flat_angle(res, pair):
1405 """ 1383 """
1406 - measurement of the plane angles formed by the vectors C1->B1 of the paired nucleotides 1384 + measurement of the flat angles describing a basepair in the HiRE-RNA model
1407 """ 1385 """
1408 if res.get_resname()=='C' or res.get_resname()=='U' : 1386 if res.get_resname()=='C' or res.get_resname()=='U' :
1409 atom_c4_res = [ atom.get_coord() for atom in res if "C4'" in atom.get_fullname() ] 1387 atom_c4_res = [ atom.get_coord() for atom in res if "C4'" in atom.get_fullname() ]
1410 atom_c1p_res = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ] 1388 atom_c1p_res = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ]
1411 atom_b1_res = pos_b1(res) 1389 atom_b1_res = pos_b1(res)
1412 - a1_res = Vector(atom_c4_res[0]) 1390 + if not len(atom_c4_res) or not len(atom_c1p_res) or not len(atom_b1_res):
1391 + return
1392 + a3_res = Vector(atom_c4_res[0])
1413 a2_res = Vector(atom_c1p_res[0]) 1393 a2_res = Vector(atom_c1p_res[0])
1414 - a3_res = Vector(atom_b1_res[0]) 1394 + a1_res = Vector(atom_b1_res[0])
1415 if res.get_resname()=='A' or res.get_resname()=='G' : 1395 if res.get_resname()=='A' or res.get_resname()=='G' :
1416 atom_c1p_res = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ] 1396 atom_c1p_res = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ]
1417 atom_b1_res = pos_b1(res) 1397 atom_b1_res = pos_b1(res)
1418 atom_b2_res = pos_b2(res) 1398 atom_b2_res = pos_b2(res)
1419 - a1_res = Vector(atom_c1p_res[0]) 1399 + if not len(atom_c1p_res) or not len(atom_b1_res) or not len(atom_b2_res):
1400 + return
1401 + a3_res = Vector(atom_c1p_res[0])
1420 a2_res = Vector(atom_b1_res[0]) 1402 a2_res = Vector(atom_b1_res[0])
1421 - a3_res = Vector(atom_b2_res[0]) 1403 + a1_res = Vector(atom_b2_res[0])
1422 1404
1423 if pair.get_resname()=='C' or pair.get_resname()=='U' : 1405 if pair.get_resname()=='C' or pair.get_resname()=='U' :
1424 atom_c4_pair = [ atom.get_coord() for atom in pair if "C4'" in atom.get_fullname() ] 1406 atom_c4_pair = [ atom.get_coord() for atom in pair if "C4'" in atom.get_fullname() ]
1425 atom_c1p_pair = [ atom.get_coord() for atom in pair if "C1'" in atom.get_fullname() ] 1407 atom_c1p_pair = [ atom.get_coord() for atom in pair if "C1'" in atom.get_fullname() ]
1426 atom_b1_pair = pos_b1(pair) 1408 atom_b1_pair = pos_b1(pair)
1427 - a1_pair = Vector(atom_c4_pair[0]) 1409 + if not len(atom_c4_pair) or not len(atom_c1p_pair) or not len(atom_b1_pair):
1410 + return
1411 + a3_pair = Vector(atom_c4_pair[0])
1428 a2_pair = Vector(atom_c1p_pair[0]) 1412 a2_pair = Vector(atom_c1p_pair[0])
1429 - a3_pair = Vector(atom_b1_pair) 1413 + a1_pair = Vector(atom_b1_pair[0])
1430 if pair.get_resname()=='A' or pair.get_resname()=='G' : 1414 if pair.get_resname()=='A' or pair.get_resname()=='G' :
1431 atom_c1p_pair = [ atom.get_coord() for atom in pair if "C1'" in atom.get_fullname() ] 1415 atom_c1p_pair = [ atom.get_coord() for atom in pair if "C1'" in atom.get_fullname() ]
1432 atom_b1_pair = pos_b1(pair) 1416 atom_b1_pair = pos_b1(pair)
1433 atom_b2_pair = pos_b2(pair) 1417 atom_b2_pair = pos_b2(pair)
1434 - a1_pair = Vector(atom_c1p_pair[0]) 1418 + if not len(atom_c1p_pair) or not len(atom_b1_pair) or not len(atom_b2_pair): # No C1' atom in the paired nucleotide, skip measures.
1419 + return
1420 + a3_pair = Vector(atom_c1p_pair[0])
1435 a2_pair = Vector(atom_b1_pair[0]) 1421 a2_pair = Vector(atom_b1_pair[0])
1436 - a3_pair = Vector(atom_b2_pair[0]) 1422 + a1_pair = Vector(atom_b2_pair[0])
1437 1423
1438 - # we calculate the 4 plane angles including these vectors 1424 + # Bond vectors
1439 - 1425 + res_32 = a3_res - a2_res
1440 - a = calc_angle(a1_res, a2_res, a3_res)*(180/np.pi) 1426 + res_12 = a1_res - a2_res
1441 - b = calc_angle(a2_res, a3_res, a3_pair)*(180/np.pi) 1427 + pair_32 = a3_pair - a2_pair
1442 - c = calc_angle(a3_res, a3_pair, a2_pair)*(180/np.pi) 1428 + pair_12 = a1_pair - a2_pair
1443 - d = calc_angle(a3_pair, a2_pair, a1_pair)*(180/np.pi) 1429 + rho = a1_res - a1_pair # from pair to res
1444 - angles = [a, b, c, d] 1430 +
1445 - return angles 1431 + # dist
1432 + dist = rho.norm()
1433 +
1434 + # we calculate the 2 plane angles
1435 + with warnings.catch_warnings():
1436 + warnings.simplefilter('ignore', RuntimeWarning)
1437 + b = res_12.angle(rho)*(180/np.pi) # equal to the previous implementation
1438 + c = pair_12.angle(-rho)*(180/np.pi) #
1439 + # a = calc_angle(a1_res, a2_res, a3_res)*(180/np.pi) # not required
1440 + # b = calc_angle(a2_res, a1_res, a1_pair)*(180/np.pi)
1441 + # c = calc_angle(a1_res, a1_pair, a2_pair)*(180/np.pi)
1442 + # d = calc_angle(a3_pair, a2_pair, a1_pair)*(180/np.pi) # not required
1443 +
1444 + # Compute plane vectors
1445 + n1 = (res_32**res_12).normalized() # ** between vectors, is the cross product
1446 + n2 = (pair_32**pair_12).normalized()
1447 +
1448 + # Distances between base tip and the other base's plane (orthogonal projection)
1449 + # if angle(rho, n) > pi/2 the distance is negative (signed following n)
1450 + d1 = rho*n1 # projection of rho on axis n1
1451 + d2 = rho*n2
1452 +
1453 + # Now the projection of rho in the planes. It's just a sum of the triangles' two other edges.
1454 + p1 = (-rho+n1**d1).normalized() # between vector and scalar, ** is the multiplication by a scalar
1455 + p2 = (rho-n2**d2).normalized()
1456 +
1457 + # Measure tau, the dihedral
1458 + u = (res_12**rho).normalized()
1459 + v = (rho**pair_12).normalized()
1460 + w1 = n1**u
1461 + w2 = v**n2
1462 + cosTau1 = n1*u
1463 + cosTau2 = v*n2
1464 + tau1 = np.arccos(cosTau1)*(180/np.pi)
1465 + tau2 = np.arccos(cosTau2)*(180/np.pi)
1466 + if res_12*w1 < 0:
1467 + tau1 = -tau1
1468 + if pair_12*w2 < 0:
1469 + tau2 = -tau2
1470 +
1471 + # And finally, the a1 and a2 angles between res_12 and p1 / pair_12 and p2
1472 + with warnings.catch_warnings():
1473 + warnings.simplefilter('ignore', RuntimeWarning)
1474 + a1 = res_12.angle(p1)*(180/np.pi)
1475 + a2 = pair_12.angle(p2)*(180/np.pi)
1476 + if cosTau1 < 0:
1477 + a1 = -a1
1478 + if cosTau2 < 0:
1479 + a2 = -a2
1480 +
1481 + return [dist, b, c, d1, d2, a1, a2, tau1, tau2]
1446 1482
1447 @trace_unhandled_exceptions 1483 @trace_unhandled_exceptions
1448 def measure_from_structure(f): 1484 def measure_from_structure(f):
...@@ -1482,7 +1518,7 @@ def measures_wadley(name, s, thr_idx): ...@@ -1482,7 +1518,7 @@ def measures_wadley(name, s, thr_idx):
1482 """ 1518 """
1483 1519
1484 # do not recompute something already computed 1520 # do not recompute something already computed
1485 - if (path.isfile(runDir + '/results/geometry/Pyle/angles/angles_plans_wadley ' + name + '.csv') and 1521 + if (path.isfile(runDir + '/results/geometry/Pyle/angles/flat_angles_pyle ' + name + '.csv') and
1486 path.isfile(runDir + "/results/geometry/Pyle/distances/distances_wadley " + name + ".csv")): 1522 path.isfile(runDir + "/results/geometry/Pyle/distances/distances_wadley " + name + ".csv")):
1487 return 1523 return
1488 1524
...@@ -1524,7 +1560,7 @@ def measures_wadley(name, s, thr_idx): ...@@ -1524,7 +1560,7 @@ def measures_wadley(name, s, thr_idx):
1524 df = pd.DataFrame(liste_dist, columns=["Residu", "C1'-P", "P-C1'", "C4'-P", "P-C4'"]) 1560 df = pd.DataFrame(liste_dist, columns=["Residu", "C1'-P", "P-C1'", "C4'-P", "P-C4'"])
1525 df.to_csv(runDir + "/results/geometry/Pyle/distances/distances_wadley " + name + ".csv") 1561 df.to_csv(runDir + "/results/geometry/Pyle/distances/distances_wadley " + name + ".csv")
1526 df = pd.DataFrame(liste_angl, columns=["Residu", "P-C1'-P°", "C1'-P°-C1'°"]) 1562 df = pd.DataFrame(liste_angl, columns=["Residu", "P-C1'-P°", "C1'-P°-C1'°"])
1527 - df.to_csv(runDir + "/results/geometry/Pyle/angles/angles_plans_wadley "+name+".csv") 1563 + df.to_csv(runDir + "/results/geometry/Pyle/angles/flat_angles_pyle "+name+".csv")
1528 1564
1529 @trace_unhandled_exceptions 1565 @trace_unhandled_exceptions
1530 def measures_aa(name, s, thr_idx): 1566 def measures_aa(name, s, thr_idx):
...@@ -1533,7 +1569,7 @@ def measures_aa(name, s, thr_idx): ...@@ -1533,7 +1569,7 @@ def measures_aa(name, s, thr_idx):
1533 """ 1569 """
1534 1570
1535 # do not recompute something already computed 1571 # do not recompute something already computed
1536 - if path.isfile(runDir+"/results/geometry/all-atoms/distances/dist_atoms "+name+".csv"): 1572 + if path.isfile(runDir+"/results/geometry/all-atoms/distances/dist_atoms_"+name+".csv"):
1537 return 1573 return
1538 1574
1539 last_o3p = [] # o3 'of the previous nucleotide linked to the P of the current nucleotide 1575 last_o3p = [] # o3 'of the previous nucleotide linked to the P of the current nucleotide
...@@ -1685,7 +1721,7 @@ def measures_aa(name, s, thr_idx): ...@@ -1685,7 +1721,7 @@ def measures_aa(name, s, thr_idx):
1685 df=pd.concat([df_comm, df_pur, df_pyr], axis = 1) 1721 df=pd.concat([df_comm, df_pur, df_pyr], axis = 1)
1686 pbar.close() 1722 pbar.close()
1687 1723
1688 - df.to_csv(runDir + "/results/geometry/all-atoms/distances/dist_atoms " + name + ".csv") 1724 + df.to_csv(runDir + "/results/geometry/all-atoms/distances/dist_atoms_" + name + ".csv")
1689 1725
1690 @trace_unhandled_exceptions 1726 @trace_unhandled_exceptions
1691 def measures_hrna(name, s, thr_idx): 1727 def measures_hrna(name, s, thr_idx):
...@@ -1810,89 +1846,87 @@ def measures_hrna_basepairs(name, s, thr_idx): ...@@ -1810,89 +1846,87 @@ def measures_hrna_basepairs(name, s, thr_idx):
1810 1846
1811 df=pd.read_csv(os.path.abspath(path_to_3D_data +"datapoints/" + name)) 1847 df=pd.read_csv(os.path.abspath(path_to_3D_data +"datapoints/" + name))
1812 1848
1813 - if df['index_chain'][0]==1:#ignore files with numbering errors 1849 + if df['index_chain'][0] == 1: # ignore files with numbering errors : TODO : remove when we get DSSR Pro, there should not be numbering errors anymore
1814 - l = measures_hrna_basepairs_chain(chain, df, thr_idx) 1850 + l = measures_hrna_basepairs_chain(name, chain, df, thr_idx)
1815 - 1851 + df_calc = pd.DataFrame(l, columns=["type_LW", "nt1_idx", "nt1_res", "nt2_idx", "nt2_res", "Distance",
1816 - df_calc=pd.DataFrame(l, columns=["Chaine", "type LW", "Resseq", "Num paired", "Distance", "C4'-C1'-B1", "C1'-B1-B1pair", "B1-B1pair-C1'pair", "B1pair-C1'pair-C4'pair"]) 1852 + "211_angle", "112_angle", "dB1", "dB2", "alpha1", "alpha2", "3211_torsion", "1123_torsion"])
1817 - df_calc.to_csv(runDir + "/results/geometry/HiRE-RNA/basepairs/"+'basepairs '+name+'.csv') 1853 + df_calc.to_csv(runDir + "/results/geometry/HiRE-RNA/basepairs/"+'basepairs ' + name + '.csv', float_format="%.3f")
1818 -
1819 1854
1820 @trace_unhandled_exceptions 1855 @trace_unhandled_exceptions
1821 -def measures_hrna_basepairs_chain(chain, df, thr_idx): 1856 +def measures_hrna_basepairs_chain(name, chain, df, thr_idx):
1822 """ 1857 """
1823 Cleanup of the dataset 1858 Cleanup of the dataset
1824 measurements of distances and angles between paired nucleotides in the chain 1859 measurements of distances and angles between paired nucleotides in the chain
1825 """ 1860 """
1826 1861
1827 - liste_dist=[] 1862 + results = []
1828 warnings.simplefilter(action="ignore", category=SettingWithCopyWarning) 1863 warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
1829 1864
1830 pairs = df[['index_chain', 'old_nt_resnum', 'paired', 'pair_type_LW']] # columns we keep 1865 pairs = df[['index_chain', 'old_nt_resnum', 'paired', 'pair_type_LW']] # columns we keep
1831 - for i in range(pairs.shape[0]): #we remove the lines where no pairing (NaN in paired) 1866 + for i in range(pairs.shape[0]): # we remove the lines where no pairing (NaN in paired)
1832 - index_with_nan=pairs.index[pairs.iloc[:,2].isnull()] 1867 + index_with_nan = pairs.index[pairs.iloc[:,2].isnull()]
1833 pairs.drop(index_with_nan, 0, inplace=True) 1868 pairs.drop(index_with_nan, 0, inplace=True)
1834 1869
1835 - paired_int=[] 1870 + paired_int = []
1836 - for i in pairs.index:# convert values ​​from paired to integers or lists of integers 1871 + for i in pairs.index: # convert values ​​from paired to integers or lists of integers
1837 - paired=pairs.at[i, 'paired'] 1872 + paired = pairs.at[i, 'paired']
1838 if type(paired) is np.int64 or type(paired) is np.float64: 1873 if type(paired) is np.int64 or type(paired) is np.float64:
1839 paired_int.append(int(paired)) 1874 paired_int.append(int(paired))
1840 else : #strings 1875 else : #strings
1841 - if len(paired)<3 : #a single pairing 1876 + if len(paired) < 3: # a single pairing
1842 paired_int.append(int(paired)) 1877 paired_int.append(int(paired))
1843 - else : #several pairings 1878 + else : # several pairings
1844 - paired=paired.split(',') 1879 + paired = paired.split(',')
1845 - l=[int(i) for i in paired] 1880 + l = [ int(i) for i in paired ]
1846 paired_int.append(l) 1881 paired_int.append(l)
1847 1882
1848 - pair_type_LW_bis=[] 1883 + pair_type_LW_bis = []
1849 for j in pairs.index: 1884 for j in pairs.index:
1850 pair_type_LW = pairs.at[j, 'pair_type_LW'] 1885 pair_type_LW = pairs.at[j, 'pair_type_LW']
1851 - if len(pair_type_LW)<4 : #a single pairing 1886 + if len(pair_type_LW) < 4 : # a single pairing
1852 pair_type_LW_bis.append(pair_type_LW) 1887 pair_type_LW_bis.append(pair_type_LW)
1853 - else : #several pairings 1888 + else : # several pairings
1854 - pair_type_LW=pair_type_LW.split(',') 1889 + pair_type_LW = pair_type_LW.split(',')
1855 - l=[i for i in pair_type_LW] 1890 + l = [ i for i in pair_type_LW ]
1856 pair_type_LW_bis.append(pair_type_LW) 1891 pair_type_LW_bis.append(pair_type_LW)
1857 1892
1858 - #addition of these new columns 1893 + # addition of these new columns
1859 pairs.insert(4, "paired_int", paired_int, True) 1894 pairs.insert(4, "paired_int", paired_int, True)
1860 pairs.insert(5, "pair_type_LW_bis", pair_type_LW_bis, True) 1895 pairs.insert(5, "pair_type_LW_bis", pair_type_LW_bis, True)
1861 1896
1862 - indexNames=pairs[pairs['paired_int'] == 0].index 1897 + indexNames = pairs[pairs['paired_int'] == 0].index
1863 - pairs.drop(indexNames, inplace=True)#deletion of lines with a 0 in paired_int (matching to another RNA chain) 1898 + pairs.drop(indexNames, inplace=True) # deletion of lines with a 0 in paired_int (matching to another RNA chain)
1864 - 1899 +
1865 - for i in tqdm(pairs.index, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {chain} measures_hrna_basepairs_chain", unit="res", leave=False): 1900 + for i in tqdm(pairs.index, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} measures_hrna_basepairs_chain", unit="res", leave=False):
1866 - """ 1901 + # calculations for each row of the pairs dataset
1867 - calculations for each row of the pairs dataset 1902 + index = pairs.at[i, 'index_chain']
1868 - """ 1903 + res1 = chain[(' ', index, ' ')].get_resname()
1869 - index=pairs.at[i, 'index_chain'] 1904 + if res1 not in ['A','C','G','U']:
1870 - type_LW=pairs.at[i, 'pair_type_LW_bis'] #pairing type 1905 + continue
1871 - num_paired=pairs.at[i, 'paired_int'] #number (index_chain) of the paired nucleotide 1906 + type_LW = pairs.at[i, 'pair_type_LW_bis'] # pairing type
1907 + num_paired = pairs.at[i, 'paired_int'] # number (index_chain) of the paired nucleotide
1872 1908
1873 if type(num_paired) is int or type(num_paired) is np.int64: 1909 if type(num_paired) is int or type(num_paired) is np.int64:
1874 - try : 1910 + res2 = chain[(' ', num_paired, ' ')].get_resname()
1875 - d = basepair_apex_distance(chain[(' ',index, ' ')], chain[(' ', num_paired, ' ')]) 1911 + if res2 not in ["A","C","G","U"]:
1876 - angle = basepair_flat_angle(chain[(' ', index, ' ')], chain[(' ', num_paired, ' ')]) 1912 + continue
1877 - if d != 0.0: 1913 + measures = basepair_measures(chain[(' ', index, ' ')], chain[(' ', num_paired, ' ')])
1878 - liste_dist.append([chain, type_LW, index, num_paired, d, angle[0], angle[1], angle[2], angle[3]]) 1914 + if measures is not None:
1879 - except : 1915 + results.append([type_LW, index, res1, num_paired, res2] + measures)
1880 - pass 1916 + else:
1881 - else : 1917 + for j in range(len(num_paired)): # if several pairings, process them one by one
1882 - for j in range(len(num_paired)): #if several pairings, process them one by one 1918 + if num_paired[j] != 0:
1883 - if num_paired[j] != 0 : 1919 + res2 = chain[(' ', num_paired[j], ' ')].get_resname()
1884 - try : 1920 + if res2 not in ["A","C","G","U"]:
1885 - d = basepair_apex_distance(chain[(' ', index, ' ')], chain[(' ', num_paired[j], ' ')]) 1921 + continue
1886 - angle = basepair_flat_angle(chain[(' ', index, ' ')], chain[(' ', num_paired[j], ' ')]) 1922 + measures = basepair_measures(chain[(' ', index, ' ')], chain[(' ', num_paired[j], ' ')])
1887 - if d != 0.0: 1923 + if measures is not None:
1888 - liste_dist.append([chain, type_LW[j], index, num_paired[j], d, angle[0], angle[1], angle[2], angle[3]]) 1924 + results.append([type_LW[j], index, res1, num_paired[j], res2] + measures)
1889 - except: 1925 +
1890 - pass 1926 + return results
1891 -
1892 - return(liste_dist)
1893 1927
1894 @trace_unhandled_exceptions 1928 @trace_unhandled_exceptions
1895 -def GMM_histo(data_ori, name_data, toric=False, hist=True, couleur=None, save=True) : 1929 +def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True) :
1896 """ 1930 """
1897 Plot Gaussian-Mixture-Model (with or without histograms) 1931 Plot Gaussian-Mixture-Model (with or without histograms)
1898 """ 1932 """
...@@ -1906,8 +1940,8 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, couleur=None, save=Tr ...@@ -1906,8 +1940,8 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, couleur=None, save=Tr
1906 1940
1907 # chooses the number of components based on the maximum likelihood value (maxlogv) 1941 # chooses the number of components based on the maximum likelihood value (maxlogv)
1908 n_components_range = np.arange(8)+1 1942 n_components_range = np.arange(8)+1
1909 - aic = [] 1943 + # aic = []
1910 - bic = [] 1944 + # bic = []
1911 maxlogv=[] 1945 maxlogv=[]
1912 md = np.array(data).reshape(-1,1) 1946 md = np.array(data).reshape(-1,1)
1913 nb_components = 1 1947 nb_components = 1
...@@ -1915,8 +1949,8 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, couleur=None, save=Tr ...@@ -1915,8 +1949,8 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, couleur=None, save=Tr
1915 log_max = 0 1949 log_max = 0
1916 for n_comp in n_components_range: 1950 for n_comp in n_components_range:
1917 gmm = GaussianMixture(n_components=n_comp).fit(md) 1951 gmm = GaussianMixture(n_components=n_comp).fit(md)
1918 - aic.append(abs(gmm.aic(md))) 1952 + # aic.append(abs(gmm.aic(md)))
1919 - bic.append(abs(gmm.bic(md))) 1953 + # bic.append(abs(gmm.bic(md)))
1920 maxlogv.append(gmm.lower_bound_) 1954 maxlogv.append(gmm.lower_bound_)
1921 if gmm.lower_bound_== max(maxlogv) : # takes the maximum 1955 if gmm.lower_bound_== max(maxlogv) : # takes the maximum
1922 nb_components = n_comp 1956 nb_components = n_comp
...@@ -1962,10 +1996,10 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, couleur=None, save=Tr ...@@ -1962,10 +1996,10 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, couleur=None, save=Tr
1962 if hist: 1996 if hist:
1963 plt.hist(data_ori, color="green", edgecolor='black', linewidth=1.2, bins=50, density=True) 1997 plt.hist(data_ori, color="green", edgecolor='black', linewidth=1.2, bins=50, density=True)
1964 if toric: 1998 if toric:
1965 - plt.xlabel("Angle (Degré)") 1999 + plt.xlabel("Angle (Degrees)")
1966 else: 2000 else:
1967 - plt.xlabel("Distance (Angström)") 2001 + plt.xlabel("Distance (Angströms)")
1968 - plt.ylabel("Densité") 2002 + plt.ylabel("Density")
1969 2003
1970 # Prepare the GMM curve with some absciss points 2004 # Prepare the GMM curve with some absciss points
1971 if toric: 2005 if toric:
...@@ -1985,7 +2019,7 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, couleur=None, save=Tr ...@@ -1985,7 +2019,7 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, couleur=None, save=Tr
1985 summary_data["std"] = [] 2019 summary_data["std"] = []
1986 2020
1987 # plot 2021 # plot
1988 - courbes = [] 2022 + curves = []
1989 for i in range(nb_components): 2023 for i in range(nb_components):
1990 2024
1991 # store the parameters 2025 # store the parameters
...@@ -2022,25 +2056,25 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, couleur=None, save=Tr ...@@ -2022,25 +2056,25 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, couleur=None, save=Tr
2022 plt.plot(newx, newy, c=colors[i]) 2056 plt.plot(newx, newy, c=colors[i])
2023 else: 2057 else:
2024 # store for later summation 2058 # store for later summation
2025 - courbes.append(np.array(newy)) 2059 + curves.append(np.array(newy))
2026 2060
2027 if hist: 2061 if hist:
2028 - plt.title("Histogramme " +name_data+ " avec GMM pour " +str(nb_components)+ " composantes (" + str(len(data_ori))+" valeurs)") 2062 + plt.title(f"Histogram of {name_data} with GMM of {nb_components} components (" + str(len(data_ori))+" values)")
2029 if save: 2063 if save:
2030 - plt.savefig("Histogramme " +name_data+ " avec GMM pour " +str(nb_components)+ " composantes (" + str(len(data_ori))+" valeurs).png") 2064 + plt.savefig(f"Histogram_{name_data}_{nb_components}_comps.png")
2031 plt.close() 2065 plt.close()
2032 else: 2066 else:
2033 # Plot their sum, do not save figure yet 2067 # Plot their sum, do not save figure yet
2034 try: 2068 try:
2035 - plt.plot(newx, sum(courbes), c=couleur, label=name_data) 2069 + plt.plot(newx, sum(curves), c=col, label=name_data)
2036 except TypeError: 2070 except TypeError:
2037 - print("N curves:", len(courbes)) 2071 + print("N curves:", len(curves))
2038 - for c in courbes: 2072 + for c in curves:
2039 print(c) 2073 print(c)
2040 plt.legend() 2074 plt.legend()
2041 2075
2042 # Save the json 2076 # Save the json
2043 - with open(runDir + "/results/geometry/json/" +name_data + " .json", 'w', encoding='utf-8') as f: 2077 + with open(runDir + "/results/geometry/json/" +name_data + ".json", 'w', encoding='utf-8') as f:
2044 json.dump(summary_data, f, indent=4) 2078 json.dump(summary_data, f, indent=4)
2045 2079
2046 @trace_unhandled_exceptions 2080 @trace_unhandled_exceptions
...@@ -2122,25 +2156,25 @@ def gmm_aa_dists(): ...@@ -2122,25 +2156,25 @@ def gmm_aa_dists():
2122 GMM_histo(c2p_o2p, "C2'-O2'") 2156 GMM_histo(c2p_o2p, "C2'-O2'")
2123 2157
2124 if len(op3_p) > 0 : 2158 if len(op3_p) > 0 :
2125 - GMM_histo(op3_p, "OP3-P", toric=False, hist=False, couleur= 'lightcoral') 2159 + GMM_histo(op3_p, "OP3-P", toric=False, hist=False, col= 'lightcoral')
2126 - GMM_histo(p_op1, "P-OP1", toric=False, hist=False, couleur='gold') 2160 + GMM_histo(p_op1, "P-OP1", toric=False, hist=False, col='gold')
2127 - GMM_histo(p_op2, "P-OP2", toric=False, hist=False, couleur='lightseagreen') 2161 + GMM_histo(p_op2, "P-OP2", toric=False, hist=False, col='lightseagreen')
2128 - GMM_histo(last_o3p_p, "O3'-P", toric=False, hist=False, couleur='saddlebrown') 2162 + GMM_histo(last_o3p_p, "O3'-P", toric=False, hist=False, col='saddlebrown')
2129 - GMM_histo(p_o5p, "P-O5'", toric=False, hist=False, couleur='darkturquoise') 2163 + GMM_histo(p_o5p, "P-O5'", toric=False, hist=False, col='darkturquoise')
2130 - GMM_histo(o5p_c5p, "O5'-C5'", toric=False, hist=False, couleur='darkkhaki') 2164 + GMM_histo(o5p_c5p, "O5'-C5'", toric=False, hist=False, col='darkkhaki')
2131 - GMM_histo(c5p_c4p, "C5'-C4'", toric=False, hist=False, couleur='indigo') 2165 + GMM_histo(c5p_c4p, "C5'-C4'", toric=False, hist=False, col='indigo')
2132 - GMM_histo(c4p_o4p, "C4'-O4'", toric=False, hist=False, couleur='maroon') 2166 + GMM_histo(c4p_o4p, "C4'-O4'", toric=False, hist=False, col='maroon')
2133 - GMM_histo(c4p_c3p, "C4'-C3'", toric=False, hist=False, couleur='burlywood') 2167 + GMM_histo(c4p_c3p, "C4'-C3'", toric=False, hist=False, col='burlywood')
2134 - GMM_histo(c3p_o3p, "C3'-O3'", toric=False, hist=False, couleur='steelblue') 2168 + GMM_histo(c3p_o3p, "C3'-O3'", toric=False, hist=False, col='steelblue')
2135 - GMM_histo(o4p_c1p, "O4'-C1'", toric=False, hist=False, couleur='tomato') 2169 + GMM_histo(o4p_c1p, "O4'-C1'", toric=False, hist=False, col='tomato')
2136 - GMM_histo(c1p_c2p, "C1'-C2'", toric=False, hist=False, couleur='darkolivegreen') 2170 + GMM_histo(c1p_c2p, "C1'-C2'", toric=False, hist=False, col='darkolivegreen')
2137 - GMM_histo(c2p_c3p, "C2'-C3'", toric=False, hist=False, couleur='orchid') 2171 + GMM_histo(c2p_c3p, "C2'-C3'", toric=False, hist=False, col='orchid')
2138 - GMM_histo(c2p_o2p, "C2'-O2'", toric=False, hist=False, couleur='deeppink') 2172 + GMM_histo(c2p_o2p, "C2'-O2'", toric=False, hist=False, col='deeppink')
2139 axes=plt.gca() 2173 axes=plt.gca()
2140 axes.set_ylim(0, 100) 2174 axes.set_ylim(0, 100)
2141 - plt.xlabel("Distance (Angström)") 2175 + plt.xlabel("Distance (Angströms)")
2142 - plt.title("GMM des distances entre atomes communs ") 2176 + plt.title("GMM of distances between common atoms ")
2143 - plt.savefig(runDir + "/results/figures/GMM/all-atoms/distances/commun/" + "GMM des distances entre atomes communs .png") 2177 + plt.savefig(runDir + "/results/figures/GMM/all-atoms/distances/commun/" + "GMM_distances_common_atoms.png")
2144 plt.close() 2178 plt.close()
2145 2179
2146 os.makedirs(runDir+"/results/figures/GMM/all-atoms/distances/purines/", exist_ok=True) 2180 os.makedirs(runDir+"/results/figures/GMM/all-atoms/distances/purines/", exist_ok=True)
...@@ -2161,25 +2195,25 @@ def gmm_aa_dists(): ...@@ -2161,25 +2195,25 @@ def gmm_aa_dists():
2161 GMM_histo(c4_n9, "C4-N9") 2195 GMM_histo(c4_n9, "C4-N9")
2162 GMM_histo(c4_c5, "C4-C5") 2196 GMM_histo(c4_c5, "C4-C5")
2163 2197
2164 - GMM_histo(c1p_n9, "C1'-N9", hist=False, couleur='lightcoral') 2198 + GMM_histo(c1p_n9, "C1'-N9", hist=False, col='lightcoral')
2165 - GMM_histo(n9_c8, "N9-C8", hist=False, couleur='gold') 2199 + GMM_histo(n9_c8, "N9-C8", hist=False, col='gold')
2166 - GMM_histo(c8_n7, "C8-N7", hist=False, couleur='lightseagreen') 2200 + GMM_histo(c8_n7, "C8-N7", hist=False, col='lightseagreen')
2167 - GMM_histo(n7_c5, "N7-C5", hist=False, couleur='saddlebrown') 2201 + GMM_histo(n7_c5, "N7-C5", hist=False, col='saddlebrown')
2168 - GMM_histo(c5_c6, "C5-C6", hist=False, couleur='darkturquoise') 2202 + GMM_histo(c5_c6, "C5-C6", hist=False, col='darkturquoise')
2169 - GMM_histo(c6_o6, "C6-O6", hist=False, couleur='darkkhaki') 2203 + GMM_histo(c6_o6, "C6-O6", hist=False, col='darkkhaki')
2170 - GMM_histo(c6_n6, "C6-N6", hist=False, couleur='indigo') 2204 + GMM_histo(c6_n6, "C6-N6", hist=False, col='indigo')
2171 - GMM_histo(c6_n1, "C6-N1", hist=False, couleur='maroon') 2205 + GMM_histo(c6_n1, "C6-N1", hist=False, col='maroon')
2172 - GMM_histo(n1_c2, "N1-C2", hist=False, couleur='burlywood') 2206 + GMM_histo(n1_c2, "N1-C2", hist=False, col='burlywood')
2173 - GMM_histo(c2_n2, "C2-N2", hist=False, couleur='steelblue') 2207 + GMM_histo(c2_n2, "C2-N2", hist=False, col='steelblue')
2174 - GMM_histo(c2_n3, "C2-N3", hist=False, couleur='tomato') 2208 + GMM_histo(c2_n3, "C2-N3", hist=False, col='tomato')
2175 - GMM_histo(n3_c4, "N3-C4", hist=False, couleur='darkolivegreen') 2209 + GMM_histo(n3_c4, "N3-C4", hist=False, col='darkolivegreen')
2176 - GMM_histo(c4_n9, "C4-N9", hist=False, couleur='orchid') 2210 + GMM_histo(c4_n9, "C4-N9", hist=False, col='orchid')
2177 - GMM_histo(c4_c5, "C4-C5", hist=False, couleur='deeppink') 2211 + GMM_histo(c4_c5, "C4-C5", hist=False, col='deeppink')
2178 axes=plt.gca() 2212 axes=plt.gca()
2179 axes.set_ylim(0, 100) 2213 axes.set_ylim(0, 100)
2180 - plt.xlabel("Distance (Angström)") 2214 + plt.xlabel("Distance (Angströms)")
2181 - plt.title("GMM des distances entre atomes des cycles purines", fontsize=10) 2215 + plt.title("GMM of distances between atoms of the purine cycles", fontsize=10)
2182 - plt.savefig(runDir+ "/results/figures/GMM/all-atoms/distances/purines/" + "GMM des distances entre atomes des cycles purines.png") 2216 + plt.savefig(runDir+ "/results/figures/GMM/all-atoms/distances/purines/" + "GMM_distances_purine_cycles.png")
2183 plt.close() 2217 plt.close()
2184 2218
2185 os.makedirs(runDir+"/results/figures/GMM/all-atoms/distances/pyrimidines/", exist_ok=True) 2219 os.makedirs(runDir+"/results/figures/GMM/all-atoms/distances/pyrimidines/", exist_ok=True)
...@@ -2197,22 +2231,22 @@ def gmm_aa_dists(): ...@@ -2197,22 +2231,22 @@ def gmm_aa_dists():
2197 GMM_histo(c4_n4, "C4-N4") 2231 GMM_histo(c4_n4, "C4-N4")
2198 GMM_histo(c4_o4, "C4-O4") 2232 GMM_histo(c4_o4, "C4-O4")
2199 2233
2200 - GMM_histo(c1p_n1, "C1'-N1", hist=False, couleur='lightcoral') 2234 + GMM_histo(c1p_n1, "C1'-N1", hist=False, col='lightcoral')
2201 - GMM_histo(n1_c6, "N1-C6", hist=False, couleur='gold') 2235 + GMM_histo(n1_c6, "N1-C6", hist=False, col='gold')
2202 - GMM_histo(c6_c5, "C6-C5", hist=False, couleur='lightseagreen') 2236 + GMM_histo(c6_c5, "C6-C5", hist=False, col='lightseagreen')
2203 - GMM_histo(c5_c4, "C5-C4", hist=False, couleur='deeppink') 2237 + GMM_histo(c5_c4, "C5-C4", hist=False, col='deeppink')
2204 - GMM_histo(c4_n3, "C4-N3", hist=False, couleur='red') 2238 + GMM_histo(c4_n3, "C4-N3", hist=False, col='red')
2205 - GMM_histo(n3_c2, "N3-C2", hist=False, couleur='lime') 2239 + GMM_histo(n3_c2, "N3-C2", hist=False, col='lime')
2206 - GMM_histo(c2_o2, "C2-O2", hist=False, couleur='indigo') 2240 + GMM_histo(c2_o2, "C2-O2", hist=False, col='indigo')
2207 - GMM_histo(c2_n1, "C2-N1", hist=False, couleur='maroon') 2241 + GMM_histo(c2_n1, "C2-N1", hist=False, col='maroon')
2208 - GMM_histo(c4_n4, "C4-N4", hist=False, couleur='burlywood') 2242 + GMM_histo(c4_n4, "C4-N4", hist=False, col='burlywood')
2209 - GMM_histo(c4_o4, "C4-O4", hist=False, couleur='steelblue') 2243 + GMM_histo(c4_o4, "C4-O4", hist=False, col='steelblue')
2210 axes=plt.gca() 2244 axes=plt.gca()
2211 #axes.set_xlim(1, 2) 2245 #axes.set_xlim(1, 2)
2212 axes.set_ylim(0, 100) 2246 axes.set_ylim(0, 100)
2213 - plt.xlabel("Distance (Angström)") 2247 + plt.xlabel("Distance (Angströms")
2214 - plt.title("GMM des distances entre atomes des cycles pyrimidines", fontsize=10) 2248 + plt.title("GMM of distances between atoms of the pyrimidine cycles", fontsize=10)
2215 - plt.savefig(runDir + "/results/figures/GMM/all-atoms/distances/pyrimidines/" + "GMM des distances entre atomes des cycles pyrimidines.png") 2249 + plt.savefig(runDir + "/results/figures/GMM/all-atoms/distances/pyrimidines/" + "GMM_distances_pyrimidine_cycles.png")
2216 plt.close() 2250 plt.close()
2217 2251
2218 os.chdir(runDir) 2252 os.chdir(runDir)
...@@ -2268,16 +2302,16 @@ def gmm_aa_torsions(): ...@@ -2268,16 +2302,16 @@ def gmm_aa_torsions():
2268 GMM_histo(zeta, "Zeta", toric=True) 2302 GMM_histo(zeta, "Zeta", toric=True)
2269 GMM_histo(chi, "Xhi", toric=True) 2303 GMM_histo(chi, "Xhi", toric=True)
2270 2304
2271 - GMM_histo(alpha, "Alpha", toric=True, hist=False, couleur='red') 2305 + GMM_histo(alpha, "Alpha", toric=True, hist=False, col='red')
2272 - GMM_histo(beta, "Beta", toric=True, hist=False, couleur='firebrick') 2306 + GMM_histo(beta, "Beta", toric=True, hist=False, col='firebrick')
2273 - GMM_histo(gamma, "Gamma", toric=True, hist=False, couleur='limegreen') 2307 + GMM_histo(gamma, "Gamma", toric=True, hist=False, col='limegreen')
2274 - GMM_histo(delta, "Delta", toric=True, hist=False, couleur='darkslateblue') 2308 + GMM_histo(delta, "Delta", toric=True, hist=False, col='darkslateblue')
2275 - GMM_histo(epsilon, "Epsilon", toric=True, hist=False, couleur='goldenrod') 2309 + GMM_histo(epsilon, "Epsilon", toric=True, hist=False, col='goldenrod')
2276 - GMM_histo(zeta, "Zeta", toric=True, hist=False, couleur='teal') 2310 + GMM_histo(zeta, "Zeta", toric=True, hist=False, col='teal')
2277 - GMM_histo(chi, "Xhi", toric=True, hist=False, couleur='hotpink') 2311 + GMM_histo(chi, "Xhi", toric=True, hist=False, col='hotpink')
2278 - plt.xlabel("Angle(Degré)") 2312 + plt.xlabel("Angle (Degrees)")
2279 - plt.title("GMM des angles de torsion") 2313 + plt.title("GMM of torsion angles")
2280 - plt.savefig("GMM des angles de torsion.png") 2314 + plt.savefig("GMM_torsions.png")
2281 plt.close() 2315 plt.close()
2282 2316
2283 os.chdir(runDir) 2317 os.chdir(runDir)
...@@ -2304,17 +2338,17 @@ def gmm_wadley(): ...@@ -2304,17 +2338,17 @@ def gmm_wadley():
2304 GMM_histo(p_c1p, "P-C4'") 2338 GMM_histo(p_c1p, "P-C4'")
2305 GMM_histo(c1p_p, "C4'-P") 2339 GMM_histo(c1p_p, "C4'-P")
2306 2340
2307 - GMM_histo(p_c1p, "P-C4'", toric=False, hist=False, couleur='gold') 2341 + GMM_histo(p_c1p, "P-C4'", toric=False, hist=False, col='gold')
2308 - GMM_histo(c1p_p, "C4'-P", toric=False, hist=False, couleur='indigo') 2342 + GMM_histo(c1p_p, "C4'-P", toric=False, hist=False, col='indigo')
2309 - GMM_histo(p_c1p, "P-C1'", toric=False, hist=False, couleur='firebrick') 2343 + GMM_histo(p_c1p, "P-C1'", toric=False, hist=False, col='firebrick')
2310 - GMM_histo(c1p_p, "C1'-P", toric=False, hist=False, couleur='seagreen') 2344 + GMM_histo(c1p_p, "C1'-P", toric=False, hist=False, col='seagreen')
2311 - plt.xlabel("Distance(Angström)") 2345 + plt.xlabel("Distance (Angströms)")
2312 - plt.title("GMM des distances (Pyle model)") 2346 + plt.title("GMM of distances (Pyle model)")
2313 - plt.savefig("GMM des distances (Pyle model).png") 2347 + plt.savefig("GMM_distances_pyle_model.png")
2314 plt.close() 2348 plt.close()
2315 2349
2316 # Flat Angles 2350 # Flat Angles
2317 - df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/angles/angles_plans_wadley.csv")) 2351 + df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/angles/flat_angles_pyle.csv"))
2318 2352
2319 p_c1p_psuiv = list(df["P-C1'-P°"][~ np.isnan(df["P-C1'-P°"])]) 2353 p_c1p_psuiv = list(df["P-C1'-P°"][~ np.isnan(df["P-C1'-P°"])])
2320 c1p_psuiv_c1psuiv = list(df["C1'-P°-C1'°"][~ np.isnan(df["C1'-P°-C1'°"])]) 2354 c1p_psuiv_c1psuiv = list(df["C1'-P°-C1'°"][~ np.isnan(df["C1'-P°-C1'°"])])
...@@ -2326,11 +2360,11 @@ def gmm_wadley(): ...@@ -2326,11 +2360,11 @@ def gmm_wadley():
2326 GMM_histo(p_c1p_psuiv, "P-C1'-P°", toric=True) 2360 GMM_histo(p_c1p_psuiv, "P-C1'-P°", toric=True)
2327 GMM_histo(c1p_psuiv_c1psuiv, "C1'-P°-C1'°", toric=True) 2361 GMM_histo(c1p_psuiv_c1psuiv, "C1'-P°-C1'°", toric=True)
2328 2362
2329 - GMM_histo(p_c1p_psuiv, "P-C1'-P°", toric=True, hist=False, couleur='firebrick') 2363 + GMM_histo(p_c1p_psuiv, "P-C1'-P°", toric=True, hist=False, col='firebrick')
2330 - GMM_histo(c1p_psuiv_c1psuiv, "C1'-P°-C1'°", toric=True, hist=False, couleur='seagreen') 2364 + GMM_histo(c1p_psuiv_c1psuiv, "C1'-P°-C1'°", toric=True, hist=False, col='seagreen')
2331 - plt.xlabel("Angle(Degré)") 2365 + plt.xlabel("Angle (Degrees)")
2332 - plt.title("GMM des angles plans (Pyle model)") 2366 + plt.title("GMM of flat angles (Pyle model)")
2333 - plt.savefig("GMM des angles plans (Pyle model).png") 2367 + plt.savefig("GMM_flat_angles_pyle_model.png")
2334 plt.close() 2368 plt.close()
2335 2369
2336 # Torsion anfles 2370 # Torsion anfles
...@@ -2367,15 +2401,15 @@ def gmm_wadley(): ...@@ -2367,15 +2401,15 @@ def gmm_wadley():
2367 GMM_histo(eta_base, "Eta''", toric=True) 2401 GMM_histo(eta_base, "Eta''", toric=True)
2368 GMM_histo(theta_base, "Theta''", toric=True) 2402 GMM_histo(theta_base, "Theta''", toric=True)
2369 2403
2370 - GMM_histo(eta, "Eta", toric=True, hist=False, couleur='mediumaquamarine') 2404 + GMM_histo(eta, "Eta", toric=True, hist=False, col='mediumaquamarine')
2371 - GMM_histo(theta, "Theta", toric=True, hist=False, couleur='darkorchid') 2405 + GMM_histo(theta, "Theta", toric=True, hist=False, col='darkorchid')
2372 - GMM_histo(eta_prime, "Eta'", toric=True, hist=False, couleur='cyan') 2406 + GMM_histo(eta_prime, "Eta'", toric=True, hist=False, col='cyan')
2373 - GMM_histo(theta_prime, "Theta'", toric=True, hist=False, couleur='crimson') 2407 + GMM_histo(theta_prime, "Theta'", toric=True, hist=False, col='crimson')
2374 - GMM_histo(eta_base, "Eta''", toric=True, hist=False, couleur='royalblue') 2408 + GMM_histo(eta_base, "Eta''", toric=True, hist=False, col='royalblue')
2375 - GMM_histo(theta_base, "Theta''", toric=True, hist=False, couleur='palevioletred') 2409 + GMM_histo(theta_base, "Theta''", toric=True, hist=False, col='palevioletred')
2376 - plt.xlabel("Angle(Degré)") 2410 + plt.xlabel("Angle (Degrees)")
2377 - plt.title("GMM des angles de pseudotorsion") 2411 + plt.title("GMM of pseudo-torsion angles (Pyle Model)")
2378 - plt.savefig("GMM des angles de pseudotorsion.png") 2412 + plt.savefig("GMM_pseudotorsion_angles_pyle_model.png")
2379 plt.close() 2413 plt.close()
2380 2414
2381 os.chdir(runDir) 2415 os.chdir(runDir)
...@@ -2411,18 +2445,18 @@ def gmm_hrna(): ...@@ -2411,18 +2445,18 @@ def gmm_hrna():
2411 GMM_histo(p_o5p, "P-O5'") 2445 GMM_histo(p_o5p, "P-O5'")
2412 GMM_histo(last_c4p_p, "C4'-P") 2446 GMM_histo(last_c4p_p, "C4'-P")
2413 2447
2414 - GMM_histo(o5p_c5p, "O5'-C5'", toric=False, hist=False, couleur='lightcoral') 2448 + GMM_histo(o5p_c5p, "O5'-C5'", toric=False, hist=False, col='lightcoral')
2415 - GMM_histo(b1_b2, "B1-B2", toric=False, hist=False, couleur='limegreen') 2449 + GMM_histo(b1_b2, "B1-B2", toric=False, hist=False, col='limegreen')
2416 - GMM_histo(c1p_b1, "C1'-B1", toric=False, hist=False, couleur='tomato') 2450 + GMM_histo(c1p_b1, "C1'-B1", toric=False, hist=False, col='tomato')
2417 - GMM_histo(c5p_c4p, "C5'-C4'", toric=False, hist=False, couleur='aquamarine') 2451 + GMM_histo(c5p_c4p, "C5'-C4'", toric=False, hist=False, col='aquamarine')
2418 - GMM_histo(c4p_c1p, "C4'-C1'", toric=False, hist=False, couleur='goldenrod') 2452 + GMM_histo(c4p_c1p, "C4'-C1'", toric=False, hist=False, col='goldenrod')
2419 - GMM_histo(p_o5p, "P-O5'", toric=False, hist=False, couleur='darkcyan') 2453 + GMM_histo(p_o5p, "P-O5'", toric=False, hist=False, col='darkcyan')
2420 - GMM_histo(last_c4p_p, "C4'-P", toric=False, hist=False, couleur='deeppink') 2454 + GMM_histo(last_c4p_p, "C4'-P", toric=False, hist=False, col='deeppink')
2421 axes = plt.gca() 2455 axes = plt.gca()
2422 axes.set_ylim(0, 100) 2456 axes.set_ylim(0, 100)
2423 - plt.xlabel("Distance (Angström)") 2457 + plt.xlabel("Distance (Angströms)")
2424 - plt.title("GMM des distances entre atomes HiRE-RNA") 2458 + plt.title("GMM of distances between HiRE-RNA beads")
2425 - plt.savefig(runDir + "/results/figures/GMM/HiRE-RNA/distances/GMM des distances entre atomes HiRE-RNA.png") 2459 + plt.savefig(runDir + "/results/figures/GMM/HiRE-RNA/distances/GMM_distances_HiRE_RNA.png")
2426 plt.close() 2460 plt.close()
2427 2461
2428 # Angles 2462 # Angles
...@@ -2449,19 +2483,19 @@ def gmm_hrna(): ...@@ -2449,19 +2483,19 @@ def gmm_hrna():
2449 GMM_histo(c4p_c1p_b1, "C4'-C1'-B1", toric=True) 2483 GMM_histo(c4p_c1p_b1, "C4'-C1'-B1", toric=True)
2450 GMM_histo(c1p_b1_b2, "C1'-B1-B2", toric=True) 2484 GMM_histo(c1p_b1_b2, "C1'-B1-B2", toric=True)
2451 2485
2452 - GMM_histo(lastc4p_p_o5p, "C4'-P-O5'", toric=True, hist=False, couleur='lightcoral') 2486 + GMM_histo(lastc4p_p_o5p, "C4'-P-O5'", toric=True, hist=False, col='lightcoral')
2453 - GMM_histo(lastc1p_lastc4p_p, "C1'-C4'-P", toric=True, hist=False, couleur='limegreen') 2487 + GMM_histo(lastc1p_lastc4p_p, "C1'-C4'-P", toric=True, hist=False, col='limegreen')
2454 - GMM_histo(lastc5p_lastc4p_p, "C5'-C4'-P", toric=True, hist=False, couleur='tomato') 2488 + GMM_histo(lastc5p_lastc4p_p, "C5'-C4'-P", toric=True, hist=False, col='tomato')
2455 - GMM_histo(p_o5p_c5p, "P-O5'-C5'", toric=True, hist=False, couleur='aquamarine') 2489 + GMM_histo(p_o5p_c5p, "P-O5'-C5'", toric=True, hist=False, col='aquamarine')
2456 - GMM_histo(o5p_c5p_c4p, "O5'-C5'-C4'", toric=True, hist=False, couleur='goldenrod') 2490 + GMM_histo(o5p_c5p_c4p, "O5'-C5'-C4'", toric=True, hist=False, col='goldenrod')
2457 - GMM_histo(c5p_c4p_c1p, "C5'-C4'-C1'", toric=True, hist=False, couleur='darkcyan') 2491 + GMM_histo(c5p_c4p_c1p, "C5'-C4'-C1'", toric=True, hist=False, col='darkcyan')
2458 - GMM_histo(c4p_c1p_b1, "C4'-C1'-B1", toric=True, hist=False, couleur='deeppink') 2492 + GMM_histo(c4p_c1p_b1, "C4'-C1'-B1", toric=True, hist=False, col='deeppink')
2459 - GMM_histo(c1p_b1_b2, "C1'-B1-B2", toric=True, hist=False, couleur='indigo') 2493 + GMM_histo(c1p_b1_b2, "C1'-B1-B2", toric=True, hist=False, col='indigo')
2460 axes = plt.gca() 2494 axes = plt.gca()
2461 axes.set_ylim(0, 100) 2495 axes.set_ylim(0, 100)
2462 - plt.xlabel("Angle (Degré)") 2496 + plt.xlabel("Angle (Degres)")
2463 - plt.title("GMM des angles entre atomes HiRE-RNA") 2497 + plt.title("GMM of angles between HiRE-RNA beads")
2464 - plt.savefig(runDir + "/results/figures/GMM/HiRE-RNA/angles/GMM des angles entre atomes HiRE-RNA.png") 2498 + plt.savefig(runDir + "/results/figures/GMM/HiRE-RNA/angles/GMM_angles_HiRE_RNA.png")
2465 plt.close() 2499 plt.close()
2466 2500
2467 # Torsions 2501 # Torsions
...@@ -2488,24 +2522,24 @@ def gmm_hrna(): ...@@ -2488,24 +2522,24 @@ def gmm_hrna():
2488 GMM_histo(c4_psuiv_o5suiv_c5suiv, "C4'-P°-O5'°-C5'°", toric=True) 2522 GMM_histo(c4_psuiv_o5suiv_c5suiv, "C4'-P°-O5'°-C5'°", toric=True)
2489 GMM_histo(c1_c4_psuiv_o5suiv, "C1'-C4'-P°-O5'°", toric=True) 2523 GMM_histo(c1_c4_psuiv_o5suiv, "C1'-C4'-P°-O5'°", toric=True)
2490 2524
2491 - GMM_histo(p_o5_c5_c4, "P-O5'-C5'-C4'", toric=True, hist=False, couleur='darkred') 2525 + GMM_histo(p_o5_c5_c4, "P-O5'-C5'-C4'", toric=True, hist=False, col='darkred')
2492 - GMM_histo(o5_c5_c4_c1, "O5'-C5'-C4'-C1'", toric=True, hist=False, couleur='chocolate') 2526 + GMM_histo(o5_c5_c4_c1, "O5'-C5'-C4'-C1'", toric=True, hist=False, col='chocolate')
2493 - GMM_histo(c5_c4_c1_b1, "C5'-C4'-C1'-B1", toric=True, hist=False, couleur='mediumvioletred') 2527 + GMM_histo(c5_c4_c1_b1, "C5'-C4'-C1'-B1", toric=True, hist=False, col='mediumvioletred')
2494 - GMM_histo(c4_c1_b1_b2, "C4'-C1'-B1-B2", toric=True, hist=False, couleur='cadetblue') 2528 + GMM_histo(c4_c1_b1_b2, "C4'-C1'-B1-B2", toric=True, hist=False, col='cadetblue')
2495 - GMM_histo(o5_c5_c4_psuiv, "O5'-C5'-C4'-P°", toric=True, hist=False, couleur='darkkhaki') 2529 + GMM_histo(o5_c5_c4_psuiv, "O5'-C5'-C4'-P°", toric=True, hist=False, col='darkkhaki')
2496 - GMM_histo(c5_c4_psuiv_o5suiv, "C5'-C4'-P°-O5'°", toric=True, hist=False, couleur='springgreen') 2530 + GMM_histo(c5_c4_psuiv_o5suiv, "C5'-C4'-P°-O5'°", toric=True, hist=False, col='springgreen')
2497 - GMM_histo(c4_psuiv_o5suiv_c5suiv, "C4'-P°-O5'°-C5'°", toric=True, hist=False, couleur='indigo') 2531 + GMM_histo(c4_psuiv_o5suiv_c5suiv, "C4'-P°-O5'°-C5'°", toric=True, hist=False, col='indigo')
2498 - GMM_histo(c1_c4_psuiv_o5suiv, "C1'-C4'-P°-O5'°", toric=True, hist=False, couleur='gold') 2532 + GMM_histo(c1_c4_psuiv_o5suiv, "C1'-C4'-P°-O5'°", toric=True, hist=False, col='gold')
2499 - plt.xlabel("Angle(Degré)") 2533 + plt.xlabel("Angle (Degrees)")
2500 - plt.title("GMM des angles de torsion (hire-RNA)") 2534 + plt.title("GMM of torsion angles between HiRE-RNA beads")
2501 - plt.savefig("GMM des angles de torsion (hire-RNA).png") 2535 + plt.savefig("GMM_torsions_HiRE_RNA.png")
2502 plt.close() 2536 plt.close()
2503 2537
2504 os.chdir(runDir) 2538 os.chdir(runDir)
2505 setproctitle("GMM (HiRE-RNA) finished") 2539 setproctitle("GMM (HiRE-RNA) finished")
2506 2540
2507 @trace_unhandled_exceptions 2541 @trace_unhandled_exceptions
2508 -def gmm_hrna_basepair_type(type_LW, angle_1, angle_2, angle_3, angle_4, distance): 2542 +def gmm_hrna_basepair_type(type_LW, ntpair, data):
2509 """ 2543 """
2510 function to plot the statistical figures you want 2544 function to plot the statistical figures you want
2511 By type of pairing: 2545 By type of pairing:
...@@ -2520,196 +2554,116 @@ def gmm_hrna_basepair_type(type_LW, angle_1, angle_2, angle_3, angle_4, distance ...@@ -2520,196 +2554,116 @@ def gmm_hrna_basepair_type(type_LW, angle_1, angle_2, angle_3, angle_4, distance
2520 plt.gcf().subplots_adjust(left = 0.1, bottom = 0.1, right = 0.9, top = 0.9, wspace = 0, hspace = 0.5) 2554 plt.gcf().subplots_adjust(left = 0.1, bottom = 0.1, right = 0.9, top = 0.9, wspace = 0, hspace = 0.5)
2521 2555
2522 plt.subplot(2, 1, 1) 2556 plt.subplot(2, 1, 1)
2523 - 2557 + GMM_histo(data["211_angle"], f"{type_LW}_{ntpair}_C1'-B1-B1pair", toric=True, hist=False, col='cyan' )
2524 - if len(angle_1) > 0 : 2558 + GMM_histo(data["112_angle"], f"{type_LW}_{ntpair}_B1-B1pair-C1'pair", toric=True, hist=False, col='magenta')
2525 - GMM_histo(angle_1, "C4'-C1'-B1", toric=True, hist=False, couleur='cyan' ) 2559 + GMM_histo(data["3211_torsion"], f"{type_LW}_{ntpair}_C4'-C1'-B1-B1pair", toric=True, hist=False, col='black' )
2526 - if len(angle_2) > 0 : 2560 + GMM_histo(data["1123_torsion"], f"{type_LW}_{ntpair}_B1-B1pair-C1'pair-C4'pair", toric=True, hist=False, col='maroon')
2527 - GMM_histo(angle_2, "C1'-B1-B1pair", toric=True, hist=False, couleur='magenta') 2561 + GMM_histo(data["alpha1"], f"{type_LW}_{ntpair}_alpha_1", toric=True, hist=False, col="yellow")
2528 - if len(angle_3) > 0 : 2562 + GMM_histo(data["alpha2"], f"{type_LW}_{ntpair}_alpha_2", toric=True, hist=False, col='olive')
2529 - GMM_histo(angle_3, "B1-B1pair-C1'pair", toric=True, hist=False, couleur="yellow") 2563 + plt.xlabel("Angle (degree)")
2530 - if len(angle_4) > 0 : 2564 + plt.title(f"GMM of plane angles for {type_LW} {ntpair} basepairs", fontsize=10)
2531 - GMM_histo(angle_4, "B1pair-C1'pair-C4'pair", toric=True, hist=False, couleur='olive')
2532 - plt.xlabel("Angle(degré)")
2533 - plt.title("GMM des angles plans pour les measure_hrna_basepairs " +type_LW , fontsize=10)
2534 2565
2535 plt.subplot(2, 1, 2) 2566 plt.subplot(2, 1, 2)
2536 - if len(distance)>0 : 2567 + GMM_histo(data["Distance"], f"Distance between {type_LW} {ntpair} tips", toric=False, hist=False, col="cyan")
2537 - GMM_histo(distance, "Distance pointes " + type_LW, save=False) 2568 + GMM_histo(data["dB1"], f"{type_LW} {ntpair} dB1", toric=False, hist=False, col="tomato")
2538 - 2569 + GMM_histo(data["dB2"], f"{type_LW} {ntpair} dB2", toric=False, hist=False, col="goldenrod")
2539 - plt.savefig("Mesures measure_hrna_basepairs " +type_LW+ ".png" ) 2570 + plt.xlabel("Distance (Angströms)")
2571 + plt.title(f"GMM of distances for {type_LW} {ntpair} basepairs", fontsize=10)
2572 +
2573 + plt.savefig(f"{type_LW}_{ntpair}_basepairs.png" )
2540 plt.close() 2574 plt.close()
2541 - setproctitle(f"GMM (HiRE-RNA {type_LW} basepairs) finished") 2575 + setproctitle(f"GMM (HiRE-RNA {type_LW} {ntpair} basepairs) finished")
2542 2576
2543 @trace_unhandled_exceptions 2577 @trace_unhandled_exceptions
2544 def gmm_hrna_basepairs(): 2578 def gmm_hrna_basepairs():
2545 2579
2546 setproctitle("GMM (HiRE-RNA basepairs)") 2580 setproctitle("GMM (HiRE-RNA basepairs)")
2547 2581
2548 - df=pd.read_csv(os.path.abspath(runDir + "/results/geometry/HiRE-RNA/basepairs/basepairs.csv")) 2582 + df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/HiRE-RNA/basepairs/basepairs.csv"))
2549 - 2583 +
2550 - cWW=df[df['type LW']=='cWW'] 2584 + lw = ["cWW", "tWW", "cWH", "tWH", "cHW", "tHW", "cWS", "tWS", "cSW", "tSW", "cHH", "tHH", "cSH", "tSH", "cHS", "tHS", "cSS", "tSS"]
2551 - cWW_dist=list(cWW["Distance"])
2552 - cWW_angle_1=list(cWW["C4'-C1'-B1"])
2553 - cWW_angle_2=list(cWW["C1'-B1-B1pair"])
2554 - cWW_angle_3=list(cWW["B1-B1pair-C1'pair"])
2555 - cWW_angle_4=list(cWW["B1pair-C1'pair-C4'pair"])
2556 - tWW=df[df['type LW']=='tWW']
2557 - tWW_dist=list(tWW["Distance"])
2558 - tWW_angle_1=list(tWW["C4'-C1'-B1"])
2559 - tWW_angle_2=list(tWW["C1'-B1-B1pair"])
2560 - tWW_angle_3=list(tWW["B1-B1pair-C1'pair"])
2561 - tWW_angle_4=list(tWW["B1pair-C1'pair-C4'pair"])
2562 - cWH=df[df['type LW']=='cWH']
2563 - cWH_dist=list(cWH["Distance"])
2564 - cWH_angle_1=list(cWH["C4'-C1'-B1"])
2565 - cWH_angle_2=list(cWH["C1'-B1-B1pair"])
2566 - cWH_angle_3=list(cWH["B1-B1pair-C1'pair"])
2567 - cWH_angle_4=list(cWH["B1pair-C1'pair-C4'pair"])
2568 - tWH=df[df['type LW']=='tWH']
2569 - tWH_dist=list(tWH["Distance"])
2570 - tWH_angle_1=list(tWH["C4'-C1'-B1"])
2571 - tWH_angle_2=list(tWH["C1'-B1-B1pair"])
2572 - tWH_angle_3=list(tWH["B1-B1pair-C1'pair"])
2573 - tWH_angle_4=list(tWH["B1pair-C1'pair-C4'pair"])
2574 - cHW=df[df['type LW']=='cHW']
2575 - cHW_dist=list(cHW["Distance"])
2576 - cHW_angle_1=list(cHW["C4'-C1'-B1"])
2577 - cHW_angle_2=list(cHW["C1'-B1-B1pair"])
2578 - cHW_angle_3=list(cHW["B1-B1pair-C1'pair"])
2579 - cHW_angle_4=list(cHW["B1pair-C1'pair-C4'pair"])
2580 - tHW=df[df['type LW']=='tHW']
2581 - tHW_dist=list(tHW["Distance"])
2582 - tHW_angle_1=list(tHW["C4'-C1'-B1"])
2583 - tHW_angle_2=list(tHW["C1'-B1-B1pair"])
2584 - tHW_angle_3=list(tHW["B1-B1pair-C1'pair"])
2585 - tHW_angle_4=list(tHW["B1pair-C1'pair-C4'pair"])
2586 - cWS=df[df['type LW']=='cWS']
2587 - cWS_dist=list(cWS["Distance"])
2588 - cWS_angle_1=list(cWS["C4'-C1'-B1"])
2589 - cWS_angle_2=list(cWS["C1'-B1-B1pair"])
2590 - cWS_angle_3=list(cWS["B1-B1pair-C1'pair"])
2591 - cWS_angle_4=list(cWS["B1pair-C1'pair-C4'pair"])
2592 - tWS=df[df['type LW']=='tWS']
2593 - tWS_dist=list(tWS["Distance"])
2594 - tWS_angle_1=list(tWS["C4'-C1'-B1"])
2595 - tWS_angle_2=list(tWS["C1'-B1-B1pair"])
2596 - tWS_angle_3=list(tWS["B1-B1pair-C1'pair"])
2597 - tWS_angle_4=list(tWS["B1pair-C1'pair-C4'pair"])
2598 - cSW=df[df['type LW']=='cSW']
2599 - cSW_dist=list(cSW["Distance"])
2600 - cSW_angle_1=list(cSW["C4'-C1'-B1"])
2601 - cSW_angle_2=list(cSW["C1'-B1-B1pair"])
2602 - cSW_angle_3=list(cSW["B1-B1pair-C1'pair"])
2603 - cSW_angle_4=list(cSW["B1pair-C1'pair-C4'pair"])
2604 - tSW=df[df['type LW']=='tSW']
2605 - tSW_dist=list(tSW["Distance"])
2606 - tSW_angle_1=list(tSW["C4'-C1'-B1"])
2607 - tSW_angle_2=list(tSW["C1'-B1-B1pair"])
2608 - tSW_angle_3=list(tSW["B1-B1pair-C1'pair"])
2609 - tSW_angle_4=list(tSW["B1pair-C1'pair-C4'pair"])
2610 - cHH=df[df['type LW']=='cHH']
2611 - cHH_dist=list(cHH["Distance"])
2612 - cHH_angle_1=list(cHH["C4'-C1'-B1"])
2613 - cHH_angle_2=list(cHH["C1'-B1-B1pair"])
2614 - cHH_angle_3=list(cHH["B1-B1pair-C1'pair"])
2615 - cHH_angle_4=list(cHH["B1pair-C1'pair-C4'pair"])
2616 - tHH=df[df['type LW']=='tHH']
2617 - tHH_dist=list(tHH["Distance"])
2618 - tHH_angle_1=list(tHH["C4'-C1'-B1"])
2619 - tHH_angle_2=list(tHH["C1'-B1-B1pair"])
2620 - tHH_angle_3=list(tHH["B1-B1pair-C1'pair"])
2621 - tHH_angle_4=list(tHH["B1pair-C1'pair-C4'pair"])
2622 - cSH=df[df['type LW']=='cSH']
2623 - cSH_dist=list(cSH["Distance"])
2624 - cSH_angle_1=list(cSH["C4'-C1'-B1"])
2625 - cSH_angle_2=list(cSH["C1'-B1-B1pair"])
2626 - cSH_angle_3=list(cSH["B1-B1pair-C1'pair"])
2627 - cSH_angle_4=list(cSH["B1pair-C1'pair-C4'pair"])
2628 - tSH=df[df['type LW']=='tSH']
2629 - tSH_dist=list(tSH["Distance"])
2630 - tSH_angle_1=list(tSH["C4'-C1'-B1"])
2631 - tSH_angle_2=list(tSH["C1'-B1-B1pair"])
2632 - tSH_angle_3=list(tSH["B1-B1pair-C1'pair"])
2633 - tSH_angle_4=list(tSH["B1pair-C1'pair-C4'pair"])
2634 - cHS=df[df['type LW']=='cHS']
2635 - cHS_dist=list(cHS["Distance"])
2636 - cHS_angle_1=list(cHS["C4'-C1'-B1"])
2637 - cHS_angle_2=list(cHS["C1'-B1-B1pair"])
2638 - cHS_angle_3=list(cHS["B1-B1pair-C1'pair"])
2639 - cHS_angle_4=list(cHS["B1pair-C1'pair-C4'pair"])
2640 - tHS=df[df['type LW']=='tHS']
2641 - tHS_dist=list(tHS["Distance"])
2642 - tHS_angle_1=list(tHS["C4'-C1'-B1"])
2643 - tHS_angle_2=list(tHS["C1'-B1-B1pair"])
2644 - tHS_angle_3=list(tHS["B1-B1pair-C1'pair"])
2645 - tHS_angle_4=list(tHS["B1pair-C1'pair-C4'pair"])
2646 - cSS=df[df['type LW']=='cSS']
2647 - cSS_dist=list(cSS["Distance"])
2648 - cSS_angle_1=list(cSS["C4'-C1'-B1"])
2649 - cSS_angle_2=list(cSS["C1'-B1-B1pair"])
2650 - cSS_angle_3=list(cSS["B1-B1pair-C1'pair"])
2651 - cSS_angle_4=list(cSS["B1pair-C1'pair-C4'pair"])
2652 - tSS=df[df['type LW']=='tSS']
2653 - tSS_dist=list(tSS["Distance"])
2654 - tSS_angle_1=list(tSS["C4'-C1'-B1"])
2655 - tSS_angle_2=list(tSS["C1'-B1-B1pair"])
2656 - tSS_angle_3=list(tSS["B1-B1pair-C1'pair"])
2657 - tSS_angle_4=list(tSS["B1pair-C1'pair-C4'pair"])
2658 2585
2659 os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/basepairs/", exist_ok=True) 2586 os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/basepairs/", exist_ok=True)
2660 os.chdir(runDir + "/results/figures/GMM/HiRE-RNA/basepairs/") 2587 os.chdir(runDir + "/results/figures/GMM/HiRE-RNA/basepairs/")
2661 2588
2662 - gmm_hrna_basepair_type('cWW', cWW_angle_1, cWW_angle_2, cWW_angle_3, cWW_angle_4, cWW_dist) 2589 + for lw_type in lw:
2663 - gmm_hrna_basepair_type('tWW', tWW_angle_1, tWW_angle_2, tWW_angle_3, tWW_angle_4, tWW_dist) 2590 + data = df[df['type_LW'] == lw_type ]
2664 - gmm_hrna_basepair_type('cWH', cWH_angle_1, cWH_angle_2, cWH_angle_3, cWH_angle_4, cWH_dist) 2591 + if len(data):
2665 - gmm_hrna_basepair_type('tWH', tWH_angle_1, tWH_angle_2, tWH_angle_3, tWH_angle_4, tWH_dist) 2592 + for b1 in ['A','C','G','U']:
2666 - gmm_hrna_basepair_type('cHW', cHW_angle_1, cHW_angle_2, cHW_angle_3, cHW_angle_4, cHW_dist) 2593 + for b2 in ['A','C','G','U']:
2667 - gmm_hrna_basepair_type('tHW', tHW_angle_1, tHW_angle_2, tHW_angle_3, tHW_angle_4, tHW_dist) 2594 + thisbases = data[(data.nt1_res == b1)&(data.nt2_res == b2)]
2668 - gmm_hrna_basepair_type('tWS', tWS_angle_1, tWS_angle_2, tWS_angle_3, tWS_angle_4, tWS_dist) 2595 + if len(thisbases):
2669 - gmm_hrna_basepair_type('cWS', cWS_angle_1, cWS_angle_2, cWS_angle_3, cWS_angle_4, cWS_dist) 2596 + gmm_hrna_basepair_type(lw_type, b1+b2, thisbases)
2670 - gmm_hrna_basepair_type('tSW', tSW_angle_1, tSW_angle_2, tSW_angle_3, tSW_angle_4, tSW_dist) 2597 +
2671 - gmm_hrna_basepair_type('cSW', cSW_angle_1, cSW_angle_2, cSW_angle_3, cSW_angle_4, cSW_dist) 2598 + # colors = ['lightcoral', "lightseagreen", "black", "goldenrod", "olive", "steelblue", "silver", "deeppink", "navy",
2672 - gmm_hrna_basepair_type('cHH', cHH_angle_1, cHH_angle_2, cHH_angle_3, cHH_angle_4, cHH_dist) 2599 + # "sienna", "maroon", "orange", "mediumaquamarine", "tomato", "indigo", "orchid", "tan", "lime"]
2673 - gmm_hrna_basepair_type('tHH', tHH_angle_1, tHH_angle_2, tHH_angle_3, tHH_angle_4, tHH_dist) 2600 + # for lw_type, col in zip(lw, colors):
2674 - gmm_hrna_basepair_type('cSH', cSH_angle_1, cSH_angle_2, cSH_angle_3, cSH_angle_4, cSH_dist) 2601 + # data = df[df['type LW'] == lw_type]
2675 - gmm_hrna_basepair_type('tSH', tSH_angle_1, tSH_angle_2, tSH_angle_3, tSH_angle_4, tSH_dist) 2602 + # GMM_histo(data.Distance, lw_type, toric=False, hist=False, col=col)
2676 - gmm_hrna_basepair_type('cHS', cHS_angle_1, cHS_angle_2, cHS_angle_3, cHS_angle_4, cHS_dist) 2603 + # plt.xlabel('Distance (Angströms)')
2677 - gmm_hrna_basepair_type('tHS', tHS_angle_1, tHS_angle_2, tHS_angle_3, tHS_angle_4, tHS_dist) 2604 + # plt.title("GMM of distances between base tips ("+str(nt)+ " values)", fontsize=8)
2678 - gmm_hrna_basepair_type('cSS', cSS_angle_1, cSS_angle_2, cSS_angle_3, cSS_angle_4, cSS_dist) 2605 + # plt.savefig("distances_between_tips.png")
2679 - gmm_hrna_basepair_type('tSS', tSS_angle_1, tSS_angle_2, tSS_angle_3, tSS_angle_4, tSS_dist) 2606 + # plt.close()
2680 -
2681 - nc=len(cWW)+len(cHH)+len(cSS)+len(cWH)+len(cHW)+len(cWS)+len(cSW)+len(cHS)+len(cSH)
2682 - GMM_histo(cWW_dist, "cWW", toric=False, hist=False, couleur='lightcoral')
2683 - GMM_histo(cHH_dist, "cHH", toric=False, hist=False, couleur='lightseagreen')
2684 - GMM_histo(cSS_dist, "cSS", toric=False, hist=False, couleur='black')
2685 - GMM_histo(cWH_dist, "cWH", toric=False, hist=False, couleur='goldenrod')
2686 - GMM_histo(cHW_dist, "cHW", toric=False, hist=False, couleur='olive')
2687 - GMM_histo(cWS_dist, "cWS", toric=False, hist=False, couleur='steelblue')
2688 - GMM_histo(cSW_dist, "cSW", toric=False, hist=False, couleur='silver')
2689 - GMM_histo(cHS_dist, "cHS", toric=False, hist=False, couleur='deeppink')
2690 - GMM_histo(cSH_dist, "cSH", toric=False, hist=False, couleur='navy')
2691 - plt.xlabel('Distance (Angström)')
2692 - plt.title("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs cis ("+str(nc)+ " valeurs)", fontsize=8)
2693 - plt.savefig("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs cis (" +str(nc)+ " valeurs).png")
2694 - plt.close()
2695 -
2696 - nt=len(tWW)+len(tHH)+len(tSS)+len(tWH)+len(tHW)+len(tWS)+len(tSW)+len(tHS)+len(tSH)
2697 - GMM_histo(tWW_dist, "tWW", toric=False, hist=False, couleur='sienna')
2698 - GMM_histo(tHH_dist, "tHH", toric=False, hist=False, couleur='maroon')
2699 - GMM_histo(tSS_dist, "tSS", toric=False, hist=False, couleur='orange')
2700 - GMM_histo(tWH_dist, "tWH", toric=False, hist=False, couleur='mediumaquamarine')
2701 - GMM_histo(tHW_dist, "tHW", toric=False, hist=False, couleur='tomato')
2702 - GMM_histo(tWS_dist, "tWS", toric=False, hist=False, couleur='indigo')
2703 - GMM_histo(tSW_dist, "tSW", toric=False, hist=False, couleur='orchid')
2704 - GMM_histo(tHS_dist, "tHS", toric=False, hist=False, couleur='tan')
2705 - GMM_histo(tSH_dist, "tSH", toric=False, hist=False, couleur='lime')
2706 - plt.xlabel('Distance (Angström)')
2707 - plt.title("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs trans ("+str(nt)+ " valeurs)", fontsize=8)
2708 - plt.savefig("GMM des distances entre pointes des nucléotides pour les measure_hrna_basepairs trans (" +str(nt)+ " valeurs).png")
2709 - plt.close()
2710 2607
2711 os.chdir(runDir) 2608 os.chdir(runDir)
2712 setproctitle(f"GMM (HiRE-RNA basepairs) finished") 2609 setproctitle(f"GMM (HiRE-RNA basepairs) finished")
2610 +
2611 +def merge_jsons():
2612 + # All atom distances
2613 + bonds = ["O3'-P", "OP3-P", "P-OP1", "P-OP2", "P-O5'", "O5'-C5'", "C5'-C4'", "C4'-O4'", "C4'-C3'", "O4'-C1'", "C1'-C2'", "C2'-O2'", "C2'-C3'", "C3'-O3'", "C1'-N9",
2614 + "N9-C8", "C8-N7", "N7-C5", "C5-C6", "C6-O6", "C6-N6", "C6-N1", "N1-C2", "C2-N2", "C2-N3", "N3-C4", "C4-N9", "C4-C5",
2615 + "C1'-N1", "N1-C6", "C6-C5", "C5-C4", "C4-N3", "N3-C2", "C2-O2", "C2-N1", "C4-N4", "C4-O4"]
2616 + bonds = [ runDir + "/results/geometry/json/" + x + ".json" for x in bonds ]
2617 + concat_jsons(bonds, runDir + "/results/geometry/json/all_atom_distances.json")
2618 +
2619 +
2620 + # All atom torsions
2621 + torsions = ["Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Xhi", "Zeta"]
2622 + torsions = [ runDir + "/results/geometry/json/" + x + ".json" for x in torsions ]
2623 + concat_jsons(torsions, runDir + "/results/geometry/json/all_atom_torsions.json")
2624 +
2625 + # HiRE-RNA distances
2626 + hrnabonds = ["P-O5'", "O5'-C5'", "C5'-C4'", "C4'-C1'", "C1'-B1", "B1-B2", "C4'-P"]
2627 + hrnabonds = [ runDir + "/results/geometry/json/" + x + ".json" for x in hrnabonds ]
2628 + concat_jsons(hrnabonds, runDir + "/results/geometry/json/hirerna_distances.json")
2629 +
2630 + # HiRE-RNA angles
2631 + hrnaangles = ["P-O5'-C5'", "O5'-C5'-C4'", "C5'-C4'-C1'", "C4'-C1'-B1", "C1'-B1-B2", "C4'-P-O5'", "C5'-C4'-P", "C1'-C4'-P"]
2632 + hrnaangles = [ runDir + "/results/geometry/json/" + x + ".json" for x in hrnaangles ]
2633 + concat_jsons(hrnaangles, runDir + "/results/geometry/json/hirerna_angles.json")
2634 +
2635 + # HiRE-RNA torsions
2636 + hrnators = ["P-O5'-C5'-C4'", "O5'-C5'-C4'-C1'", "C5'-C4'-C1'-B1", "C4'-C1'-B1-B2", "C4'-P°-O5'°-C5'°", "C5'-C4'-P°-O5'°", "C1'-C4'-P°-O5'°", "O5'-C5'-C4'-P°"]
2637 + hrnators = [ runDir + "/results/geometry/json/" + x + ".json" for x in hrnators ]
2638 + concat_jsons(hrnators, runDir + "/results/geometry/json/hirerna_torsions.json")
2639 +
2640 + # HiRE-RNA basepairs
2641 + for nt1 in ['A', 'C', 'G', 'U']:
2642 + for nt2 in ['A', 'C', 'G', 'U']:
2643 + bps = glob.glob(runDir + f"/results/geometry/json/*{nt1}{nt2}*.json")
2644 + concat_jsons(bps, runDir + f"/results/geometry/json/hirerna_{nt1}{nt2}_basepairs.json")
2645 +
2646 + # Delete previous files
2647 + for f in bonds + torsions + hrnabonds + hrnaangles + hrnators:
2648 + try:
2649 + os.remove(f)
2650 + except FileNotFoundError:
2651 + pass
2652 + for f in glob.glob(runDir + "/results/geometry/json/t*.json"):
2653 + try:
2654 + os.remove(f)
2655 + except FileNotFoundError:
2656 + pass
2657 + for f in glob.glob(runDir + "/results/geometry/json/c*.json"):
2658 + try:
2659 + os.remove(f)
2660 + except FileNotFoundError:
2661 + pass
2662 + for f in glob.glob(runDir + "/results/geometry/json/Distance*.json"):
2663 + try:
2664 + os.remove(f)
2665 + except FileNotFoundError:
2666 + pass
2713 2667
2714 @trace_unhandled_exceptions 2668 @trace_unhandled_exceptions
2715 def concat_dataframes(fpath, outfilename): 2669 def concat_dataframes(fpath, outfilename):
...@@ -2735,6 +2689,23 @@ def concat_dataframes(fpath, outfilename): ...@@ -2735,6 +2689,23 @@ def concat_dataframes(fpath, outfilename):
2735 idxQueue.put(thr_idx) # replace the thread index in the queue 2689 idxQueue.put(thr_idx) # replace the thread index in the queue
2736 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") 2690 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
2737 2691
2692 +def concat_jsons(flist, outfilename):
2693 + """
2694 + Reads JSON files computed by the geometry jobs and merge them into a smaller
2695 + number of files
2696 + """
2697 +
2698 + result = []
2699 + for f in flist:
2700 + # if not path.isfile(f):
2701 + # continue:
2702 + with open(f, "rb") as infile:
2703 + result.append(json.load(infile))
2704 +
2705 + # write the files
2706 + with open(outfilename, 'w', encoding='utf-8') as f:
2707 + json.dump(result, f, indent=4)
2708 +
2738 def process_jobs(joblist): 2709 def process_jobs(joblist):
2739 """ 2710 """
2740 Starts a Pool to run the Job() objects in joblist. 2711 Starts a Pool to run the Job() objects in joblist.
...@@ -2759,7 +2730,6 @@ def process_jobs(joblist): ...@@ -2759,7 +2730,6 @@ def process_jobs(joblist):
2759 print("Something went wrong") 2730 print("Something went wrong")
2760 2731
2761 if __name__ == "__main__": 2732 if __name__ == "__main__":
2762 -
2763 os.makedirs(runDir + "/results/figures/", exist_ok=True) 2733 os.makedirs(runDir + "/results/figures/", exist_ok=True)
2764 2734
2765 # parse options 2735 # parse options
...@@ -2897,29 +2867,29 @@ if __name__ == "__main__": ...@@ -2897,29 +2867,29 @@ if __name__ == "__main__":
2897 2867
2898 # Do general family statistics 2868 # Do general family statistics
2899 2869
2900 - joblist.append(Job(function=stats_len)) # Computes figures about chain lengths 2870 + # joblist.append(Job(function=stats_len)) # Computes figures about chain lengths
2901 - joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families) 2871 + # joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families)
2902 - for f in famlist: 2872 + # for f in famlist:
2903 - joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database (intra-chain basepair types within a family) 2873 + # joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database (intra-chain basepair types within a family)
2904 - if f not in ignored: 2874 + # if f not in ignored:
2905 - joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families) 2875 + # joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families)
2906 2876
2907 2877
2908 # Do geometric measures on all chains 2878 # Do geometric measures on all chains
2909 2879
2910 - if n_unmapped_chains: 2880 + # if n_unmapped_chains:
2911 - os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True) 2881 + # os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True)
2912 - liste_struct=os.listdir(path_to_3D_data + "renumbered_rna_only") 2882 + # liste_struct = os.listdir(path_to_3D_data + "renumbered_rna_only")
2913 - f_prec = os.listdir(path_to_3D_data + "renumbered_rna_only")[0] 2883 + # if '4zdo_1_E.cif' in liste_struct:
2914 - if '4zdo_1_E.cif' in liste_struct: 2884 + # liste_struct.remove('4zdo_1_E.cif') # weird cases to remove for now
2915 - liste_struct.remove('4zdo_1_E.cif') # weird cases to remove for now 2885 + # if '4zdp_1_E.cif' in liste_struct:
2916 - if '4zdp_1_E.cif' in liste_struct: 2886 + # liste_struct.remove('4zdp_1_E.cif')
2917 - liste_struct.remove('4zdp_1_E.cif') 2887 + # for f in liste_struct:
2918 - for f in liste_struct: 2888 + # if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]):
2919 - joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances 2889 + # joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances
2920 2890
2921 2891
2922 - process_jobs(joblist) 2892 + # process_jobs(joblist)
2923 2893
2924 # Now process the memory-heavy tasks family by family 2894 # Now process the memory-heavy tasks family by family
2925 if DO_AVG_DISTANCE_MATRIX: 2895 if DO_AVG_DISTANCE_MATRIX:
...@@ -2935,26 +2905,26 @@ if __name__ == "__main__": ...@@ -2935,26 +2905,26 @@ if __name__ == "__main__":
2935 2905
2936 # finish the work after the parallel portions 2906 # finish the work after the parallel portions
2937 2907
2938 - per_chain_stats() # per chain base frequencies en basepair types 2908 + # per_chain_stats() # per chain base frequencies en basepair types
2939 - seq_idty() # identity matrices from pre-computed .npy matrices 2909 + # seq_idty() # identity matrices from pre-computed .npy matrices
2940 - stats_pairs() 2910 + # stats_pairs()
2941 2911
2942 if n_unmapped_chains: 2912 if n_unmapped_chains:
2943 - general_stats() 2913 + # general_stats()
2944 os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True) 2914 os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True)
2945 os.makedirs(runDir+"/results/geometry/json/", exist_ok=True) 2915 os.makedirs(runDir+"/results/geometry/json/", exist_ok=True)
2946 joblist = [] 2916 joblist = []
2947 - joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv'))) 2917 + # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv')))
2948 - if DO_HIRE_RNA_MEASURES: 2918 + # if DO_HIRE_RNA_MEASURES:
2949 - joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv'))) 2919 + # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv')))
2950 - joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/angles/', 'angles_hire_RNA.csv'))) 2920 + # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/angles/', 'angles_hire_RNA.csv')))
2951 - joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/torsions/', 'angles_torsion_hire_RNA.csv'))) 2921 + # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/torsions/', 'angles_torsion_hire_RNA.csv')))
2952 - joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/basepairs/', 'basepairs.csv'))) 2922 + # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/basepairs/', 'basepairs.csv')))
2953 - if DO_WADLEY_ANALYSIS: 2923 + # if DO_WADLEY_ANALYSIS:
2954 - joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/Pyle/distances/', 'distances_wadley.csv'))) 2924 + # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/Pyle/distances/', 'distances_wadley.csv')))
2955 - joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/Pyle/angles/', 'angles_plans_wadley.csv'))) 2925 + # joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/Pyle/angles/', 'flat_angles_pyle.csv')))
2956 - process_jobs(joblist) 2926 + # process_jobs(joblist)
2957 - joblist = [] 2927 + # joblist = []
2958 joblist.append(Job(function=gmm_aa_dists, args=())) 2928 joblist.append(Job(function=gmm_aa_dists, args=()))
2959 joblist.append(Job(function=gmm_aa_torsions, args=())) 2929 joblist.append(Job(function=gmm_aa_torsions, args=()))
2960 if DO_HIRE_RNA_MEASURES: 2930 if DO_HIRE_RNA_MEASURES:
...@@ -2964,4 +2934,5 @@ if __name__ == "__main__": ...@@ -2964,4 +2934,5 @@ if __name__ == "__main__":
2964 joblist.append(Job(function=gmm_wadley, args=())) 2934 joblist.append(Job(function=gmm_wadley, args=()))
2965 if len(joblist): 2935 if len(joblist):
2966 process_jobs(joblist) 2936 process_jobs(joblist)
2937 + merge_jsons()
2967 2938
......