Louis BECQUEY

script to measure angles + details

...@@ -4,6 +4,7 @@ log_of_the_run.sh ...@@ -4,6 +4,7 @@ log_of_the_run.sh
4 4
5 # results 5 # results
6 results/* 6 results/*
7 +archive/*
7 logs/* 8 logs/*
8 9
9 # temporary results files 10 # temporary results files
...@@ -13,4 +14,4 @@ esl* ...@@ -13,4 +14,4 @@ esl*
13 # environment stuff 14 # environment stuff
14 .vscode/ 15 .vscode/
15 *.pyc 16 *.pyc
16 -__pycache__/
...\ No newline at end of file ...\ No newline at end of file
17 +__pycache__/
......
...@@ -1206,7 +1206,7 @@ class Pipeline: ...@@ -1206,7 +1206,7 @@ class Pipeline:
1206 self.known_issues = [ x[:-1] for x in issues.readlines() if not '-' in x ] 1206 self.known_issues = [ x[:-1] for x in issues.readlines() if not '-' in x ]
1207 if self.USE_KNOWN_ISSUES: 1207 if self.USE_KNOWN_ISSUES:
1208 print("\t> Ignoring known issues:") 1208 print("\t> Ignoring known issues:")
1209 - print(" ".join(self.known_issues") 1209 + print(" ".join(self.known_issues))
1210 1210
1211 if self.HOMOLOGY: 1211 if self.HOMOLOGY:
1212 # Ask Rfam if some are mapped to Rfam families 1212 # Ask Rfam if some are mapped to Rfam families
...@@ -1263,8 +1263,7 @@ class Pipeline: ...@@ -1263,8 +1263,7 @@ class Pipeline:
1263 conn.close() 1263 conn.close()
1264 1264
1265 if self.SELECT_ONLY is not None: 1265 if self.SELECT_ONLY is not None:
1266 - self.update = [ 1266 + self.update = [ c for c in self.update if c.chain_label == self.SELECT_ONLY ]
1267 - c for c in self.update if c.chain_label == self.SELECT_ONLY]
1268 1267
1269 self.n_chains = len(self.update) 1268 self.n_chains = len(self.update)
1270 print(str(self.n_chains) + " RNA chains of interest.") 1269 print(str(self.n_chains) + " RNA chains of interest.")
......
...@@ -1611,3 +1611,15 @@ ...@@ -1611,3 +1611,15 @@
1611 7jz0_1_F 1611 7jz0_1_F
1612 7k00_1_5 1612 7k00_1_5
1613 7k00_1_B 1613 7k00_1_B
1614 +1qzb_1_B_1-73
1615 +1qza_1_B_1-73
1616 +5zzm_1_M_3-118
1617 +5zzm_1_N_1-2904
1618 +3dg2_1_B_1-2904
1619 +3dg0_1_B_1-2904
1620 +3dg4_1_B_1-2904
1621 +3dg5_1_B_1-2904
1622 +3dg2_1_A_1-1542
1623 +3dg0_1_A_1-1542
1624 +3dg4_1_A_1-1542
1625 +3dg5_1_A_1-1542
......
...@@ -4837,3 +4837,39 @@ Sequence is too short. (< 5 resolved nts) ...@@ -4837,3 +4837,39 @@ Sequence is too short. (< 5 resolved nts)
4837 5hjz_1_C 4837 5hjz_1_C
4838 Sequence is too short. (< 5 resolved nts) 4838 Sequence is too short. (< 5 resolved nts)
4839 4839
4840 +1qzb_1_B_1-73
4841 +DSSR warning 1qzb.json: no nucleotides found. Ignoring 1qzb_1_B_1-73.
4842 +
4843 +1qza_1_B_1-73
4844 +DSSR warning 1qza.json: no nucleotides found. Ignoring 1qza_1_B_1-73.
4845 +
4846 +5zzm_1_M_3-118
4847 +DSSR warning 5zzm.json: no nucleotides found. Ignoring 5zzm_1_M_3-118.
4848 +
4849 +5zzm_1_N_1-2904
4850 +DSSR warning 5zzm.json: no nucleotides found. Ignoring 5zzm_1_N_1-2904.
4851 +
4852 +3dg2_1_B_1-2904
4853 +DSSR warning 3dg2.json: no nucleotides found. Ignoring 3dg2_1_B_1-2904.
4854 +
4855 +3dg0_1_B_1-2904
4856 +DSSR warning 3dg0.json: no nucleotides found. Ignoring 3dg0_1_B_1-2904.
4857 +
4858 +3dg4_1_B_1-2904
4859 +DSSR warning 3dg4.json: no nucleotides found. Ignoring 3dg4_1_B_1-2904.
4860 +
4861 +3dg5_1_B_1-2904
4862 +DSSR warning 3dg5.json: no nucleotides found. Ignoring 3dg5_1_B_1-2904.
4863 +
4864 +3dg2_1_A_1-1542
4865 +DSSR warning 3dg2.json: no nucleotides found. Ignoring 3dg2_1_A_1-1542.
4866 +
4867 +3dg0_1_A_1-1542
4868 +DSSR warning 3dg0.json: no nucleotides found. Ignoring 3dg0_1_A_1-1542.
4869 +
4870 +3dg4_1_A_1-1542
4871 +DSSR warning 3dg4.json: no nucleotides found. Ignoring 3dg4_1_A_1-1542.
4872 +
4873 +3dg5_1_A_1-1542
4874 +DSSR warning 3dg5.json: no nucleotides found. Ignoring 3dg5_1_A_1-1542.
4875 +
......
1 +#!/usr/bin/python3.8
2 +
3 +# usage : pass as an argument a folder containing .cif files of RNA chains, like those produced by RNANet:
4 +# usage : ./measure_bonds_and_angles.py ~/Data/RNA/3D/rna_only
5 +# OR
6 +# usage : ./measure_bonds_and_angles.py ~/Data/RNA/3D/rna_mapped_to_Rfam
7 +
8 +from Bio.PDB import MMCIFParser
9 +from Bio.PDB.vectors import Vector, calc_angle
10 +from sys import argv
11 +from tqdm import tqdm
12 +import multiprocessing as mp
13 +import matplotlib.pyplot as plt
14 +import numpy as np
15 +import os, signal
16 +
17 +def measure_in_chain(f):
18 + mmcif_parser = MMCIFParser()
19 + s = mmcif_parser.get_structure('null', os.path.abspath(path_to_3D_data + f))
20 + chain = next(s[0].get_chains()) # Assume only one chain per .cif file
21 +
22 + c_to_p = []
23 + p_to_c = []
24 + c_p_c = []
25 + p_c_p = []
26 + last_p = None
27 + last_c = None
28 + nres = 0
29 + for res in chain:
30 + nres += 1
31 +
32 + # Get the new c1' and p atoms
33 + atom_c1p = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ]
34 + atom_p = [ atom.get_coord() for atom in res if atom.get_name() == "P"]
35 + if len(atom_c1p) + len(atom_p) != 2:
36 + last_c = None
37 + last_p = None
38 + continue
39 + atom_c1p = Vector(atom_c1p[0])
40 + atom_p = Vector(atom_p[0])
41 +
42 + if last_c is not None: # There was a previous residue
43 + # Get the C1'(i-1) -> P distance
44 + c_to_p.append((last_c - atom_p).norm()) # the C1'(i-1) -> P bond of the theta angle
45 +
46 + # Get the C1'(i-1)-P(i)-C1'(i) flat angle
47 + c_p_c.append(calc_angle(last_c, atom_p, atom_c1p))
48 +
49 + # Get the P(i-1)-C1'(i-1)-P(i) flat angle
50 + p_c_p.append(calc_angle(last_p, last_c, atom_p))
51 +
52 + p_to_c.append((atom_c1p - atom_p).norm()) # the P -> C1' bond of the eta angle
53 + last_c = atom_c1p
54 + last_p = atom_p
55 +
56 + c_to_p = np.array(c_to_p, dtype=np.float16)
57 + p_to_c = np.array(p_to_c, dtype=np.float16)
58 + c_p_c = np.array(c_p_c, dtype=np.float16)
59 + p_c_p = np.array(p_c_p, dtype=np.float16)
60 + c_to_p = c_to_p[~np.isnan(c_to_p)]
61 + p_to_c = p_to_c[~np.isnan(p_to_c)]
62 + c_p_c = c_p_c[~np.isnan(c_p_c)]
63 + p_c_p = p_c_p[~np.isnan(p_c_p)]
64 +
65 + return (c_to_p, p_to_c, c_p_c, p_c_p)
66 +
67 +def init_worker(tqdm_lock=None):
68 + signal.signal(signal.SIGINT, signal.SIG_IGN)
69 + if tqdm_lock is not None:
70 + tqdm.set_lock(tqdm_lock)
71 +
72 +def measure_all_dist_angles():
73 + path_to_3D_data = argv[1]
74 +
75 + if path_to_3D_data[-1] != '/':
76 + path_to_3D_data += '/'
77 +
78 + r_cp = np.array([], dtype=np.float16)
79 + r_pc = np.array([], dtype=np.float16)
80 + flat_angles_cpc = np.array([], dtype=np.float16)
81 + flat_angles_pcp = np.array([], dtype=np.float16)
82 + p = mp.Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=os.cpu_count())
83 + pbar = tqdm(total=len(os.listdir(path_to_3D_data)), desc="Scanning RNA chains", position=0, leave=True)
84 + try:
85 + nchains = 0
86 + for _, r in enumerate(p.imap_unordered(measure_in_chain, os.listdir(path_to_3D_data))):
87 + pbar.update(1)
88 + nchains += 1
89 + r_cp = np.hstack([r_cp, r[0]])
90 + r_pc = np.hstack([r_pc, r[1]])
91 + flat_angles_cpc = np.hstack([flat_angles_cpc, r[2]])
92 + flat_angles_pcp = np.hstack([flat_angles_pcp, r[3]])
93 + p.close()
94 + p.join()
95 + pbar.close()
96 + np.savez("measures.npz", c_p=r_cp, p_c=r_pc, c_p_c=flat_angles_pcp, p_c_p=flat_angles_pcp)
97 + except KeyboardInterrupt:
98 + print("Caught Ctrl-C, quitting")
99 + p.terminate()
100 + p.join()
101 + pbar.close()
102 + except Exception as e:
103 + print(e)
104 + p.terminate()
105 + p.join()
106 + pbar.close()
107 + np.savez("measures_incomplete.npz", c_p=r_cp, p_c=r_pc, c_p_c=flat_angles_pcp, p_c_p=flat_angles_pcp)
108 +
109 +
110 +if __name__ == "__main__":
111 + # Do the computations and save/reload the data
112 +
113 + # measure_all_dist_angles()
114 +
115 + d = np.load("measures.npz")
116 + c_p = d["c_p"]
117 + p_c = d["p_c"]
118 + p_c_p = d["p_c_p"]
119 + c_p_c = d["c_p_c"]
120 +
121 + # Plot stuff
122 + plt.figure(figsize=(6,4), dpi=300)
123 + plt.hist(c_p)
124 + plt.savefig("lengths.png")
125 + plt.close()
126 +
127 + # print(f"Final values: P->C1' is \033[32m{avg[0]/10:.3f} ± {avg[1]/10:.3f} nm\033[0m, "
128 + # f"C1'->P is \033[32m{avg[2]/10:.3f} ± {avg[3]/10:.3f} nm\033[0m, "
129 + # f"angles C-P-C \033[32m{avg[4]:.2f} ± {avg[5]:.2f}\033[0m and P-C-P \033[32m{avg[6]:.2f} ± {avg[7]:.2f}\033[0m")