Louis BECQUEY

Fixed crashes with --no-homology

...@@ -155,7 +155,7 @@ class Chain: ...@@ -155,7 +155,7 @@ class Chain:
155 """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file. 155 """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file.
156 """ 156 """
157 157
158 - if (self.pdb_end - self.pdb_start): 158 + if self.pdb_start is not None and (self.pdb_end - self.pdb_start):
159 status = f"Extract {self.pdb_start}-{self.pdb_end} atoms from {self.pdb_id}-{self.pdb_chain_id}" 159 status = f"Extract {self.pdb_start}-{self.pdb_end} atoms from {self.pdb_id}-{self.pdb_chain_id}"
160 self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif" 160 self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif"
161 else: 161 else:
...@@ -182,7 +182,7 @@ class Chain: ...@@ -182,7 +182,7 @@ class Chain:
182 # Extract the desired chain 182 # Extract the desired chain
183 c = s[model_idx][self.pdb_chain_id] 183 c = s[model_idx][self.pdb_chain_id]
184 184
185 - if (self.pdb_end - self.pdb_start): 185 + if self.pdb_start is not None and (self.pdb_end - self.pdb_start):
186 # # Pay attention to residue numbering 186 # # Pay attention to residue numbering
187 # first_number = c.child_list[0].get_id()[1] # the chain's first residue is numbered 'first_number' 187 # first_number = c.child_list[0].get_id()[1] # the chain's first residue is numbered 'first_number'
188 # if self.pdb_start < self.pdb_end: 188 # if self.pdb_start < self.pdb_end:
...@@ -309,6 +309,28 @@ class Chain: ...@@ -309,6 +309,28 @@ class Chain:
309 if df.iloc[0,0] != 1: 309 if df.iloc[0,0] != 1:
310 st = df.iloc[0,0] -1 310 st = df.iloc[0,0] -1
311 df.iloc[:, 0] -= st 311 df.iloc[:, 0] -= st
312 +
313 +
314 + # Find missing index_chain values because of resolved nucleotides that have a strange nt_resnum value
315 + # e.g. 4v49-AA, position 5'- 1003 -> 2003 -> 1004 - 3'
316 + diff = set(range(df.shape[0])).difference(df['index_chain'] - 1)
317 + for i in sorted(diff):
318 + # check if a nucleotide numbered +1000 exists
319 + looked_for = df[df.index_chain == i].nt_resnum.values[0]
320 + found = None
321 + for nt in nts:
322 + if nt['chain_name'] != self.pdb_chain_id:
323 + continue
324 + if nt['index_chain'] == i + 1 :
325 + found = nt
326 + break
327 + if found:
328 + df_row = pd.DataFrame([found], index=[i])[df.columns.values]
329 + df_row.iloc[0,1] = df.iloc[i,1]
330 + df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ])
331 + df.iloc[i+1:, 1] += 1
332 + else:
333 + warn(f"Missing index_chain {i} in {self.chain_label} !")
312 df = df.drop(df[df.index_chain < 0].index) # drop eventual ones with index_chain < the first residue (usually, ligands) 334 df = df.drop(df[df.index_chain < 0].index) # drop eventual ones with index_chain < the first residue (usually, ligands)
313 335
314 # Re-Assert some nucleotides still exist 336 # Re-Assert some nucleotides still exist
...@@ -352,6 +374,10 @@ class Chain: ...@@ -352,6 +374,10 @@ class Chain:
352 df = df.reset_index(drop=True) 374 df = df.reset_index(drop=True)
353 self.full_length = len(df.index_chain) 375 self.full_length = len(df.index_chain)
354 376
377 + #######################################
378 + # Compute new features
379 + #######################################
380 +
355 # Add a sequence column just for the alignments 381 # Add a sequence column just for the alignments
356 df['nt_align_code'] = [ str(x).upper() 382 df['nt_align_code'] = [ str(x).upper()
357 .replace('NAN', '-') # Unresolved nucleotides are gaps 383 .replace('NAN', '-') # Unresolved nucleotides are gaps
...@@ -360,11 +386,6 @@ class Chain: ...@@ -360,11 +386,6 @@ class Chain:
360 .replace('P', 'U') # Pseudo-uridines, but it is not really right to change them to U, see DSSR paper, Fig 2 386 .replace('P', 'U') # Pseudo-uridines, but it is not really right to change them to U, see DSSR paper, Fig 2
361 for x in df['nt_code'] ] 387 for x in df['nt_code'] ]
362 388
363 -
364 - #######################################
365 - # Compute new features
366 - #######################################
367 -
368 # One-hot encoding sequence 389 # One-hot encoding sequence
369 df["is_A"] = [ 1 if x=="A" else 0 for x in df["nt_code"] ] 390 df["is_A"] = [ 1 if x=="A" else 0 for x in df["nt_code"] ]
370 df["is_C"] = [ 1 if x=="C" else 0 for x in df["nt_code"] ] 391 df["is_C"] = [ 1 if x=="C" else 0 for x in df["nt_code"] ]
...@@ -464,6 +485,7 @@ class Chain: ...@@ -464,6 +485,7 @@ class Chain:
464 #################################### 485 ####################################
465 # Save everything to database 486 # Save everything to database
466 #################################### 487 ####################################
488 +
467 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: 489 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
468 # Register the chain in table chain 490 # Register the chain in table chain
469 if self.pdb_start is not None: 491 if self.pdb_start is not None:
...@@ -472,13 +494,28 @@ class Chain: ...@@ -472,13 +494,28 @@ class Chain:
472 VALUES 494 VALUES
473 (?, ?, ?, ?, ?, ?, ?, ?) 495 (?, ?, ?, ?, ?, ?, ?, ?)
474 ON CONFLICT(structure_id, chain_name, rfam_acc) DO 496 ON CONFLICT(structure_id, chain_name, rfam_acc) DO
475 - UPDATE SET pdb_start=excluded.pdb_start, pdb_end=excluded.pdb_end, reversed=excluded.reversed, inferred=excluded.inferred, issue=excluded.issue;""", 497 + UPDATE SET pdb_start=excluded.pdb_start,
476 - data=(str(self.pdb_id), str(self.pdb_chain_id), int(self.pdb_start), int(self.pdb_end), int(self.reversed), str(self.rfam_fam), int(self.inferred), int(self.delete_me))) 498 + pdb_end=excluded.pdb_end,
499 + reversed=excluded.reversed,
500 + inferred=excluded.inferred,
501 + issue=excluded.issue;""",
502 + data=(str(self.pdb_id), str(self.pdb_chain_id),
503 + int(self.pdb_start), int(self.pdb_end),
504 + int(self.reversed), str(self.rfam_fam),
505 + int(self.inferred), int(self.delete_me)))
477 # get the chain id 506 # get the chain id
478 - self.db_chain_id = sql_ask_database(conn, f"SELECT (chain_id) FROM chain WHERE structure_id='{self.pdb_id}' AND chain_name='{self.pdb_chain_id}' AND rfam_acc='{self.rfam_fam}';")[0][0] 507 + self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
508 + WHERE structure_id='{self.pdb_id}'
509 + AND chain_name='{self.pdb_chain_id}'
510 + AND rfam_acc='{self.rfam_fam}';""")[0][0]
479 else: 511 else:
480 - sql_execute(conn, "INSERT INTO chain (structure_id, chain_name, issue) VALUES (?, ?, ?) ON CONFLICT(structure_id, chain_name) DO UPDATE SET issue=excluded.issue;", data=(str(self.pdb_id), int(self.pdb_chain_id), int(self.delete_me))) 512 + sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, issue) VALUES (?, ?, NULL, ?)
481 - self.db_chain_id = sql_ask_database(conn, f"SELECT (chain_id) FROM chain WHERE structure_id='{self.pdb_id}' AND chain_name='{self.pdb_chain_id}' AND rfam_acc IS NULL;")[0][0] 513 + ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;""",
514 + data=(str(self.pdb_id), str(self.pdb_chain_id), int(self.delete_me)))
515 + self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
516 + WHERE structure_id='{self.pdb_id}'
517 + AND chain_name='{self.pdb_chain_id}'
518 + AND rfam_acc IS NULL;""")[0][0]
482 519
483 # Add the nucleotides 520 # Add the nucleotides
484 sql_execute(conn, f""" 521 sql_execute(conn, f"""
...@@ -492,7 +529,6 @@ class Chain: ...@@ -492,7 +529,6 @@ class Chain:
492 ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""", 529 ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
493 many=True, data=list(df.to_records(index=False)), warn_every=10) 530 many=True, data=list(df.to_records(index=False)), warn_every=10)
494 531
495 -
496 # Remove too short chains 532 # Remove too short chains
497 if self.length < 5: 533 if self.length < 5:
498 warn(f"{self.chain_label} sequence is too short, let's ignore it.\t", error=True) 534 warn(f"{self.chain_label} sequence is too short, let's ignore it.\t", error=True)
...@@ -1064,7 +1100,7 @@ class Pipeline: ...@@ -1064,7 +1100,7 @@ class Pipeline:
1064 if self.EXTRACT_CHAINS: 1100 if self.EXTRACT_CHAINS:
1065 if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"): 1101 if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
1066 os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") # for the portions mapped to Rfam 1102 os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") # for the portions mapped to Rfam
1067 - if not self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_only"): 1103 + if (not self.HOMOLOGY) and not path.isdir(path_to_3D_data + "rna_only"):
1068 os.makedirs(path_to_3D_data + "rna_only") # extract chains of pure RNA 1104 os.makedirs(path_to_3D_data + "rna_only") # extract chains of pure RNA
1069 1105
1070 # define and run jobs 1106 # define and run jobs
...@@ -1295,7 +1331,7 @@ class Pipeline: ...@@ -1295,7 +1331,7 @@ class Pipeline:
1295 r = sql_ask_database(conn, """SELECT DISTINCT chain_id, structure_id FROM chain WHERE structure_id NOT IN (SELECT DISTINCT pdb_id FROM structure);""") 1331 r = sql_ask_database(conn, """SELECT DISTINCT chain_id, structure_id FROM chain WHERE structure_id NOT IN (SELECT DISTINCT pdb_id FROM structure);""")
1296 if len(r) and r[0][0] is not None: 1332 if len(r) and r[0][0] is not None:
1297 warn("Chains without referenced structures have been detected") 1333 warn("Chains without referenced structures have been detected")
1298 - print(" ".join([x[1]+'-'+x[0] for x in r])) 1334 + print(" ".join([str(x[1])+'-'+str(x[0]) for x in r]))
1299 1335
1300 if self.HOMOLOGY: 1336 if self.HOMOLOGY:
1301 # check if chains have been re_mapped: 1337 # check if chains have been re_mapped:
...@@ -2187,6 +2223,7 @@ if __name__ == "__main__": ...@@ -2187,6 +2223,7 @@ if __name__ == "__main__":
2187 # At this point, the structure table is up to date 2223 # At this point, the structure table is up to date
2188 2224
2189 pp.build_chains(coeff_ncores=1.0) 2225 pp.build_chains(coeff_ncores=1.0)
2226 +
2190 if len(pp.to_retry): 2227 if len(pp.to_retry):
2191 # Redownload and re-annotate 2228 # Redownload and re-annotate
2192 print("> Retrying to annotate some structures which just failed.", flush=True) 2229 print("> Retrying to annotate some structures which just failed.", flush=True)
...@@ -2227,6 +2264,7 @@ if __name__ == "__main__": ...@@ -2227,6 +2264,7 @@ if __name__ == "__main__":
2227 pp.prepare_sequences() 2264 pp.prepare_sequences()
2228 pp.realign() 2265 pp.realign()
2229 2266
2267 +
2230 # At this point, the family table is up to date 2268 # At this point, the family table is up to date
2231 2269
2232 thr_idx_mgr = Manager() 2270 thr_idx_mgr = Manager()
......
...@@ -68,27 +68,30 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): ...@@ -68,27 +68,30 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
68 68
69 69
70 if not path.isfile(f"data/wadley_kernel_{angle}.npz"): 70 if not path.isfile(f"data/wadley_kernel_{angle}.npz"):
71 - conn = sqlite3.connect("results/RNANet.db") 71 + # Extract the angle values of c2'-endo and c3'-endo nucleotides
72 - df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn) 72 + with sqlite3.connect("results/RNANet.db") as conn:
73 - c2_endo_etas = df[angle].values.tolist() 73 + df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
74 - c2_endo_thetas = df["th"+angle].values.tolist() 74 + c2_endo_etas = df[angle].values.tolist()
75 - df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE form = '.' AND puckering="C3'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn) 75 + c2_endo_thetas = df["th"+angle].values.tolist()
76 - c3_endo_etas = df[angle].values.tolist() 76 + df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE form = '.' AND puckering="C3'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
77 - c3_endo_thetas = df["th"+angle].values.tolist() 77 + c3_endo_etas = df[angle].values.tolist()
78 - conn.close() 78 + c3_endo_thetas = df["th"+angle].values.tolist()
79 +
80 + # Create arrays with (x,y) coordinates of the points
81 + values_c3 = np.vstack([c3_endo_etas, c3_endo_thetas])
82 + values_c2 = np.vstack([c2_endo_etas, c2_endo_thetas])
83 +
84 + # Approximate the density by a gaussian kernel
85 + kernel_c3 = st.gaussian_kde(values_c3)
86 + kernel_c2 = st.gaussian_kde(values_c2)
79 87
88 + # Create 100x100 regular (x,y,z) values for the plot
80 xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j] 89 xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
81 positions = np.vstack([xx.ravel(), yy.ravel()]) 90 positions = np.vstack([xx.ravel(), yy.ravel()])
82 -
83 - values_c3 = np.vstack([c3_endo_etas, c3_endo_thetas])
84 - kernel_c3 = st.gaussian_kde(values_c3)
85 f_c3 = np.reshape(kernel_c3(positions).T, xx.shape) 91 f_c3 = np.reshape(kernel_c3(positions).T, xx.shape)
86 - values_c2 = np.vstack([c2_endo_etas, c2_endo_thetas])
87 - kernel_c2 = st.gaussian_kde(values_c2)
88 f_c2 = np.reshape(kernel_c2(positions).T, xx.shape) 92 f_c2 = np.reshape(kernel_c2(positions).T, xx.shape)
89 93
90 - 94 + # Save the data to an archive for later use without the need to recompute
91 - # Uncomment to save the data to an archive for later use without the need to recompute
92 np.savez(f"data/wadley_kernel_{angle}.npz", 95 np.savez(f"data/wadley_kernel_{angle}.npz",
93 c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas, 96 c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
94 c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas, 97 c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
...@@ -106,8 +109,10 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): ...@@ -106,8 +109,10 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
106 notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).") 109 notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).")
107 110
108 # exact counts: 111 # exact counts:
109 - hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1), range=[[0, 2*np.pi], [0, 2*np.pi]]) 112 + hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1),
110 - hist_c3, xedges, yedges = np.histogram2d(c3_endo_etas, c3_endo_thetas, bins=int(2*np.pi/0.1), range=[[0, 2*np.pi], [0, 2*np.pi]]) 113 + range=[[0, 2*np.pi], [0, 2*np.pi]])
114 + hist_c3, xedges, yedges = np.histogram2d(c3_endo_etas, c3_endo_thetas, bins=int(2*np.pi/0.1),
115 + range=[[0, 2*np.pi], [0, 2*np.pi]])
111 cmap = cm.get_cmap("jet") 116 cmap = cm.get_cmap("jet")
112 color_values = cmap(hist_c3.ravel()/hist_c3.max()) 117 color_values = cmap(hist_c3.ravel()/hist_c3.max())
113 118
...@@ -450,7 +455,7 @@ def seq_idty(): ...@@ -450,7 +455,7 @@ def seq_idty():
450 famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 1 ORDER BY rfam_acc ASC;") ] 455 famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 1 ORDER BY rfam_acc ASC;") ]
451 ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ] 456 ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
452 if len(ignored): 457 if len(ignored):
453 - print("Idty matrices: Ignoring families with only one chain:", " ".join(ignored)+'\n') 458 + print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
454 459
455 # compute distance matrices (or ignore if data/RF0****.npy exists) 460 # compute distance matrices (or ignore if data/RF0****.npy exists)
456 p = Pool(processes=8) 461 p = Pool(processes=8)
...@@ -476,7 +481,7 @@ def seq_idty(): ...@@ -476,7 +481,7 @@ def seq_idty():
476 conn.close() 481 conn.close()
477 482
478 # Plots plots plots 483 # Plots plots plots
479 - fig, axs = plt.subplots(5,13, figsize=(15,9)) 484 + fig, axs = plt.subplots(4,17, figsize=(17,5.75))
480 axs = axs.ravel() 485 axs = axs.ravel()
481 [axi.set_axis_off() for axi in axs] 486 [axi.set_axis_off() for axi in axs]
482 im = "" # Just to declare the variable, it will be set in the loop 487 im = "" # Just to declare the variable, it will be set in the loop
...@@ -495,7 +500,7 @@ def seq_idty(): ...@@ -495,7 +500,7 @@ def seq_idty():
495 D = D[idx1,:] 500 D = D[idx1,:]
496 D = D[:,idx1[::-1]] 501 D = D[:,idx1[::-1]]
497 im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D 502 im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
498 - ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)") 503 + ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
499 fig.tight_layout() 504 fig.tight_layout()
500 fig.subplots_adjust(wspace=0.1, hspace=0.3) 505 fig.subplots_adjust(wspace=0.1, hspace=0.3)
501 fig.colorbar(im, ax=axs[-1], shrink=0.8) 506 fig.colorbar(im, ax=axs[-1], shrink=0.8)
...@@ -537,10 +542,10 @@ if __name__ == "__main__": ...@@ -537,10 +542,10 @@ if __name__ == "__main__":
537 threads = [ 542 threads = [
538 th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}), 543 th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
539 th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}), 544 th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
540 - # th.Thread(target=stats_len), 545 + # th.Thread(target=stats_len), # computes figures
541 - # th.Thread(target=stats_freq), 546 + # th.Thread(target=stats_freq), # Updates the database
542 - # th.Thread(target=seq_idty), 547 + # th.Thread(target=seq_idty), # produces .npy files and seq idty figures
543 - # th.Thread(target=per_chain_stats) 548 + # th.Thread(target=per_chain_stats) # Updates the database
544 ] 549 ]
545 550
546 # Start the threads 551 # Start the threads
......