Louis BECQUEY

Registers ignored chains to database

Showing 1 changed file with 51 additions and 43 deletions
...@@ -230,7 +230,7 @@ class Chain: ...@@ -230,7 +230,7 @@ class Chain:
230 warn("Could not load "+self.pdb_id+f".json with JSON package: {e}", error=True) 230 warn("Could not load "+self.pdb_id+f".json with JSON package: {e}", error=True)
231 self.delete_me = True 231 self.delete_me = True
232 self.error_messages = f"Could not load existing {self.pdb_id}.json file: {e}" 232 self.error_messages = f"Could not load existing {self.pdb_id}.json file: {e}"
233 - return 1 233 + return None
234 234
235 # Print eventual warnings given by DSSR, and abort if there are some 235 # Print eventual warnings given by DSSR, and abort if there are some
236 if "warning" in json_object.keys(): 236 if "warning" in json_object.keys():
...@@ -239,7 +239,7 @@ class Chain: ...@@ -239,7 +239,7 @@ class Chain:
239 no_nts_set.add(self.pdb_id) 239 no_nts_set.add(self.pdb_id)
240 self.delete_me = True 240 self.delete_me = True
241 self.error_messages = f"DSSR warning {self.pdb_id}.json: {json_object['warning']}. Ignoring {self.chain_label}." 241 self.error_messages = f"DSSR warning {self.pdb_id}.json: {json_object['warning']}. Ignoring {self.chain_label}."
242 - return 1 242 + return None
243 243
244 ############################################ 244 ############################################
245 # Create the data-frame 245 # Create the data-frame
...@@ -252,11 +252,11 @@ class Chain: ...@@ -252,11 +252,11 @@ class Chain:
252 252
253 # Assert nucleotides of the chain are found 253 # Assert nucleotides of the chain are found
254 if df.empty: 254 if df.empty:
255 - warn(f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.", error=True) 255 + warn(f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.")
256 no_nts_set.add(self.pdb_id) 256 no_nts_set.add(self.pdb_id)
257 self.delete_me = True 257 self.delete_me = True
258 - self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." 258 + self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Either there is a problem with {self.pdb_id} mmCIF download, or the bases are not resolved in the structure. Delete it and retry."
259 - return 1 259 + return None
260 260
261 # Remove low pertinence or undocumented descriptors, convert angles values 261 # Remove low pertinence or undocumented descriptors, convert angles values
262 cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", 262 cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta",
...@@ -272,7 +272,7 @@ class Chain: ...@@ -272,7 +272,7 @@ class Chain:
272 warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True) 272 warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True)
273 self.delete_me = True 273 self.delete_me = True
274 self.error_messages = f"Error while parsing DSSR's json output:\n{e}" 274 self.error_messages = f"Error while parsing DSSR's json output:\n{e}"
275 - return 1 275 + return None
276 276
277 ############################################# 277 #############################################
278 # Solve some common issues and drop ligands 278 # Solve some common issues and drop ligands
...@@ -303,7 +303,7 @@ class Chain: ...@@ -303,7 +303,7 @@ class Chain:
303 no_nts_set.add(self.pdb_id) 303 no_nts_set.add(self.pdb_id)
304 self.delete_me = True 304 self.delete_me = True
305 self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." 305 self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry."
306 - return 1 306 + return None
307 307
308 # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one 308 # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one
309 if df.iloc[0,0] != 1: 309 if df.iloc[0,0] != 1:
...@@ -355,11 +355,11 @@ class Chain: ...@@ -355,11 +355,11 @@ class Chain:
355 l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view 355 l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view
356 except IndexError: 356 except IndexError:
357 warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.pdb_start} and " 357 warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.pdb_start} and "
358 - f"{self.pdb_end} ({'not' if not self.inferred else ''} inferred). Ignoring chain {self.chain_label}.", error=True) 358 + f"{self.pdb_end} ({'not ' if not self.inferred else ''}inferred). Ignoring chain {self.chain_label}.")
359 no_nts_set.add(self.pdb_id) 359 no_nts_set.add(self.pdb_id)
360 self.delete_me = True 360 self.delete_me = True
361 - self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." 361 + self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Either there is a problem with {self.pdb_id} mmCIF download, or the bases are not resolved in the structure. Delete it and retry."
362 - return 1 362 + return None
363 363
364 # Add eventual missing rows because of unsolved residues in the chain. 364 # Add eventual missing rows because of unsolved residues in the chain.
365 # Sometimes, the 3D structure is REALLY shorter than the family it's mapped to, 365 # Sometimes, the 3D structure is REALLY shorter than the family it's mapped to,
...@@ -500,9 +500,18 @@ class Chain: ...@@ -500,9 +500,18 @@ class Chain:
500 self.seq_to_align = "".join(df.nt_align_code) 500 self.seq_to_align = "".join(df.nt_align_code)
501 self.length = len([ x for x in self.seq_to_align if x != "-" ]) 501 self.length = len([ x for x in self.seq_to_align if x != "-" ])
502 502
503 - #################################### 503 + # Remove too short chains
504 - # Save everything to database 504 + if self.length < 5:
505 - #################################### 505 + warn(f"{self.chain_label} sequence is too short, let's ignore it.\t")
506 + self.delete_me = True
507 + self.error_messages = "Sequence is too short. (< 5 resolved nts)"
508 + return None
509 +
510 + return df
511 +
512 + def register_chain(self, df):
513 + """Saves the extracted 3D data to the database.
514 + """
506 515
507 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: 516 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
508 # Register the chain in table chain 517 # Register the chain in table chain
...@@ -535,26 +544,19 @@ class Chain: ...@@ -535,26 +544,19 @@ class Chain:
535 AND chain_name='{self.pdb_chain_id}' 544 AND chain_name='{self.pdb_chain_id}'
536 AND rfam_acc IS NULL;""")[0][0] 545 AND rfam_acc IS NULL;""")[0][0]
537 546
538 - # Add the nucleotides 547 + # Add the nucleotides if the chain is not an issue
539 - sql_execute(conn, f""" 548 + if df is not None and not self.delete_me: # double condition is theoretically redundant here, but you never know
540 - INSERT OR IGNORE INTO nucleotide 549 + sql_execute(conn, f"""
541 - (chain_id, index_chain, nt_resnum, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta, 550 + INSERT OR IGNORE INTO nucleotide
542 - epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, 551 + (chain_id, index_chain, nt_resnum, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta,
543 - v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position, 552 + epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
544 - paired, pair_type_LW, pair_type_DSSR, nb_interact) 553 + v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position,
545 - VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 554 + paired, pair_type_LW, pair_type_DSSR, nb_interact)
546 - ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 555 + VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
547 - ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""", 556 + ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
548 - many=True, data=list(df.to_records(index=False)), warn_every=10) 557 + ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
558 + many=True, data=list(df.to_records(index=False)), warn_every=10)
549 559
550 - # Remove too short chains
551 - if self.length < 5:
552 - warn(f"{self.chain_label} sequence is too short, let's ignore it.\t", error=True)
553 - self.delete_me = True
554 - self.error_messages = "Sequence is too short. (< 5 resolved nts)"
555 - return 1
556 -
557 - return 0
558 560
559 def remap(self, columns_to_save, s_seq): 561 def remap(self, columns_to_save, s_seq):
560 """Maps the object's sequence to its version in a MSA, to compute nucleotide frequencies at every position. 562 """Maps the object's sequence to its version in a MSA, to compute nucleotide frequencies at every position.
...@@ -1355,13 +1357,13 @@ class Pipeline: ...@@ -1355,13 +1357,13 @@ class Pipeline:
1355 conn = sqlite3.connect(runDir + "/results/RNANet.db") 1357 conn = sqlite3.connect(runDir + "/results/RNANet.db")
1356 1358
1357 # Assert every structure is used 1359 # Assert every structure is used
1358 - r = sql_ask_database(conn, """SELECT DISTINCT pdb_id FROM structure WHERE pdb_id NOT IN (SELECT DISTINCT structure_id FROM chain WHERE issue = 0);""") 1360 + r = sql_ask_database(conn, """SELECT DISTINCT pdb_id FROM structure WHERE pdb_id NOT IN (SELECT DISTINCT structure_id FROM chain);""")
1359 if len(r) and r[0][0] is not None: 1361 if len(r) and r[0][0] is not None:
1360 warn("Structures without referenced chains have been detected.") 1362 warn("Structures without referenced chains have been detected.")
1361 print(" ".join([x[0] for x in r])) 1363 print(" ".join([x[0] for x in r]))
1362 1364
1363 # Assert every chain is attached to a structure 1365 # Assert every chain is attached to a structure
1364 - r = sql_ask_database(conn, """SELECT DISTINCT chain_id, structure_id FROM chain WHERE structure_id NOT IN (SELECT DISTINCT pdb_id FROM structure) AND issue = 0;""") 1366 + r = sql_ask_database(conn, """SELECT DISTINCT chain_id, structure_id FROM chain WHERE structure_id NOT IN (SELECT DISTINCT pdb_id FROM structure);""")
1365 if len(r) and r[0][0] is not None: 1367 if len(r) and r[0][0] is not None:
1366 warn("Chains without referenced structures have been detected") 1368 warn("Chains without referenced structures have been detected")
1367 print(" ".join([str(x[1])+'-'+str(x[0]) for x in r])) 1369 print(" ".join([str(x[1])+'-'+str(x[0]) for x in r]))
...@@ -1371,11 +1373,16 @@ class Pipeline: ...@@ -1371,11 +1373,16 @@ class Pipeline:
1371 r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain 1373 r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain
1372 WHERE issue = 0 AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping) 1374 WHERE issue = 0 AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping)
1373 GROUP BY rfam_acc;""") 1375 GROUP BY rfam_acc;""")
1374 - if len(r) and r[0][0] is not None: 1376 + try:
1375 - warn("Chains were not remapped:") 1377 + if len(r) and r[0][0] is not None:
1376 - for x in r: 1378 + warn("Chains were not remapped:")
1377 - print(str(x[0]) + " chains of family " + x[1]) 1379 + for x in r:
1378 - 1380 + print(str(x[0]) + " chains of family " + x[1])
1381 + except TypeError as e:
1382 + print(r)
1383 + print(next(r))
1384 + print(e)
1385 + exit()
1379 # # TODO : Optimize this (too slow) 1386 # # TODO : Optimize this (too slow)
1380 # # check if some columns are missing in the remappings: 1387 # # check if some columns are missing in the remappings:
1381 # r = sql_ask_database(conn, """SELECT c.chain_id, c.structure_id, c.chain_name, c.rfam_acc, r.index_chain, r.index_ali 1388 # r = sql_ask_database(conn, """SELECT c.chain_id, c.structure_id, c.chain_name, c.rfam_acc, r.index_chain, r.index_ali
...@@ -1897,7 +1904,8 @@ def work_build_chain(c, extract, khetatm, retrying=False): ...@@ -1897,7 +1904,8 @@ def work_build_chain(c, extract, khetatm, retrying=False):
1897 1904
1898 # extract the 3D descriptors 1905 # extract the 3D descriptors
1899 if not c.delete_me: 1906 if not c.delete_me:
1900 - c.extract_3D_data() 1907 + df = c.extract_3D_data()
1908 + c.register_chain(df)
1901 1909
1902 # Small check 1910 # Small check
1903 if not c.delete_me: 1911 if not c.delete_me:
...@@ -2027,7 +2035,7 @@ def work_realign(rfam_acc): ...@@ -2027,7 +2035,7 @@ def work_realign(rfam_acc):
2027 notify("Aligned new sequences together") 2035 notify("Aligned new sequences together")
2028 2036
2029 # Detect doublons and remove them 2037 # Detect doublons and remove them
2030 - existing_stk = AlignIO.parse(existing_ali_path, "stk") 2038 + existing_stk = AlignIO.parse(existing_ali_path, "stockholm")
2031 existing_ids = [ r.id for r in existing_stk ] 2039 existing_ids = [ r.id for r in existing_stk ]
2032 del existing_stk 2040 del existing_stk
2033 new_stk = AlignIO.parse(new_ali_path, "stk") 2041 new_stk = AlignIO.parse(new_ali_path, "stk")
...@@ -2281,7 +2289,7 @@ if __name__ == "__main__": ...@@ -2281,7 +2289,7 @@ if __name__ == "__main__":
2281 print("> Retrying to annotate some structures which just failed.", flush=True) 2289 print("> Retrying to annotate some structures which just failed.", flush=True)
2282 pp.dl_and_annotate(retry=True, coeff_ncores=0.3) # 2290 pp.dl_and_annotate(retry=True, coeff_ncores=0.3) #
2283 pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce required amount of memory 2291 pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce required amount of memory
2284 - print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).") 2292 + print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} ignored/errors).")
2285 if len(no_nts_set): 2293 if len(no_nts_set):
2286 print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True) 2294 print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True)
2287 if len(weird_mappings): 2295 if len(weird_mappings):
...@@ -2311,7 +2319,7 @@ if __name__ == "__main__": ...@@ -2311,7 +2319,7 @@ if __name__ == "__main__":
2311 rfam_acc_to_download[c.rfam_fam] = [ c ] 2319 rfam_acc_to_download[c.rfam_fam] = [ c ]
2312 else: 2320 else:
2313 rfam_acc_to_download[c.rfam_fam].append(c) 2321 rfam_acc_to_download[c.rfam_fam].append(c)
2314 - print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences:") 2322 + print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences")
2315 pp.fam_list = sorted(rfam_acc_to_download.keys()) 2323 pp.fam_list = sorted(rfam_acc_to_download.keys())
2316 2324
2317 if len(pp.fam_list): 2325 if len(pp.fam_list):
......