Showing
1 changed file
with
51 additions
and
43 deletions
... | @@ -230,7 +230,7 @@ class Chain: | ... | @@ -230,7 +230,7 @@ class Chain: |
230 | warn("Could not load "+self.pdb_id+f".json with JSON package: {e}", error=True) | 230 | warn("Could not load "+self.pdb_id+f".json with JSON package: {e}", error=True) |
231 | self.delete_me = True | 231 | self.delete_me = True |
232 | self.error_messages = f"Could not load existing {self.pdb_id}.json file: {e}" | 232 | self.error_messages = f"Could not load existing {self.pdb_id}.json file: {e}" |
233 | - return 1 | 233 | + return None |
234 | 234 | ||
235 | # Print eventual warnings given by DSSR, and abort if there are some | 235 | # Print eventual warnings given by DSSR, and abort if there are some |
236 | if "warning" in json_object.keys(): | 236 | if "warning" in json_object.keys(): |
... | @@ -239,7 +239,7 @@ class Chain: | ... | @@ -239,7 +239,7 @@ class Chain: |
239 | no_nts_set.add(self.pdb_id) | 239 | no_nts_set.add(self.pdb_id) |
240 | self.delete_me = True | 240 | self.delete_me = True |
241 | self.error_messages = f"DSSR warning {self.pdb_id}.json: {json_object['warning']}. Ignoring {self.chain_label}." | 241 | self.error_messages = f"DSSR warning {self.pdb_id}.json: {json_object['warning']}. Ignoring {self.chain_label}." |
242 | - return 1 | 242 | + return None |
243 | 243 | ||
244 | ############################################ | 244 | ############################################ |
245 | # Create the data-frame | 245 | # Create the data-frame |
... | @@ -252,11 +252,11 @@ class Chain: | ... | @@ -252,11 +252,11 @@ class Chain: |
252 | 252 | ||
253 | # Assert nucleotides of the chain are found | 253 | # Assert nucleotides of the chain are found |
254 | if df.empty: | 254 | if df.empty: |
255 | - warn(f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.", error=True) | 255 | + warn(f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.") |
256 | no_nts_set.add(self.pdb_id) | 256 | no_nts_set.add(self.pdb_id) |
257 | self.delete_me = True | 257 | self.delete_me = True |
258 | - self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." | 258 | + self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Either there is a problem with {self.pdb_id} mmCIF download, or the bases are not resolved in the structure. Delete it and retry." |
259 | - return 1 | 259 | + return None |
260 | 260 | ||
261 | # Remove low pertinence or undocumented descriptors, convert angles values | 261 | # Remove low pertinence or undocumented descriptors, convert angles values |
262 | cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", | 262 | cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", |
... | @@ -272,7 +272,7 @@ class Chain: | ... | @@ -272,7 +272,7 @@ class Chain: |
272 | warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True) | 272 | warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True) |
273 | self.delete_me = True | 273 | self.delete_me = True |
274 | self.error_messages = f"Error while parsing DSSR's json output:\n{e}" | 274 | self.error_messages = f"Error while parsing DSSR's json output:\n{e}" |
275 | - return 1 | 275 | + return None |
276 | 276 | ||
277 | ############################################# | 277 | ############################################# |
278 | # Solve some common issues and drop ligands | 278 | # Solve some common issues and drop ligands |
... | @@ -303,7 +303,7 @@ class Chain: | ... | @@ -303,7 +303,7 @@ class Chain: |
303 | no_nts_set.add(self.pdb_id) | 303 | no_nts_set.add(self.pdb_id) |
304 | self.delete_me = True | 304 | self.delete_me = True |
305 | self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." | 305 | self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." |
306 | - return 1 | 306 | + return None |
307 | 307 | ||
308 | # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one | 308 | # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one |
309 | if df.iloc[0,0] != 1: | 309 | if df.iloc[0,0] != 1: |
... | @@ -355,11 +355,11 @@ class Chain: | ... | @@ -355,11 +355,11 @@ class Chain: |
355 | l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view | 355 | l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view |
356 | except IndexError: | 356 | except IndexError: |
357 | warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.pdb_start} and " | 357 | warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.pdb_start} and " |
358 | - f"{self.pdb_end} ({'not' if not self.inferred else ''} inferred). Ignoring chain {self.chain_label}.", error=True) | 358 | + f"{self.pdb_end} ({'not ' if not self.inferred else ''}inferred). Ignoring chain {self.chain_label}.") |
359 | no_nts_set.add(self.pdb_id) | 359 | no_nts_set.add(self.pdb_id) |
360 | self.delete_me = True | 360 | self.delete_me = True |
361 | - self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." | 361 | + self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Either there is a problem with {self.pdb_id} mmCIF download, or the bases are not resolved in the structure. Delete it and retry." |
362 | - return 1 | 362 | + return None |
363 | 363 | ||
364 | # Add eventual missing rows because of unsolved residues in the chain. | 364 | # Add eventual missing rows because of unsolved residues in the chain. |
365 | # Sometimes, the 3D structure is REALLY shorter than the family it's mapped to, | 365 | # Sometimes, the 3D structure is REALLY shorter than the family it's mapped to, |
... | @@ -500,9 +500,18 @@ class Chain: | ... | @@ -500,9 +500,18 @@ class Chain: |
500 | self.seq_to_align = "".join(df.nt_align_code) | 500 | self.seq_to_align = "".join(df.nt_align_code) |
501 | self.length = len([ x for x in self.seq_to_align if x != "-" ]) | 501 | self.length = len([ x for x in self.seq_to_align if x != "-" ]) |
502 | 502 | ||
503 | - #################################### | 503 | + # Remove too short chains |
504 | - # Save everything to database | 504 | + if self.length < 5: |
505 | - #################################### | 505 | + warn(f"{self.chain_label} sequence is too short, let's ignore it.\t") |
506 | + self.delete_me = True | ||
507 | + self.error_messages = "Sequence is too short. (< 5 resolved nts)" | ||
508 | + return None | ||
509 | + | ||
510 | + return df | ||
511 | + | ||
512 | + def register_chain(self, df): | ||
513 | + """Saves the extracted 3D data to the database. | ||
514 | + """ | ||
506 | 515 | ||
507 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: | 516 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: |
508 | # Register the chain in table chain | 517 | # Register the chain in table chain |
... | @@ -535,26 +544,19 @@ class Chain: | ... | @@ -535,26 +544,19 @@ class Chain: |
535 | AND chain_name='{self.pdb_chain_id}' | 544 | AND chain_name='{self.pdb_chain_id}' |
536 | AND rfam_acc IS NULL;""")[0][0] | 545 | AND rfam_acc IS NULL;""")[0][0] |
537 | 546 | ||
538 | - # Add the nucleotides | 547 | + # Add the nucleotides if the chain is not an issue |
539 | - sql_execute(conn, f""" | 548 | + if df is not None and not self.delete_me: # double condition is theoretically redundant here, but you never know |
540 | - INSERT OR IGNORE INTO nucleotide | 549 | + sql_execute(conn, f""" |
541 | - (chain_id, index_chain, nt_resnum, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta, | 550 | + INSERT OR IGNORE INTO nucleotide |
542 | - epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, | 551 | + (chain_id, index_chain, nt_resnum, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta, |
543 | - v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position, | 552 | + epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, |
544 | - paired, pair_type_LW, pair_type_DSSR, nb_interact) | 553 | + v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position, |
545 | - VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, | 554 | + paired, pair_type_LW, pair_type_DSSR, nb_interact) |
546 | - ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, | 555 | + VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, |
547 | - ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""", | 556 | + ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, |
548 | - many=True, data=list(df.to_records(index=False)), warn_every=10) | 557 | + ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""", |
558 | + many=True, data=list(df.to_records(index=False)), warn_every=10) | ||
549 | 559 | ||
550 | - # Remove too short chains | ||
551 | - if self.length < 5: | ||
552 | - warn(f"{self.chain_label} sequence is too short, let's ignore it.\t", error=True) | ||
553 | - self.delete_me = True | ||
554 | - self.error_messages = "Sequence is too short. (< 5 resolved nts)" | ||
555 | - return 1 | ||
556 | - | ||
557 | - return 0 | ||
558 | 560 | ||
559 | def remap(self, columns_to_save, s_seq): | 561 | def remap(self, columns_to_save, s_seq): |
560 | """Maps the object's sequence to its version in a MSA, to compute nucleotide frequencies at every position. | 562 | """Maps the object's sequence to its version in a MSA, to compute nucleotide frequencies at every position. |
... | @@ -1355,13 +1357,13 @@ class Pipeline: | ... | @@ -1355,13 +1357,13 @@ class Pipeline: |
1355 | conn = sqlite3.connect(runDir + "/results/RNANet.db") | 1357 | conn = sqlite3.connect(runDir + "/results/RNANet.db") |
1356 | 1358 | ||
1357 | # Assert every structure is used | 1359 | # Assert every structure is used |
1358 | - r = sql_ask_database(conn, """SELECT DISTINCT pdb_id FROM structure WHERE pdb_id NOT IN (SELECT DISTINCT structure_id FROM chain WHERE issue = 0);""") | 1360 | + r = sql_ask_database(conn, """SELECT DISTINCT pdb_id FROM structure WHERE pdb_id NOT IN (SELECT DISTINCT structure_id FROM chain);""") |
1359 | if len(r) and r[0][0] is not None: | 1361 | if len(r) and r[0][0] is not None: |
1360 | warn("Structures without referenced chains have been detected.") | 1362 | warn("Structures without referenced chains have been detected.") |
1361 | print(" ".join([x[0] for x in r])) | 1363 | print(" ".join([x[0] for x in r])) |
1362 | 1364 | ||
1363 | # Assert every chain is attached to a structure | 1365 | # Assert every chain is attached to a structure |
1364 | - r = sql_ask_database(conn, """SELECT DISTINCT chain_id, structure_id FROM chain WHERE structure_id NOT IN (SELECT DISTINCT pdb_id FROM structure) AND issue = 0;""") | 1366 | + r = sql_ask_database(conn, """SELECT DISTINCT chain_id, structure_id FROM chain WHERE structure_id NOT IN (SELECT DISTINCT pdb_id FROM structure);""") |
1365 | if len(r) and r[0][0] is not None: | 1367 | if len(r) and r[0][0] is not None: |
1366 | warn("Chains without referenced structures have been detected") | 1368 | warn("Chains without referenced structures have been detected") |
1367 | print(" ".join([str(x[1])+'-'+str(x[0]) for x in r])) | 1369 | print(" ".join([str(x[1])+'-'+str(x[0]) for x in r])) |
... | @@ -1371,11 +1373,16 @@ class Pipeline: | ... | @@ -1371,11 +1373,16 @@ class Pipeline: |
1371 | r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain | 1373 | r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain |
1372 | WHERE issue = 0 AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping) | 1374 | WHERE issue = 0 AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping) |
1373 | GROUP BY rfam_acc;""") | 1375 | GROUP BY rfam_acc;""") |
1374 | - if len(r) and r[0][0] is not None: | 1376 | + try: |
1375 | - warn("Chains were not remapped:") | 1377 | + if len(r) and r[0][0] is not None: |
1376 | - for x in r: | 1378 | + warn("Chains were not remapped:") |
1377 | - print(str(x[0]) + " chains of family " + x[1]) | 1379 | + for x in r: |
1378 | - | 1380 | + print(str(x[0]) + " chains of family " + x[1]) |
1381 | + except TypeError as e: | ||
1382 | + print(r) | ||
1383 | + print(next(r)) | ||
1384 | + print(e) | ||
1385 | + exit() | ||
1379 | # # TODO : Optimize this (too slow) | 1386 | # # TODO : Optimize this (too slow) |
1380 | # # check if some columns are missing in the remappings: | 1387 | # # check if some columns are missing in the remappings: |
1381 | # r = sql_ask_database(conn, """SELECT c.chain_id, c.structure_id, c.chain_name, c.rfam_acc, r.index_chain, r.index_ali | 1388 | # r = sql_ask_database(conn, """SELECT c.chain_id, c.structure_id, c.chain_name, c.rfam_acc, r.index_chain, r.index_ali |
... | @@ -1897,7 +1904,8 @@ def work_build_chain(c, extract, khetatm, retrying=False): | ... | @@ -1897,7 +1904,8 @@ def work_build_chain(c, extract, khetatm, retrying=False): |
1897 | 1904 | ||
1898 | # extract the 3D descriptors | 1905 | # extract the 3D descriptors |
1899 | if not c.delete_me: | 1906 | if not c.delete_me: |
1900 | - c.extract_3D_data() | 1907 | + df = c.extract_3D_data() |
1908 | + c.register_chain(df) | ||
1901 | 1909 | ||
1902 | # Small check | 1910 | # Small check |
1903 | if not c.delete_me: | 1911 | if not c.delete_me: |
... | @@ -2027,7 +2035,7 @@ def work_realign(rfam_acc): | ... | @@ -2027,7 +2035,7 @@ def work_realign(rfam_acc): |
2027 | notify("Aligned new sequences together") | 2035 | notify("Aligned new sequences together") |
2028 | 2036 | ||
2029 | # Detect doublons and remove them | 2037 | # Detect doublons and remove them |
2030 | - existing_stk = AlignIO.parse(existing_ali_path, "stk") | 2038 | + existing_stk = AlignIO.parse(existing_ali_path, "stockholm") |
2031 | existing_ids = [ r.id for r in existing_stk ] | 2039 | existing_ids = [ r.id for r in existing_stk ] |
2032 | del existing_stk | 2040 | del existing_stk |
2033 | new_stk = AlignIO.parse(new_ali_path, "stk") | 2041 | new_stk = AlignIO.parse(new_ali_path, "stk") |
... | @@ -2281,7 +2289,7 @@ if __name__ == "__main__": | ... | @@ -2281,7 +2289,7 @@ if __name__ == "__main__": |
2281 | print("> Retrying to annotate some structures which just failed.", flush=True) | 2289 | print("> Retrying to annotate some structures which just failed.", flush=True) |
2282 | pp.dl_and_annotate(retry=True, coeff_ncores=0.3) # | 2290 | pp.dl_and_annotate(retry=True, coeff_ncores=0.3) # |
2283 | pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce required amount of memory | 2291 | pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce required amount of memory |
2284 | - print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).") | 2292 | + print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} ignored/errors).") |
2285 | if len(no_nts_set): | 2293 | if len(no_nts_set): |
2286 | print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True) | 2294 | print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True) |
2287 | if len(weird_mappings): | 2295 | if len(weird_mappings): |
... | @@ -2311,7 +2319,7 @@ if __name__ == "__main__": | ... | @@ -2311,7 +2319,7 @@ if __name__ == "__main__": |
2311 | rfam_acc_to_download[c.rfam_fam] = [ c ] | 2319 | rfam_acc_to_download[c.rfam_fam] = [ c ] |
2312 | else: | 2320 | else: |
2313 | rfam_acc_to_download[c.rfam_fam].append(c) | 2321 | rfam_acc_to_download[c.rfam_fam].append(c) |
2314 | - print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences:") | 2322 | + print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences") |
2315 | pp.fam_list = sorted(rfam_acc_to_download.keys()) | 2323 | pp.fam_list = sorted(rfam_acc_to_download.keys()) |
2316 | 2324 | ||
2317 | if len(pp.fam_list): | 2325 | if len(pp.fam_list): | ... | ... |
-
Please register or login to post a comment