Showing 1 changed file with 59 additions and 22 deletions
... | @@ -42,6 +42,7 @@ errsymb = '\U0000274C' | ... | @@ -42,6 +42,7 @@ errsymb = '\U0000274C' |
42 | LSU_set = {"RF00002", "RF02540", "RF02541", "RF02543", "RF02546"} # From Rfam CLAN 00112 | 42 | LSU_set = {"RF00002", "RF02540", "RF02541", "RF02543", "RF02546"} # From Rfam CLAN 00112 |
43 | SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam CLAN 00111 | 43 | SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam CLAN 00111 |
44 | no_nts_set = set() | 44 | no_nts_set = set() |
45 | +weird_mappings = set() | ||
45 | 46 | ||
46 | class NtPortionSelector(object): | 47 | class NtPortionSelector(object): |
47 | """Class passed to MMCIFIO to select some chain portions in an MMCIF file. | 48 | """Class passed to MMCIFIO to select some chain portions in an MMCIF file. |
... | @@ -273,13 +274,6 @@ class Chain: | ... | @@ -273,13 +274,6 @@ class Chain: |
273 | self.error_messages = f"Error while parsing DSSR's json output:\n{e}" | 274 | self.error_messages = f"Error while parsing DSSR's json output:\n{e}" |
274 | return 1 | 275 | return 1 |
275 | 276 | ||
276 | - # Remove nucleotides of the chain that are outside the Rfam mapping, if any | ||
277 | - if self.pdb_start and self.pdb_end: | ||
278 | - if self.pdb_start < self.pdb_end: | ||
279 | - df = df.drop(df[(df.nt_resnum < self.pdb_start) | (df.nt_resnum > self.pdb_end)].index) | ||
280 | - else: | ||
281 | - df = df.drop(df[(df.nt_resnum < self.pdb_end) | (df.nt_resnum > self.pdb_start)].index) | ||
282 | - | ||
283 | ############################################# | 277 | ############################################# |
284 | # Solve some common issues and drop ligands | 278 | # Solve some common issues and drop ligands |
285 | ############################################# | 279 | ############################################# |
... | @@ -298,6 +292,9 @@ class Chain: | ... | @@ -298,6 +292,9 @@ class Chain: |
298 | or (len(df.index_chain) >= 2 and df.iloc[[-1]].nt_resnum.iloc[0] > 50 + df.iloc[[-2]].nt_resnum.iloc[0])): | 292 | or (len(df.index_chain) >= 2 and df.iloc[[-1]].nt_resnum.iloc[0] > 50 + df.iloc[[-2]].nt_resnum.iloc[0])): |
299 | df = df.head(-1) | 293 | df = df.head(-1) |
300 | 294 | ||
295 | + # drop any nts with a negative index_chain, i.e. located before the first residue (usually ligands) | ||
296 | + df = df.drop(df[df.index_chain < 0].index) | ||
297 | + | ||
301 | # Assert some nucleotides still exist | 298 | # Assert some nucleotides still exist |
302 | try: | 299 | try: |
303 | l = df.iloc[-1,1] - df.iloc[0,1] + 1 # length of chain from nt_resnum point of view | 300 | l = df.iloc[-1,1] - df.iloc[0,1] + 1 # length of chain from nt_resnum point of view |
... | @@ -334,13 +331,31 @@ class Chain: | ... | @@ -334,13 +331,31 @@ class Chain: |
334 | df.iloc[i+1:, 1] += 1 | 331 | df.iloc[i+1:, 1] += 1 |
335 | else: | 332 | else: |
336 | warn(f"Missing index_chain {i} in {self.chain_label} !") | 333 | warn(f"Missing index_chain {i} in {self.chain_label} !") |
337 | - df = df.drop(df[df.index_chain < 0].index) # drop eventual ones with index_chain < the first residue (usually, ligands) | ||
338 | 334 | ||
339 | - # Re-Assert some nucleotides still exist | 335 | + # Remove nucleotides of the chain that are outside the Rfam mapping, if any |
336 | + if self.pdb_start and self.pdb_end: | ||
337 | + if self.pdb_start < self.pdb_end: | ||
338 | + newdf = df.drop(df[(df.nt_resnum < self.pdb_start) | (df.nt_resnum > self.pdb_end)].index) | ||
339 | + else: | ||
340 | + newdf = df.drop(df[(df.nt_resnum < self.pdb_end) | (df.nt_resnum > self.pdb_start)].index) | ||
341 | + | ||
342 | + if len(newdf.index_chain) > 0: | ||
343 | + # everything's okay | ||
344 | + df = newdf | ||
345 | + else: | ||
346 | + # There were nucleotides in this chain but we removed them all while | ||
347 | + # filtering the ones outside the Rfam mapping. | ||
348 | + # This probably means that, for this chain, the mapping is relative to | ||
349 | + # index_chain and not nt_resnum. | ||
350 | + warn(f"Assuming {self.chain_label}'s mapping to {self.rfam_fam} is an absolute position interval.") | ||
351 | + weird_mappings.add(self.chain_label + "." + self.rfam_fam) | ||
352 | + df = df.drop(df[(df.index_chain < self.pdb_start) | (df.index_chain > self.pdb_end)].index) | ||
353 | + | ||
340 | try: | 354 | try: |
341 | - l = df.iloc[-1,1] - df.iloc[0,1] + 1 # length of chain from nt_resnum point of view | 355 | + l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view |
342 | except IndexError: | 356 | except IndexError: |
343 | - warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.", error=True) | 357 | + warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.pdb_start} and " |
358 | + f"{self.pdb_end} ({'not' if not self.inferred else ''} inferred). Ignoring chain {self.chain_label}.", error=True) | ||
344 | no_nts_set.add(self.pdb_id) | 359 | no_nts_set.add(self.pdb_id) |
345 | self.delete_me = True | 360 | self.delete_me = True |
346 | self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." | 361 | self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." |
... | @@ -360,7 +375,7 @@ class Chain: | ... | @@ -360,7 +375,7 @@ class Chain: |
360 | # portion solved in 3D 1 |--------------|79 85|------------| 156 | 375 | # portion solved in 3D 1 |--------------|79 85|------------| 156 |
361 | # Rfam mapping 3 |------------------------------------------ ... -------| 3353 (yes larger, 'cause it could be inferred) | 376 | # Rfam mapping 3 |------------------------------------------ ... -------| 3353 (yes larger, 'cause it could be inferred) |
362 | # nt resnum 3 |--------------------------------| 156 | 377 | # nt resnum 3 |--------------------------------| 156 |
363 | - # index_chain 1 |-------------|77 83|------------| 149 (before correction) | 378 | + # index_chain 1 |-------------|77 83|------------| 154 |
364 | # expected data point 1 |--------------------------------| 154 | 379 | # expected data point 1 |--------------------------------| 154 |
365 | # | 380 | # |
366 | 381 | ||
... | @@ -537,6 +552,8 @@ class Chain: | ... | @@ -537,6 +552,8 @@ class Chain: |
537 | warn(f"{self.chain_label} sequence is too short, let's ignore it.\t", error=True) | 552 | warn(f"{self.chain_label} sequence is too short, let's ignore it.\t", error=True) |
538 | self.delete_me = True | 553 | self.delete_me = True |
539 | self.error_messages = "Sequence is too short. (< 5 resolved nts)" | 554 | self.error_messages = "Sequence is too short. (< 5 resolved nts)" |
555 | + return 1 | ||
556 | + | ||
540 | return 0 | 557 | return 0 |
541 | 558 | ||
542 | def remap(self, columns_to_save, s_seq): | 559 | def remap(self, columns_to_save, s_seq): |
... | @@ -1352,10 +1369,10 @@ class Pipeline: | ... | @@ -1352,10 +1369,10 @@ class Pipeline: |
1352 | if self.HOMOLOGY: | 1369 | if self.HOMOLOGY: |
1353 | # check if chains have been re_mapped: | 1370 | # check if chains have been re_mapped: |
1354 | r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain | 1371 | r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain |
1355 | - WHERE chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping) | 1372 | + WHERE issue = 0 AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping) |
1356 | GROUP BY rfam_acc;""") | 1373 | GROUP BY rfam_acc;""") |
1357 | if len(r) and r[0][0] is not None: | 1374 | if len(r) and r[0][0] is not None: |
1358 | - warn("Chains were not remapped (This happens if we have known issues for example):") | 1375 | + warn("Chains were not remapped:") |
1359 | for x in r: | 1376 | for x in r: |
1360 | print(str(x[0]) + " chains of family " + x[1]) | 1377 | print(str(x[0]) + " chains of family " + x[1]) |
1361 | 1378 | ||
... | @@ -1999,24 +2016,43 @@ def work_realign(rfam_acc): | ... | @@ -1999,24 +2016,43 @@ def work_realign(rfam_acc): |
1999 | # there are no new sequences to align... | 2016 | # there are no new sequences to align... |
2000 | return | 2017 | return |
2001 | 2018 | ||
2019 | + existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk" | ||
2020 | + new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk" | ||
2021 | + | ||
2002 | # Align the new sequences | 2022 | # Align the new sequences |
2003 | - with open(path_to_seq_data + f"realigned/{rfam_acc}_new.stk", 'w') as o: | 2023 | + with open(new_ali_path, 'w') as o: |
2004 | p1 = subprocess.run(["cmalign", path_to_seq_data + f"realigned/{rfam_acc}.cm", | 2024 | p1 = subprocess.run(["cmalign", path_to_seq_data + f"realigned/{rfam_acc}.cm", |
2005 | path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], | 2025 | path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], |
2006 | stdout=o, stderr=subprocess.PIPE) | 2026 | stdout=o, stderr=subprocess.PIPE) |
2007 | notify("Aligned new sequences together") | 2027 | notify("Aligned new sequences together") |
2008 | 2028 | ||
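2029 | + # Detect duplicates ("doublons": sequence ids present in both the existing and the new alignment) and remove them from the existing one | ||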
2030 | + existing_stk = AlignIO.parse(existing_ali_path, "stk") | ||
2031 | + existing_ids = [ r.id for r in existing_stk ] | ||
2032 | + del existing_stk | ||
2033 | + new_stk = AlignIO.parse(new_ali_path, "stk") | ||
2034 | + new_ids = [ r.id for r in new_stk ] | ||
2035 | + del new_stk | ||
2036 | + doublons = [ i for i in existing_ids if i in new_ids ] | ||
2037 | + del existing_ids, new_ids | ||
2038 | + if len(doublons): | ||
2039 | + warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version") | ||
2040 | + with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove: | ||
2041 | + toremove.write('\n'.join(doublons)+'\n') | ||
2042 | + p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path], | ||
2043 | + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2044 | + os.remove(path_to_seq_data + "realigned/toremove.txt") | ||
2045 | + | ||
2009 | # And we merge the two alignments | 2046 | # And we merge the two alignments |
2010 | - p2= subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", "--rna", | 2047 | + p2= subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", |
2011 | - path_to_seq_data + f"realigned/{rfam_acc}++.stk", | 2048 | + "--rna", existing_ali_path, new_ali_path ], |
2012 | - path_to_seq_data + f"realigned/{rfam_acc}_new.stk" ], | ||
2013 | stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | 2049 | stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) |
2014 | stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8') | 2050 | stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8') |
2015 | - subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) | 2051 | + subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path]) |
2016 | notify("Merged alignments into one") | 2052 | notify("Merged alignments into one") |
2017 | 2053 | ||
2018 | # remove the partial files | 2054 | # remove the partial files |
2019 | - os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.stk") | 2055 | + os.remove(new_ali_path) |
2020 | os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa") | 2056 | os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa") |
2021 | 2057 | ||
2022 | else: | 2058 | else: |
... | @@ -2041,7 +2077,7 @@ def work_realign(rfam_acc): | ... | @@ -2041,7 +2077,7 @@ def work_realign(rfam_acc): |
2041 | 2077 | ||
2042 | # Convert Stockholm to aligned FASTA | 2078 | # Convert Stockholm to aligned FASTA |
2043 | subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", "--informat", "stockholm", "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) | 2079 | subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", "--informat", "stockholm", "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) |
2044 | - subprocess.run(["rm", "-f", "esltmp*"]) | 2080 | + subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part. |
2045 | 2081 | ||
2046 | # Assert everything worked, or save an error | 2082 | # Assert everything worked, or save an error |
2047 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa") as output: | 2083 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa") as output: |
... | @@ -2248,6 +2284,8 @@ if __name__ == "__main__": | ... | @@ -2248,6 +2284,8 @@ if __name__ == "__main__": |
2248 | print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).") | 2284 | print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).") |
2249 | if len(no_nts_set): | 2285 | if len(no_nts_set): |
2250 | print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True) | 2286 | print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True) |
2287 | + if len(weird_mappings): | ||
2288 | + print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True) | ||
2251 | pp.checkpoint_save_chains() | 2289 | pp.checkpoint_save_chains() |
2252 | 2290 | ||
2253 | if not pp.HOMOLOGY: | 2291 | if not pp.HOMOLOGY: |
... | @@ -2280,7 +2318,6 @@ if __name__ == "__main__": | ... | @@ -2280,7 +2318,6 @@ if __name__ == "__main__": |
2280 | pp.prepare_sequences() | 2318 | pp.prepare_sequences() |
2281 | pp.realign() | 2319 | pp.realign() |
2282 | 2320 | ||
2283 | - | ||
2284 | # At this point, the family table is up to date | 2321 | # At this point, the family table is up to date |
2285 | 2322 | ||
2286 | thr_idx_mgr = Manager() | 2323 | thr_idx_mgr = Manager() | ... | ... |
-
Please register or login to post a comment