Louis BECQUEY

Check for doublons (duplicate sequences) when merging alignments

Showing 1 changed file with 59 additions and 22 deletions
...@@ -42,6 +42,7 @@ errsymb = '\U0000274C' ...@@ -42,6 +42,7 @@ errsymb = '\U0000274C'
42 LSU_set = {"RF00002", "RF02540", "RF02541", "RF02543", "RF02546"} # From Rfam CLAN 00112 42 LSU_set = {"RF00002", "RF02540", "RF02541", "RF02543", "RF02546"} # From Rfam CLAN 00112
43 SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam CLAN 00111 43 SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam CLAN 00111
44 no_nts_set = set() 44 no_nts_set = set()
45 +weird_mappings = set()
45 46
46 class NtPortionSelector(object): 47 class NtPortionSelector(object):
47 """Class passed to MMCIFIO to select some chain portions in an MMCIF file. 48 """Class passed to MMCIFIO to select some chain portions in an MMCIF file.
...@@ -273,13 +274,6 @@ class Chain: ...@@ -273,13 +274,6 @@ class Chain:
273 self.error_messages = f"Error while parsing DSSR's json output:\n{e}" 274 self.error_messages = f"Error while parsing DSSR's json output:\n{e}"
274 return 1 275 return 1
275 276
276 - # Remove nucleotides of the chain that are outside the Rfam mapping, if any
277 - if self.pdb_start and self.pdb_end:
278 - if self.pdb_start < self.pdb_end:
279 - df = df.drop(df[(df.nt_resnum < self.pdb_start) | (df.nt_resnum > self.pdb_end)].index)
280 - else:
281 - df = df.drop(df[(df.nt_resnum < self.pdb_end) | (df.nt_resnum > self.pdb_start)].index)
282 -
283 ############################################# 277 #############################################
284 # Solve some common issues and drop ligands 278 # Solve some common issues and drop ligands
285 ############################################# 279 #############################################
...@@ -298,6 +292,9 @@ class Chain: ...@@ -298,6 +292,9 @@ class Chain:
298 or (len(df.index_chain) >= 2 and df.iloc[[-1]].nt_resnum.iloc[0] > 50 + df.iloc[[-2]].nt_resnum.iloc[0])): 292 or (len(df.index_chain) >= 2 and df.iloc[[-1]].nt_resnum.iloc[0] > 50 + df.iloc[[-2]].nt_resnum.iloc[0])):
299 df = df.head(-1) 293 df = df.head(-1)
300 294
295 + # drop eventual nts with index_chain < the first residue (usually, ligands)
296 + df = df.drop(df[df.index_chain < 0].index)
297 +
301 # Assert some nucleotides still exist 298 # Assert some nucleotides still exist
302 try: 299 try:
303 l = df.iloc[-1,1] - df.iloc[0,1] + 1 # length of chain from nt_resnum point of view 300 l = df.iloc[-1,1] - df.iloc[0,1] + 1 # length of chain from nt_resnum point of view
...@@ -334,13 +331,31 @@ class Chain: ...@@ -334,13 +331,31 @@ class Chain:
334 df.iloc[i+1:, 1] += 1 331 df.iloc[i+1:, 1] += 1
335 else: 332 else:
336 warn(f"Missing index_chain {i} in {self.chain_label} !") 333 warn(f"Missing index_chain {i} in {self.chain_label} !")
337 - df = df.drop(df[df.index_chain < 0].index) # drop eventual ones with index_chain < the first residue (usually, ligands)
338 334
339 - # Re-Assert some nucleotides still exist 335 + # Remove nucleotides of the chain that are outside the Rfam mapping, if any
336 + if self.pdb_start and self.pdb_end:
337 + if self.pdb_start < self.pdb_end:
338 + newdf = df.drop(df[(df.nt_resnum < self.pdb_start) | (df.nt_resnum > self.pdb_end)].index)
339 + else:
340 + newdf = df.drop(df[(df.nt_resnum < self.pdb_end) | (df.nt_resnum > self.pdb_start)].index)
341 +
342 + if len(newdf.index_chain) > 0:
343 + # everything's okay
344 + df = newdf
345 + else:
346 + # There were nucleotides in this chain but we removed them all while
347 + # filtering the ones outside the Rfam mapping.
348 + # This probably means that, for this chain, the mapping is relative to
349 + # index_chain and not nt_resnum.
350 + warn(f"Assuming {self.chain_label}'s mapping to {self.rfam_fam} is an absolute position interval.")
351 + weird_mappings.add(self.chain_label + "." + self.rfam_fam)
352 + df = df.drop(df[(df.index_chain < self.pdb_start) | (df.index_chain > self.pdb_end)].index)
353 +
340 try: 354 try:
341 - l = df.iloc[-1,1] - df.iloc[0,1] + 1 # length of chain from nt_resnum point of view 355 + l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view
342 except IndexError: 356 except IndexError:
343 - warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.", error=True) 357 + warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.pdb_start} and "
358 + f"{self.pdb_end} ({'not' if not self.inferred else ''} inferred). Ignoring chain {self.chain_label}.", error=True)
344 no_nts_set.add(self.pdb_id) 359 no_nts_set.add(self.pdb_id)
345 self.delete_me = True 360 self.delete_me = True
346 self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." 361 self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry."
...@@ -360,7 +375,7 @@ class Chain: ...@@ -360,7 +375,7 @@ class Chain:
360 # portion solved in 3D 1 |--------------|79 85|------------| 156 375 # portion solved in 3D 1 |--------------|79 85|------------| 156
361 # Rfam mapping 3 |------------------------------------------ ... -------| 3353 (yes larger, 'cause it could be inferred) 376 # Rfam mapping 3 |------------------------------------------ ... -------| 3353 (yes larger, 'cause it could be inferred)
362 # nt resnum 3 |--------------------------------| 156 377 # nt resnum 3 |--------------------------------| 156
363 - # index_chain 1 |-------------|77 83|------------| 149 (before correction) 378 + # index_chain 1 |-------------|77 83|------------| 154
364 # expected data point 1 |--------------------------------| 154 379 # expected data point 1 |--------------------------------| 154
365 # 380 #
366 381
...@@ -537,6 +552,8 @@ class Chain: ...@@ -537,6 +552,8 @@ class Chain:
537 warn(f"{self.chain_label} sequence is too short, let's ignore it.\t", error=True) 552 warn(f"{self.chain_label} sequence is too short, let's ignore it.\t", error=True)
538 self.delete_me = True 553 self.delete_me = True
539 self.error_messages = "Sequence is too short. (< 5 resolved nts)" 554 self.error_messages = "Sequence is too short. (< 5 resolved nts)"
555 + return 1
556 +
540 return 0 557 return 0
541 558
542 def remap(self, columns_to_save, s_seq): 559 def remap(self, columns_to_save, s_seq):
...@@ -1352,10 +1369,10 @@ class Pipeline: ...@@ -1352,10 +1369,10 @@ class Pipeline:
1352 if self.HOMOLOGY: 1369 if self.HOMOLOGY:
1353 # check if chains have been re_mapped: 1370 # check if chains have been re_mapped:
1354 r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain 1371 r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain
1355 - WHERE chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping) 1372 + WHERE issue = 0 AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping)
1356 GROUP BY rfam_acc;""") 1373 GROUP BY rfam_acc;""")
1357 if len(r) and r[0][0] is not None: 1374 if len(r) and r[0][0] is not None:
1358 - warn("Chains were not remapped (This happens if we have known issues for example):") 1375 + warn("Chains were not remapped:")
1359 for x in r: 1376 for x in r:
1360 print(str(x[0]) + " chains of family " + x[1]) 1377 print(str(x[0]) + " chains of family " + x[1])
1361 1378
...@@ -1999,24 +2016,43 @@ def work_realign(rfam_acc): ...@@ -1999,24 +2016,43 @@ def work_realign(rfam_acc):
1999 # there are no new sequences to align... 2016 # there are no new sequences to align...
2000 return 2017 return
2001 2018
2019 + existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk"
2020 + new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk"
2021 +
2002 # Align the new sequences 2022 # Align the new sequences
2003 - with open(path_to_seq_data + f"realigned/{rfam_acc}_new.stk", 'w') as o: 2023 + with open(new_ali_path, 'w') as o:
2004 p1 = subprocess.run(["cmalign", path_to_seq_data + f"realigned/{rfam_acc}.cm", 2024 p1 = subprocess.run(["cmalign", path_to_seq_data + f"realigned/{rfam_acc}.cm",
2005 path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], 2025 path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
2006 stdout=o, stderr=subprocess.PIPE) 2026 stdout=o, stderr=subprocess.PIPE)
2007 notify("Aligned new sequences together") 2027 notify("Aligned new sequences together")
2008 2028
2029 + # Detect doublons and remove them
2030 + existing_stk = AlignIO.parse(existing_ali_path, "stk")
2031 + existing_ids = [ r.id for r in existing_stk ]
2032 + del existing_stk
2033 + new_stk = AlignIO.parse(new_ali_path, "stk")
2034 + new_ids = [ r.id for r in new_stk ]
2035 + del new_stk
2036 + doublons = [ i for i in existing_ids if i in new_ids ]
2037 + del existing_ids, new_ids
2038 + if len(doublons):
2039 + warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
2040 + with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
2041 + toremove.write('\n'.join(doublons)+'\n')
2042 + p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path],
2043 + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2044 + os.remove(path_to_seq_data + "realigned/toremove.txt")
2045 +
2009 # And we merge the two alignments 2046 # And we merge the two alignments
2010 - p2= subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", "--rna", 2047 + p2= subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
2011 - path_to_seq_data + f"realigned/{rfam_acc}++.stk", 2048 + "--rna", existing_ali_path, new_ali_path ],
2012 - path_to_seq_data + f"realigned/{rfam_acc}_new.stk" ],
2013 stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) 2049 stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2014 stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8') 2050 stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
2015 - subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) 2051 + subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
2016 notify("Merged alignments into one") 2052 notify("Merged alignments into one")
2017 2053
2018 # remove the partial files 2054 # remove the partial files
2019 - os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.stk") 2055 + os.remove(new_ali_path)
2020 os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa") 2056 os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa")
2021 2057
2022 else: 2058 else:
...@@ -2041,7 +2077,7 @@ def work_realign(rfam_acc): ...@@ -2041,7 +2077,7 @@ def work_realign(rfam_acc):
2041 2077
2042 # Convert Stockholm to aligned FASTA 2078 # Convert Stockholm to aligned FASTA
2043 subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", "--informat", "stockholm", "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) 2079 subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", "--informat", "stockholm", "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
2044 - subprocess.run(["rm", "-f", "esltmp*"]) 2080 + subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part.
2045 2081
2046 # Assert everything worked, or save an error 2082 # Assert everything worked, or save an error
2047 with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa") as output: 2083 with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa") as output:
...@@ -2248,6 +2284,8 @@ if __name__ == "__main__": ...@@ -2248,6 +2284,8 @@ if __name__ == "__main__":
2248 print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).") 2284 print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).")
2249 if len(no_nts_set): 2285 if len(no_nts_set):
2250 print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True) 2286 print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True)
2287 + if len(weird_mappings):
2288 + print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True)
2251 pp.checkpoint_save_chains() 2289 pp.checkpoint_save_chains()
2252 2290
2253 if not pp.HOMOLOGY: 2291 if not pp.HOMOLOGY:
...@@ -2280,7 +2318,6 @@ if __name__ == "__main__": ...@@ -2280,7 +2318,6 @@ if __name__ == "__main__":
2280 pp.prepare_sequences() 2318 pp.prepare_sequences()
2281 pp.realign() 2319 pp.realign()
2282 2320
2283 -
2284 # At this point, the family table is up to date 2321 # At this point, the family table is up to date
2285 2322
2286 thr_idx_mgr = Manager() 2323 thr_idx_mgr = Manager()
......