Louis BECQUEY

investigating alignment errors


Former-commit-id: a95cfdfe
Showing 1 changed file with 23 additions and 7 deletions
...@@ -2593,20 +2593,32 @@ def use_infernal(rfam_acc, alignopts): ...@@ -2593,20 +2593,32 @@ def use_infernal(rfam_acc, alignopts):
2593 new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk" 2593 new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk"
2594 2594
2595 # Align the new sequences 2595 # Align the new sequences
2596 - with open(new_ali_path, 'w') as o: 2596 + with open(path_to_seq_data + f"realigned/{rfam_acc}_new.log", 'w') as o:
2597 p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", 2597 p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
2598 "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", 2598 "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
2599 - "-o", path_to_seq_data + f"realigned/{rfam_acc}++.stk", 2599 + "-o", path_to_seq_data + f"realigned/{rfam_acc}_new.stk",
2600 path_to_seq_data + f"realigned/{rfam_acc}.cm", 2600 path_to_seq_data + f"realigned/{rfam_acc}.cm",
2601 path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], 2601 path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
2602 stdout=o, stderr=subprocess.PIPE) 2602 stdout=o, stderr=subprocess.PIPE)
2603 + if "--mxsize" in p1.stderr.decode("utf-8"):
2604 + # not enough available RAM to allocate the DP matrix
2605 + warn(f"Not enough RAM to allocate cmalign DP matrix for family {rfam_acc}. Use --sina or --cmalign-opts.", error=True)
2603 notify("Aligned new sequences together") 2606 notify("Aligned new sequences together")
2604 2607
2605 # Detect doublons and remove them 2608 # Detect doublons and remove them
2606 - existing_stk = AlignIO.read(existing_ali_path, "stockholm") 2609 + try:
2610 + existing_stk = AlignIO.read(existing_ali_path, "stockholm")
2611 + except ValueError:
2612 + # Not a stockholm file
2613 + warn(f"Existing alignment is not a Stockholm file !", error=True)
2614 + return
2607 existing_ids = [r.id for r in existing_stk] 2615 existing_ids = [r.id for r in existing_stk]
2608 del existing_stk 2616 del existing_stk
2609 - new_stk = AlignIO.read(new_ali_path, "stockholm") 2617 + try:
2618 + new_stk = AlignIO.read(new_ali_path, "stockholm")
2619 + except ValueError:
2620 + # Not a stockholm file
2621 + warn(f"New alignment {new_ali_path} is not a Stockholm file !", error=True)
2610 new_ids = [r.id for r in new_stk] 2622 new_ids = [r.id for r in new_stk]
2611 del new_stk 2623 del new_stk
2612 doublons = [i for i in existing_ids if i in new_ids] 2624 doublons = [i for i in existing_ids if i in new_ids]
...@@ -2622,10 +2634,14 @@ def use_infernal(rfam_acc, alignopts): ...@@ -2622,10 +2634,14 @@ def use_infernal(rfam_acc, alignopts):
2622 os.remove(path_to_seq_data + "realigned/toremove.txt") 2634 os.remove(path_to_seq_data + "realigned/toremove.txt")
2623 2635
2624 # And we merge the two alignments 2636 # And we merge the two alignments
2625 - p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", 2637 + p2 = subprocess.run(["cmalign", "--merge" "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
2626 - "--rna", existing_ali_path, new_ali_path], 2638 + "--rna", path_to_seq_data + f"realigned/{rfam_acc}.cm", existing_ali_path, new_ali_path],
2627 stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) 2639 stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2628 - stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8') 2640 + alignErrors = p1.stderr.decode('utf-8')
2641 + mergeErrors = p2.stderr.decode('utf-8')
2642 + alignErrors = "Alignment: "+ alignErrors if len(alignErrors) else ""
2643 + mergeErrors = "Alignment: "+ mergeErrors if len(mergeErrors) else ""
2644 + stderr = alignErrors + mergeErrors
2629 subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path]) 2645 subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
2630 notify("Merged alignments into one") 2646 notify("Merged alignments into one")
2631 2647
......