Louis BECQUEY

Correct alignment of mappings

Showing 1 changed file with 66 additions and 31 deletions
...@@ -723,7 +723,11 @@ def cm_realign(rfam_acc, chains, label): ...@@ -723,7 +723,11 @@ def cm_realign(rfam_acc, chains, label):
723 f.write(">"+record.description+'\n'+str(record.seq)+'\n') 723 f.write(">"+record.description+'\n'+str(record.seq)+'\n')
724 ids.append(record.id) 724 ids.append(record.id)
725 for c in chains: 725 for c in chains:
726 - f.write(f"> {str(c)}\n"+c.seq.replace('U','T').replace('-','')+'\n') # We align as DNA 726 + seq_str = c.seq.replace('U','T').replace('-','') # We align as DNA
727 + if rfam_acc in ["RF02541", "RF02543"]:
728 + seq_str = seq_str.replace('P','U') # Replace pseudo-uridines by uridines. We lose information here but SINA does not accept them.
729 + f.write(f"> {str(c)}\n"+seq_str+'\n')
730 +
727 f.close() 731 f.close()
728 732
729 if rfam_acc not in ["RF02541", "RF02543"]: 733 if rfam_acc not in ["RF02541", "RF02543"]:
...@@ -817,52 +821,83 @@ def alignment_nt_stats(f): ...@@ -817,52 +821,83 @@ def alignment_nt_stats(f):
817 # Save colums in the appropriate positions 821 # Save colums in the appropriate positions
818 i = 0 822 i = 0
819 j = 0 823 j = 0
820 - while i<len(c.seq) and j<alilen: 824 + warn_gaps = False
825 + while i<c.full_length and j<alilen:
826 + # here we try to map c.seq (the sequence of the 3D chain, including gaps when residues are missing),
827 + # with s.seq, the sequence aligned in the MSA, containing any of ACGUacguP and two types of gaps, - and .
828 +
821 if c.seq[i] == s[j].upper(): # alignment and sequence correspond (incl. gaps) 829 if c.seq[i] == s[j].upper(): # alignment and sequence correspond (incl. gaps)
822 rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1) 830 rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1)
823 i += 1 831 i += 1
824 j += 1 832 j += 1
825 - elif s[j] in ['.', '-']: # gap in the alignment, but not in the real chain 833 + elif c.seq[i] == '-': # gap in the chain, but not in the aligned sequence
826 - j += 1 # ignore the column 834 +
827 - elif c.seq[i] == '-': # gap in the chain, but the sequence aligns well... 835 + # search for a gap to the consensus nearby
828 - warn(f"gap in {c.chain_label} not re-found in the aligned sequence... Ignoring it.") 836 + k = 0
837 + while j+k<alilen and s.seq[j+k] not in ['A','C','G','U','a','c','g','u','P']:
838 + if s.seq[j+k] == '-':
839 + break
840 + k += 1
841 +
842 + # if found, set j to that position
843 + if j+k<alilen and s.seq[j+k] == '-':
844 + j = j + k
845 + continue
846 +
847 + # if not, search for a insertion gap nearby
848 + k = 0
849 + while j+k<alilen and s.seq[j+k] not in ['A','C','G','U','a','c','g','u','P']:
850 + if s.seq[j+k] == '.':
851 + break
852 + k += 1
853 +
854 + # if found, set j to that position
855 + if j+k<alilen and s.seq[j+k] == '.':
856 + j = j + k
857 + rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1)
858 + i += 1
859 + j += 1
860 + continue
861 +
862 + # else, just ignore the gap.
863 + warn_gaps = True
829 rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, np.array([0.0,0.0,0.0,0.0,1.0]).reshape(-1,1)), axis=1) 864 rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, np.array([0.0,0.0,0.0,0.0,1.0]).reshape(-1,1)), axis=1)
830 i += 1 865 i += 1
866 + elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain
867 + j += 1 # ignore the column
831 else: 868 else:
832 - print("You are never supposed to reach this:", c.seq, '\n', s, flush=True) 869 + print("You are never supposed to reach this.", c.seq, '\n', s.seq, sep='', flush=True)
833 - 870 + if warn_gaps:
871 + warn(f"Some gap(s) in {c.chain_label} were not re-found in the aligned sequence... Ignoring them.")
872 +
834 # Replace masked positions by the consensus sequence: 873 # Replace masked positions by the consensus sequence:
835 - s = c.seq.split() 874 + c_seq = c.seq.split()
836 letters = ['A', 'C', 'G', 'U', 'N'] 875 letters = ['A', 'C', 'G', 'U', 'N']
837 - for i in range(len(s)): 876 + for i in range(c.full_length):
838 if not c.mask[i]: 877 if not c.mask[i]:
839 freq = rfam_acc_to_download[f][idx].frequencies[:,i] 878 freq = rfam_acc_to_download[f][idx].frequencies[:,i]
840 - s[i] = letters[freq.tolist().index(max(freq))] 879 + c_seq[i] = letters[freq.tolist().index(max(freq))]
841 - rfam_acc_to_download[f][idx].seq = ''.join(s) 880 + rfam_acc_to_download[f][idx].seq = ''.join(c_seq)
842 881
843 # Saving 'final' datapoint 882 # Saving 'final' datapoint
883 + c = rfam_acc_to_download[f][idx] # update the local object
844 point = np.zeros((13, c.full_length)) 884 point = np.zeros((13, c.full_length))
845 - gaps = 0
846 for i in range(c.full_length): 885 for i in range(c.full_length):
847 point[0,i] = i+1 # position 886 point[0,i] = i+1 # position
848 - if c.mask[i]: # the ith nucleotide exists 887 +
849 - 888 + # one-hot encoding of the actual sequence
850 - # one-hot encoding of the actual sequence 889 + point[1,i] = int(c.seq[i] == 'A')
851 - point[1,i] = int(c.seq[i-gaps] == 'A') 890 + point[2,i] = int(c.seq[i] == 'C')
852 - point[2,i] = int(c.seq[i-gaps] == 'C') 891 + point[3,i] = int(c.seq[i] == 'G')
853 - point[3,i] = int(c.seq[i-gaps] == 'G') 892 + point[4,i] = int(c.seq[i] == 'U')
854 - point[4,i] = int(c.seq[i-gaps] == 'U') 893 + point[5,i] = int(c.seq[i] not in ['A', 'C', 'G', 'U'])
855 - point[5,i] = int(c.seq[i-gaps] not in ['A', 'C', 'G', 'U']) 894 +
856 - 895 + # save the PSSMs
857 - # save the PSSMs 896 + point[6,i] = c.frequencies[0, i]
858 - point[6,i] = c.frequencies[0, i-gaps] 897 + point[7,i] = c.frequencies[1, i]
859 - point[7,i] = c.frequencies[1, i-gaps] 898 + point[8,i] = c.frequencies[2, i]
860 - point[8,i] = c.frequencies[2, i-gaps] 899 + point[9,i] = c.frequencies[3, i]
861 - point[9,i] = c.frequencies[3, i-gaps] 900 + point[10,i] = c.frequencies[4, i]
862 - point[10,i] = c.frequencies[4, i-gaps]
863 - else:
864 - gaps += 1
865 - point[5,i] = 1.0
866 point[11,i] = c.etas[i] 901 point[11,i] = c.etas[i]
867 point[12,i] = c.thetas[i] 902 point[12,i] = c.thetas[i]
868 file = open(path_to_3D_data + "datapoints/" + c.chain_label, "w") 903 file = open(path_to_3D_data + "datapoints/" + c.chain_label, "w")
......