Showing
1 changed file
with
66 additions
and
31 deletions
... | @@ -723,7 +723,11 @@ def cm_realign(rfam_acc, chains, label): | ... | @@ -723,7 +723,11 @@ def cm_realign(rfam_acc, chains, label): |
723 | f.write(">"+record.description+'\n'+str(record.seq)+'\n') | 723 | f.write(">"+record.description+'\n'+str(record.seq)+'\n') |
724 | ids.append(record.id) | 724 | ids.append(record.id) |
725 | for c in chains: | 725 | for c in chains: |
726 | - f.write(f"> {str(c)}\n"+c.seq.replace('U','T').replace('-','')+'\n') # We align as DNA | 726 | + seq_str = c.seq.replace('U','T').replace('-','') # We align as DNA |
727 | + if rfam_acc in ["RF02541", "RF02543"]: | ||
728 | + seq_str = seq_str.replace('P','U') # Replace pseudo-uridines by uridines. We lose information here but SINA does not accept them. | ||
729 | + f.write(f"> {str(c)}\n"+seq_str+'\n') | ||
730 | + | ||
727 | f.close() | 731 | f.close() |
728 | 732 | ||
729 | if rfam_acc not in ["RF02541", "RF02543"]: | 733 | if rfam_acc not in ["RF02541", "RF02543"]: |
... | @@ -817,52 +821,83 @@ def alignment_nt_stats(f): | ... | @@ -817,52 +821,83 @@ def alignment_nt_stats(f): |
817 | # Save colums in the appropriate positions | 821 | # Save colums in the appropriate positions |
818 | i = 0 | 822 | i = 0 |
819 | j = 0 | 823 | j = 0 |
820 | - while i<len(c.seq) and j<alilen: | 824 | + warn_gaps = False |
825 | + while i<c.full_length and j<alilen: | ||
826 | + # here we try to map c.seq (the sequence of the 3D chain, including gaps when residues are missing), | ||
827 | + # with s.seq, the sequence aligned in the MSA, containing any of ACGUacguP and two types of gaps, - and . | ||
828 | + | ||
821 | if c.seq[i] == s[j].upper(): # alignment and sequence correspond (incl. gaps) | 829 | if c.seq[i] == s[j].upper(): # alignment and sequence correspond (incl. gaps) |
822 | rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1) | 830 | rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1) |
823 | i += 1 | 831 | i += 1 |
824 | j += 1 | 832 | j += 1 |
825 | - elif s[j] in ['.', '-']: # gap in the alignment, but not in the real chain | 833 | + elif c.seq[i] == '-': # gap in the chain, but not in the aligned sequence |
826 | - j += 1 # ignore the column | 834 | + |
827 | - elif c.seq[i] == '-': # gap in the chain, but the sequence aligns well... | 835 | + # search for a gap to the consensus nearby |
828 | - warn(f"gap in {c.chain_label} not re-found in the aligned sequence... Ignoring it.") | 836 | + k = 0 |
837 | + while j+k<alilen and s.seq[j+k] not in ['A','C','G','U','a','c','g','u','P']: | ||
838 | + if s.seq[j+k] == '-': | ||
839 | + break | ||
840 | + k += 1 | ||
841 | + | ||
842 | + # if found, set j to that position | ||
843 | + if j+k<alilen and s.seq[j+k] == '-': | ||
844 | + j = j + k | ||
845 | + continue | ||
846 | + | ||
847 | + # if not, search for a insertion gap nearby | ||
848 | + k = 0 | ||
849 | + while j+k<alilen and s.seq[j+k] not in ['A','C','G','U','a','c','g','u','P']: | ||
850 | + if s.seq[j+k] == '.': | ||
851 | + break | ||
852 | + k += 1 | ||
853 | + | ||
854 | + # if found, set j to that position | ||
855 | + if j+k<alilen and s.seq[j+k] == '.': | ||
856 | + j = j + k | ||
857 | + rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1) | ||
858 | + i += 1 | ||
859 | + j += 1 | ||
860 | + continue | ||
861 | + | ||
862 | + # else, just ignore the gap. | ||
863 | + warn_gaps = True | ||
829 | rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, np.array([0.0,0.0,0.0,0.0,1.0]).reshape(-1,1)), axis=1) | 864 | rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, np.array([0.0,0.0,0.0,0.0,1.0]).reshape(-1,1)), axis=1) |
830 | i += 1 | 865 | i += 1 |
866 | + elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain | ||
867 | + j += 1 # ignore the column | ||
831 | else: | 868 | else: |
832 | - print("You are never supposed to reach this:", c.seq, '\n', s, flush=True) | 869 | + print("You are never supposed to reach this.", c.seq, '\n', s.seq, sep='', flush=True) |
833 | - | 870 | + if warn_gaps: |
871 | + warn(f"Some gap(s) in {c.chain_label} were not re-found in the aligned sequence... Ignoring them.") | ||
872 | + | ||
834 | # Replace masked positions by the consensus sequence: | 873 | # Replace masked positions by the consensus sequence: |
835 | - s = c.seq.split() | 874 | + c_seq = c.seq.split() |
836 | letters = ['A', 'C', 'G', 'U', 'N'] | 875 | letters = ['A', 'C', 'G', 'U', 'N'] |
837 | - for i in range(len(s)): | 876 | + for i in range(c.full_length): |
838 | if not c.mask[i]: | 877 | if not c.mask[i]: |
839 | freq = rfam_acc_to_download[f][idx].frequencies[:,i] | 878 | freq = rfam_acc_to_download[f][idx].frequencies[:,i] |
840 | - s[i] = letters[freq.tolist().index(max(freq))] | 879 | + c_seq[i] = letters[freq.tolist().index(max(freq))] |
841 | - rfam_acc_to_download[f][idx].seq = ''.join(s) | 880 | + rfam_acc_to_download[f][idx].seq = ''.join(c_seq) |
842 | 881 | ||
843 | # Saving 'final' datapoint | 882 | # Saving 'final' datapoint |
883 | + c = rfam_acc_to_download[f][idx] # update the local object | ||
844 | point = np.zeros((13, c.full_length)) | 884 | point = np.zeros((13, c.full_length)) |
845 | - gaps = 0 | ||
846 | for i in range(c.full_length): | 885 | for i in range(c.full_length): |
847 | point[0,i] = i+1 # position | 886 | point[0,i] = i+1 # position |
848 | - if c.mask[i]: # the ith nucleotide exists | 887 | + |
849 | - | 888 | + # one-hot encoding of the actual sequence |
850 | - # one-hot encoding of the actual sequence | 889 | + point[1,i] = int(c.seq[i] == 'A') |
851 | - point[1,i] = int(c.seq[i-gaps] == 'A') | 890 | + point[2,i] = int(c.seq[i] == 'C') |
852 | - point[2,i] = int(c.seq[i-gaps] == 'C') | 891 | + point[3,i] = int(c.seq[i] == 'G') |
853 | - point[3,i] = int(c.seq[i-gaps] == 'G') | 892 | + point[4,i] = int(c.seq[i] == 'U') |
854 | - point[4,i] = int(c.seq[i-gaps] == 'U') | 893 | + point[5,i] = int(c.seq[i] not in ['A', 'C', 'G', 'U']) |
855 | - point[5,i] = int(c.seq[i-gaps] not in ['A', 'C', 'G', 'U']) | 894 | + |
856 | - | 895 | + # save the PSSMs |
857 | - # save the PSSMs | 896 | + point[6,i] = c.frequencies[0, i] |
858 | - point[6,i] = c.frequencies[0, i-gaps] | 897 | + point[7,i] = c.frequencies[1, i] |
859 | - point[7,i] = c.frequencies[1, i-gaps] | 898 | + point[8,i] = c.frequencies[2, i] |
860 | - point[8,i] = c.frequencies[2, i-gaps] | 899 | + point[9,i] = c.frequencies[3, i] |
861 | - point[9,i] = c.frequencies[3, i-gaps] | 900 | + point[10,i] = c.frequencies[4, i] |
862 | - point[10,i] = c.frequencies[4, i-gaps] | ||
863 | - else: | ||
864 | - gaps += 1 | ||
865 | - point[5,i] = 1.0 | ||
866 | point[11,i] = c.etas[i] | 901 | point[11,i] = c.etas[i] |
867 | point[12,i] = c.thetas[i] | 902 | point[12,i] = c.thetas[i] |
868 | file = open(path_to_3D_data + "datapoints/" + c.chain_label, "w") | 903 | file = open(path_to_3D_data + "datapoints/" + c.chain_label, "w") | ... | ... |
-
Please register or login to post a comment