Showing
1 changed file
with
18 additions
and
26 deletions
... | @@ -216,7 +216,7 @@ class Chain: | ... | @@ -216,7 +216,7 @@ class Chain: |
216 | else: | 216 | else: |
217 | t = float(nt["theta_prime"]) | 217 | t = float(nt["theta_prime"]) |
218 | p = int(nt["nt_resnum"]) - resnum_start | 218 | p = int(nt["nt_resnum"]) - resnum_start |
219 | - mask[p] = int(nt["nt_code"] in "ACGU") # U is a U, u is a modified U | 219 | + mask[p] = int(nt["nt_code"] in "ACGU") # U is a U, u is a modified U and should be replaced by consensus ? |
220 | seq[p] = nt["nt_code"].upper() # to align the chain with its family. The final nt should be taken from the PSSM. | 220 | seq[p] = nt["nt_code"].upper() # to align the chain with its family. The final nt should be taken from the PSSM. |
221 | eta[p] = e | 221 | eta[p] = e |
222 | theta[p] = t | 222 | theta[p] = t |
... | @@ -826,7 +826,7 @@ def alignment_nt_stats(f): | ... | @@ -826,7 +826,7 @@ def alignment_nt_stats(f): |
826 | # here we try to map c.seq (the sequence of the 3D chain, including gaps when residues are missing), | 826 | # here we try to map c.seq (the sequence of the 3D chain, including gaps when residues are missing), |
827 | # with s.seq, the sequence aligned in the MSA, containing any of ACGUacguP and two types of gaps, - and . | 827 | # with s.seq, the sequence aligned in the MSA, containing any of ACGUacguP and two types of gaps, - and . |
828 | 828 | ||
829 | - if c.seq[i] == s[j].upper(): # alignment and sequence correspond (incl. gaps) | 829 | + if c.seq[i] == s.seq[j].upper(): # alignment and sequence correspond (incl. gaps) |
830 | rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1) | 830 | rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1) |
831 | i += 1 | 831 | i += 1 |
832 | j += 1 | 832 | j += 1 |
... | @@ -834,7 +834,7 @@ def alignment_nt_stats(f): | ... | @@ -834,7 +834,7 @@ def alignment_nt_stats(f): |
834 | 834 | ||
835 | # search for a gap to the consensus nearby | 835 | # search for a gap to the consensus nearby |
836 | k = 0 | 836 | k = 0 |
837 | - while j+k<alilen and s.seq[j+k] not in ['A','C','G','U','a','c','g','u','P']: | 837 | + while j+k<alilen and s.seq[j+k] in ['.','-']: |
838 | if s.seq[j+k] == '-': | 838 | if s.seq[j+k] == '-': |
839 | break | 839 | break |
840 | k += 1 | 840 | k += 1 |
... | @@ -845,15 +845,7 @@ def alignment_nt_stats(f): | ... | @@ -845,15 +845,7 @@ def alignment_nt_stats(f): |
845 | continue | 845 | continue |
846 | 846 | ||
847 | # if not, search for a insertion gap nearby | 847 | # if not, search for a insertion gap nearby |
848 | - k = 0 | 848 | + if j<alilen and s.seq[j] == '.': |
849 | - while j+k<alilen and s.seq[j+k] not in ['A','C','G','U','a','c','g','u','P']: | ||
850 | - if s.seq[j+k] == '.': | ||
851 | - break | ||
852 | - k += 1 | ||
853 | - | ||
854 | - # if found, set j to that position | ||
855 | - if j+k<alilen and s.seq[j+k] == '.': | ||
856 | - j = j + k | ||
857 | rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1) | 849 | rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1) |
858 | i += 1 | 850 | i += 1 |
859 | j += 1 | 851 | j += 1 |
... | @@ -866,7 +858,8 @@ def alignment_nt_stats(f): | ... | @@ -866,7 +858,8 @@ def alignment_nt_stats(f): |
866 | elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain | 858 | elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain |
867 | j += 1 # ignore the column | 859 | j += 1 # ignore the column |
868 | else: | 860 | else: |
869 | - print("You are never supposed to reach this.", c.seq, '\n', s.seq, sep='', flush=True) | 861 | + print(f"You are never supposed to reach this. Comparing {c.chain_label} in {i} ({c.seq[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).\n", c.seq, '\n', s.seq, sep='', flush=True) |
862 | + exit(1) | ||
870 | if warn_gaps: | 863 | if warn_gaps: |
871 | warn(f"Some gap(s) in {c.chain_label} were not re-found in the aligned sequence... Ignoring them.") | 864 | warn(f"Some gap(s) in {c.chain_label} were not re-found in the aligned sequence... Ignoring them.") |
872 | 865 | ||
... | @@ -886,11 +879,10 @@ def alignment_nt_stats(f): | ... | @@ -886,11 +879,10 @@ def alignment_nt_stats(f): |
886 | point[0,i] = i+1 # position | 879 | point[0,i] = i+1 # position |
887 | 880 | ||
888 | # one-hot encoding of the actual sequence | 881 | # one-hot encoding of the actual sequence |
889 | - point[1,i] = int(c.seq[i] == 'A') | 882 | + if c.seq[i] in letters[:4]: |
890 | - point[2,i] = int(c.seq[i] == 'C') | 883 | + point[ letters[:4].index(c.seq[i]), i ] = 1 |
891 | - point[3,i] = int(c.seq[i] == 'G') | 884 | + else: |
892 | - point[4,i] = int(c.seq[i] == 'U') | 885 | + point[5,i] = 1 |
893 | - point[5,i] = int(c.seq[i] not in ['A', 'C', 'G', 'U']) | ||
894 | 886 | ||
895 | # save the PSSMs | 887 | # save the PSSMs |
896 | point[6,i] = c.frequencies[0, i] | 888 | point[6,i] = c.frequencies[0, i] |
... | @@ -900,7 +892,7 @@ def alignment_nt_stats(f): | ... | @@ -900,7 +892,7 @@ def alignment_nt_stats(f): |
900 | point[10,i] = c.frequencies[4, i] | 892 | point[10,i] = c.frequencies[4, i] |
901 | point[11,i] = c.etas[i] | 893 | point[11,i] = c.etas[i] |
902 | point[12,i] = c.thetas[i] | 894 | point[12,i] = c.thetas[i] |
903 | - file = open(path_to_3D_data + "datapoints/" + c.chain_label, "w") | 895 | + file = open(path_to_3D_data + "datapoints/" + c.chain_label, "w") # no check, always overwrite, this is desired |
904 | for i in range(13): | 896 | for i in range(13): |
905 | line = [str(x) for x in list(point[i,:]) ] | 897 | line = [str(x) for x in list(point[i,:]) ] |
906 | file.write(','.join(line)+'\n') | 898 | file.write(','.join(line)+'\n') |
... | @@ -1010,11 +1002,11 @@ if __name__ == "__main__": | ... | @@ -1010,11 +1002,11 @@ if __name__ == "__main__": |
1010 | os.makedirs(path_to_3D_data + "datapoints/") | 1002 | os.makedirs(path_to_3D_data + "datapoints/") |
1011 | print("Computing nucleotide frequencies in alignments...") | 1003 | print("Computing nucleotide frequencies in alignments...") |
1012 | families = sorted([f for f in rfam_acc_to_download.keys() if f not in ["RF02543"]]) | 1004 | families = sorted([f for f in rfam_acc_to_download.keys() if f not in ["RF02543"]]) |
1013 | - # pool = Pool(processes=1, maxtasksperchild=10) | 1005 | + pool = Pool(processes=1, maxtasksperchild=10) |
1014 | - # results = pool.map(alignment_nt_stats, families) | 1006 | + results = pool.map(alignment_nt_stats, families) |
1015 | - # pool.close() | 1007 | + pool.close() |
1016 | - # pool.join() | 1008 | + pool.join() |
1017 | - # loaded_chains = list(itertools.chain.from_iterable(results)) | 1009 | + loaded_chains = list(itertools.chain.from_iterable(results)) |
1018 | - for f in families: | 1010 | + # for f in families: |
1019 | - alignment_nt_stats(f) | 1011 | + # alignment_nt_stats(f) |
1020 | print("Completed.") | 1012 | print("Completed.") | ... | ... |
-
Please register or login to post a comment