Louis BECQUEY

better mapping of the gaps onto the alignments

Showing 1 changed file with 18 additions and 26 deletions
...@@ -216,7 +216,7 @@ class Chain: ...@@ -216,7 +216,7 @@ class Chain:
216 else: 216 else:
217 t = float(nt["theta_prime"]) 217 t = float(nt["theta_prime"])
218 p = int(nt["nt_resnum"]) - resnum_start 218 p = int(nt["nt_resnum"]) - resnum_start
219 - mask[p] = int(nt["nt_code"] in "ACGU") # U is a U, u is a modified U 219 + mask[p] = int(nt["nt_code"] in "ACGU") # U is a U, u is a modified U and should be replaced by consensus ?
220 seq[p] = nt["nt_code"].upper() # to align the chain with its family. The final nt should be taken from the PSSM. 220 seq[p] = nt["nt_code"].upper() # to align the chain with its family. The final nt should be taken from the PSSM.
221 eta[p] = e 221 eta[p] = e
222 theta[p] = t 222 theta[p] = t
...@@ -826,7 +826,7 @@ def alignment_nt_stats(f): ...@@ -826,7 +826,7 @@ def alignment_nt_stats(f):
826 # here we try to map c.seq (the sequence of the 3D chain, including gaps when residues are missing), 826 # here we try to map c.seq (the sequence of the 3D chain, including gaps when residues are missing),
827 # with s.seq, the sequence aligned in the MSA, containing any of ACGUacguP and two types of gaps, - and . 827 # with s.seq, the sequence aligned in the MSA, containing any of ACGUacguP and two types of gaps, - and .
828 828
829 - if c.seq[i] == s[j].upper(): # alignment and sequence correspond (incl. gaps) 829 + if c.seq[i] == s.seq[j].upper(): # alignment and sequence correspond (incl. gaps)
830 rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1) 830 rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1)
831 i += 1 831 i += 1
832 j += 1 832 j += 1
...@@ -834,7 +834,7 @@ def alignment_nt_stats(f): ...@@ -834,7 +834,7 @@ def alignment_nt_stats(f):
834 834
835 # search for a gap to the consensus nearby 835 # search for a gap to the consensus nearby
836 k = 0 836 k = 0
837 - while j+k<alilen and s.seq[j+k] not in ['A','C','G','U','a','c','g','u','P']: 837 + while j+k<alilen and s.seq[j+k] in ['.','-']:
838 if s.seq[j+k] == '-': 838 if s.seq[j+k] == '-':
839 break 839 break
840 k += 1 840 k += 1
...@@ -845,15 +845,7 @@ def alignment_nt_stats(f): ...@@ -845,15 +845,7 @@ def alignment_nt_stats(f):
845 continue 845 continue
846 846
847 # if not, search for a insertion gap nearby 847 # if not, search for a insertion gap nearby
848 - k = 0 848 + if j<alilen and s.seq[j] == '.':
849 - while j+k<alilen and s.seq[j+k] not in ['A','C','G','U','a','c','g','u','P']:
850 - if s.seq[j+k] == '.':
851 - break
852 - k += 1
853 -
854 - # if found, set j to that position
855 - if j+k<alilen and s.seq[j+k] == '.':
856 - j = j + k
857 rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1) 849 rfam_acc_to_download[f][idx].frequencies = np.concatenate((rfam_acc_to_download[f][idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1)
858 i += 1 850 i += 1
859 j += 1 851 j += 1
...@@ -866,7 +858,8 @@ def alignment_nt_stats(f): ...@@ -866,7 +858,8 @@ def alignment_nt_stats(f):
866 elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain 858 elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain
867 j += 1 # ignore the column 859 j += 1 # ignore the column
868 else: 860 else:
869 - print("You are never supposed to reach this.", c.seq, '\n', s.seq, sep='', flush=True) 861 + print(f"You are never supposed to reach this. Comparing {c.chain_label} in {i} ({c.seq[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).\n", c.seq, '\n', s.seq, sep='', flush=True)
862 + exit(1)
870 if warn_gaps: 863 if warn_gaps:
871 warn(f"Some gap(s) in {c.chain_label} were not re-found in the aligned sequence... Ignoring them.") 864 warn(f"Some gap(s) in {c.chain_label} were not re-found in the aligned sequence... Ignoring them.")
872 865
...@@ -886,11 +879,10 @@ def alignment_nt_stats(f): ...@@ -886,11 +879,10 @@ def alignment_nt_stats(f):
886 point[0,i] = i+1 # position 879 point[0,i] = i+1 # position
887 880
888 # one-hot encoding of the actual sequence 881 # one-hot encoding of the actual sequence
889 - point[1,i] = int(c.seq[i] == 'A') 882 + if c.seq[i] in letters[:4]:
890 - point[2,i] = int(c.seq[i] == 'C') 883 + point[ letters[:4].index(c.seq[i]), i ] = 1
891 - point[3,i] = int(c.seq[i] == 'G') 884 + else:
892 - point[4,i] = int(c.seq[i] == 'U') 885 + point[5,i] = 1
893 - point[5,i] = int(c.seq[i] not in ['A', 'C', 'G', 'U'])
894 886
895 # save the PSSMs 887 # save the PSSMs
896 point[6,i] = c.frequencies[0, i] 888 point[6,i] = c.frequencies[0, i]
...@@ -900,7 +892,7 @@ def alignment_nt_stats(f): ...@@ -900,7 +892,7 @@ def alignment_nt_stats(f):
900 point[10,i] = c.frequencies[4, i] 892 point[10,i] = c.frequencies[4, i]
901 point[11,i] = c.etas[i] 893 point[11,i] = c.etas[i]
902 point[12,i] = c.thetas[i] 894 point[12,i] = c.thetas[i]
903 - file = open(path_to_3D_data + "datapoints/" + c.chain_label, "w") 895 + file = open(path_to_3D_data + "datapoints/" + c.chain_label, "w") # no check, always overwrite, this is desired
904 for i in range(13): 896 for i in range(13):
905 line = [str(x) for x in list(point[i,:]) ] 897 line = [str(x) for x in list(point[i,:]) ]
906 file.write(','.join(line)+'\n') 898 file.write(','.join(line)+'\n')
...@@ -1010,11 +1002,11 @@ if __name__ == "__main__": ...@@ -1010,11 +1002,11 @@ if __name__ == "__main__":
1010 os.makedirs(path_to_3D_data + "datapoints/") 1002 os.makedirs(path_to_3D_data + "datapoints/")
1011 print("Computing nucleotide frequencies in alignments...") 1003 print("Computing nucleotide frequencies in alignments...")
1012 families = sorted([f for f in rfam_acc_to_download.keys() if f not in ["RF02543"]]) 1004 families = sorted([f for f in rfam_acc_to_download.keys() if f not in ["RF02543"]])
1013 - # pool = Pool(processes=1, maxtasksperchild=10) 1005 + pool = Pool(processes=1, maxtasksperchild=10)
1014 - # results = pool.map(alignment_nt_stats, families) 1006 + results = pool.map(alignment_nt_stats, families)
1015 - # pool.close() 1007 + pool.close()
1016 - # pool.join() 1008 + pool.join()
1017 - # loaded_chains = list(itertools.chain.from_iterable(results)) 1009 + loaded_chains = list(itertools.chain.from_iterable(results))
1018 - for f in families: 1010 + # for f in families:
1019 - alignment_nt_stats(f) 1011 + # alignment_nt_stats(f)
1020 print("Completed.") 1012 print("Completed.")
......