Louis BECQUEY

Distance matrices of 3D data

...@@ -6,14 +6,17 @@ import threading as th ...@@ -6,14 +6,17 @@ import threading as th
6 import seaborn as sb 6 import seaborn as sb
7 import scipy.stats as st 7 import scipy.stats as st
8 import matplotlib.pyplot as plt 8 import matplotlib.pyplot as plt
9 -import matplotlib.patches as ptch 9 +import pylab
10 +import scipy.cluster.hierarchy as sch
11 +from scipy.spatial.distance import squareform
10 from mpl_toolkits.mplot3d import axes3d 12 from mpl_toolkits.mplot3d import axes3d
11 from Bio.Phylo.TreeConstruction import DistanceCalculator 13 from Bio.Phylo.TreeConstruction import DistanceCalculator
12 -from Bio import AlignIO 14 +from Bio import AlignIO, SeqIO
13 from matplotlib import cm 15 from matplotlib import cm
14 from tqdm import tqdm 16 from tqdm import tqdm
15 from functools import partial 17 from functools import partial
16 from multiprocessing import Pool 18 from multiprocessing import Pool
19 +from os import path
17 from RNAnet import read_cpu_number 20 from RNAnet import read_cpu_number
18 21
19 22
...@@ -182,14 +185,52 @@ def stats_len(mappings_list, points): ...@@ -182,14 +185,52 @@ def stats_len(mappings_list, points):
182 # ax.text(600,150, str(len([ x for x in lengths[f] if x != np.NaN ])), fontsize=14) 185 # ax.text(600,150, str(len([ x for x in lengths[f] if x != np.NaN ])), fontsize=14)
183 plt.savefig("results/full_length_distribs.png") 186 plt.savefig("results/full_length_distribs.png")
184 187
185 -def seq_idty(mappings_list): 188 +def to_dist_matrix(f):
186 - idty_matrix = {} 189 + print(f)
187 dm = DistanceCalculator('identity') 190 dm = DistanceCalculator('identity')
188 - for f in mappings_list.keys(): 191 + with open(path_to_seq_data+"realigned/"+f+"++.afa") as al_file:
189 - with open(path_to_seq_data+"realigned/"+f+"++.stk") as al_file: 192 + al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):]
190 - al = AlignIO.read(al_file, "stockholm") 193 + idty = dm.get_distance(al).matrix # list of lists
191 - idty_matrix[f] = dm.get_distance(al) 194 + del al
195 + l = len(idty)
196 + np.save("data/"+f+".npy", np.array([ idty[i] + [0]*(l-1-i) if i<l-1 else idty[i] for i in range(l) ]))
197 + del idty
198 + return 0
192 199
200 +def seq_idty(mappings_list):
201 + fam_arrays = []
202 + for f in sorted(mappings_list.keys()):
203 + if path.isfile("data/"+f+".npy"):
204 + fam_arrays.append(np.load("data/"+f+".npy"))
205 + else:
206 + # to_dist_matrix(f)
207 + # fam_arrays.append(np.load("data/"+f+".npy"))
208 + fam_arrays.append([])
209 +
210 + fig, axs = plt.subplots(11,7, figsize=(25,25))
211 + axs = axs.ravel()
212 + [axi.set_axis_off() for axi in axs]
213 + for f, D, ax in zip(sorted(mappings_list.keys()), fam_arrays, axs):
214 + if not len(D): continue
215 + if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize
216 + D = D + D.T # Copy the lower triangle to upper, to get a symetrical matrix
217 + condensedD = squareform(D)
218 +
219 + # Compute basic dendrogram by Ward's method
220 + Y = sch.linkage(condensedD, method='ward')
221 + Z = sch.dendrogram(Y, orientation='left', no_plot=True)
222 +
223 + # Reorganize rows and cols
224 + idx1 = Z['leaves']
225 + D = D[idx1,:]
226 + D = D[:,idx1[::-1]]
227 + im = ax.matshow(D, vmin=0, vmax=1, origin='lower')
228 + ax.set_title(f)
229 + fig.suptitle("Distance matrices of sequences from various families\nclustered by sequence identity (Ward's method)", fontsize="18")
230 + fig.tight_layout()
231 + fig.subplots_adjust(top=0.92)
232 + fig.colorbar(im, ax=axs.tolist(), shrink=0.98)
233 + fig.savefig(f"results/distances.png")
193 234
194 235
195 236
...@@ -205,24 +246,20 @@ if __name__ == "__main__": ...@@ -205,24 +246,20 @@ if __name__ == "__main__":
205 mappings_list = pd.read_csv(path_to_seq_data + "realigned/mappings_list.csv", sep=',', index_col=0).to_dict(orient='list') 246 mappings_list = pd.read_csv(path_to_seq_data + "realigned/mappings_list.csv", sep=',', index_col=0).to_dict(orient='list')
206 for k in mappings_list.keys(): 247 for k in mappings_list.keys():
207 mappings_list[k] = [ x for x in mappings_list[k] if str(x) != 'nan' ] 248 mappings_list[k] = [ x for x in mappings_list[k] if str(x) != 'nan' ]
208 - print(mappings_list) 249 +
209 - exit() 250 + # print("Loading datapoints from file...")
210 - 251 + # rna_points = []
211 - 252 + # filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") if ".log" not in f and ".gz" not in f]
212 - print("Loading datapoints from file...") 253 + # p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=read_cpu_number())
213 - rna_points = [] 254 + # pbar = tqdm(total=len(filelist), desc="RNA files", position=0, leave=True)
214 - filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") if ".log" not in f and ".gz" not in f] 255 + # for i, rna in enumerate(p.imap_unordered(load_rna_frome_file, filelist)):
215 - p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=read_cpu_number()) 256 + # rna_points.append(rna)
216 - pbar = tqdm(total=len(filelist), desc="RNA files", position=0, leave=True) 257 + # pbar.update(1)
217 - for i, rna in enumerate(p.imap_unordered(load_rna_frome_file, filelist)): 258 + # pbar.close()
218 - rna_points.append(rna) 259 + # p.close()
219 - pbar.update(1) 260 + # p.join()
220 - pbar.close() 261 + # npoints = len(rna_points)
221 - p.close() 262 + # print(npoints, "RNA files loaded.")
222 - p.join()
223 - npoints = len(rna_points)
224 - print(npoints, "RNA files loaded.")
225 - exit()
226 263
227 ################################################################# 264 #################################################################
228 # Define threads for the tasks 265 # Define threads for the tasks
......