Showing
2 changed files
with
63 additions
and
26 deletions
results/clusters_rot180.png
deleted
100644 → 0
![](/lbecquey/RNANetLegacy/raw/220d1a3ef35e9bafc8ee0b2fc0882b99a1a1db38/results/clusters_rot180.png)
399 KB
... | @@ -6,14 +6,17 @@ import threading as th | ... | @@ -6,14 +6,17 @@ import threading as th |
6 | import seaborn as sb | 6 | import seaborn as sb |
7 | import scipy.stats as st | 7 | import scipy.stats as st |
8 | import matplotlib.pyplot as plt | 8 | import matplotlib.pyplot as plt |
9 | -import matplotlib.patches as ptch | 9 | +import pylab |
10 | +import scipy.cluster.hierarchy as sch | ||
11 | +from scipy.spatial.distance import squareform | ||
10 | from mpl_toolkits.mplot3d import axes3d | 12 | from mpl_toolkits.mplot3d import axes3d |
11 | from Bio.Phylo.TreeConstruction import DistanceCalculator | 13 | from Bio.Phylo.TreeConstruction import DistanceCalculator |
12 | -from Bio import AlignIO | 14 | +from Bio import AlignIO, SeqIO |
13 | from matplotlib import cm | 15 | from matplotlib import cm |
14 | from tqdm import tqdm | 16 | from tqdm import tqdm |
15 | from functools import partial | 17 | from functools import partial |
16 | from multiprocessing import Pool | 18 | from multiprocessing import Pool |
19 | +from os import path | ||
17 | from RNAnet import read_cpu_number | 20 | from RNAnet import read_cpu_number |
18 | 21 | ||
19 | 22 | ||
... | @@ -182,14 +185,52 @@ def stats_len(mappings_list, points): | ... | @@ -182,14 +185,52 @@ def stats_len(mappings_list, points): |
182 | # ax.text(600,150, str(len([ x for x in lengths[f] if x != np.NaN ])), fontsize=14) | 185 | # ax.text(600,150, str(len([ x for x in lengths[f] if x != np.NaN ])), fontsize=14) |
183 | plt.savefig("results/full_length_distribs.png") | 186 | plt.savefig("results/full_length_distribs.png") |
184 | 187 | ||
185 | -def seq_idty(mappings_list): | 188 | +def to_dist_matrix(f): |
186 | - idty_matrix = {} | 189 | + print(f) |
187 | dm = DistanceCalculator('identity') | 190 | dm = DistanceCalculator('identity') |
188 | - for f in mappings_list.keys(): | 191 | + with open(path_to_seq_data+"realigned/"+f+"++.afa") as al_file: |
189 | - with open(path_to_seq_data+"realigned/"+f+"++.stk") as al_file: | 192 | + al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):] |
190 | - al = AlignIO.read(al_file, "stockholm") | 193 | + idty = dm.get_distance(al).matrix # list of lists |
191 | - idty_matrix[f] = dm.get_distance(al) | 194 | + del al |
195 | + l = len(idty) | ||
196 | + np.save("data/"+f+".npy", np.array([ idty[i] + [0]*(l-1-i) if i<l-1 else idty[i] for i in range(l) ])) | ||
197 | + del idty | ||
198 | + return 0 | ||
192 | 199 | ||
200 | +def seq_idty(mappings_list): | ||
201 | + fam_arrays = [] | ||
202 | + for f in sorted(mappings_list.keys()): | ||
203 | + if path.isfile("data/"+f+".npy"): | ||
204 | + fam_arrays.append(np.load("data/"+f+".npy")) | ||
205 | + else: | ||
206 | + # to_dist_matrix(f) | ||
207 | + # fam_arrays.append(np.load("data/"+f+".npy")) | ||
208 | + fam_arrays.append([]) | ||
209 | + | ||
210 | + fig, axs = plt.subplots(11,7, figsize=(25,25)) | ||
211 | + axs = axs.ravel() | ||
212 | + [axi.set_axis_off() for axi in axs] | ||
213 | + for f, D, ax in zip(sorted(mappings_list.keys()), fam_arrays, axs): | ||
214 | + if not len(D): continue | ||
215 | + if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize | ||
216 | + D = D + D.T # Copy the lower triangle to upper, to get a symetrical matrix | ||
217 | + condensedD = squareform(D) | ||
218 | + | ||
219 | + # Compute basic dendrogram by Ward's method | ||
220 | + Y = sch.linkage(condensedD, method='ward') | ||
221 | + Z = sch.dendrogram(Y, orientation='left', no_plot=True) | ||
222 | + | ||
223 | + # Reorganize rows and cols | ||
224 | + idx1 = Z['leaves'] | ||
225 | + D = D[idx1,:] | ||
226 | + D = D[:,idx1[::-1]] | ||
227 | + im = ax.matshow(D, vmin=0, vmax=1, origin='lower') | ||
228 | + ax.set_title(f) | ||
229 | + fig.suptitle("Distance matrices of sequences from various families\nclustered by sequence identity (Ward's method)", fontsize="18") | ||
230 | + fig.tight_layout() | ||
231 | + fig.subplots_adjust(top=0.92) | ||
232 | + fig.colorbar(im, ax=axs.tolist(), shrink=0.98) | ||
233 | + fig.savefig(f"results/distances.png") | ||
193 | 234 | ||
194 | 235 | ||
195 | 236 | ||
... | @@ -205,24 +246,20 @@ if __name__ == "__main__": | ... | @@ -205,24 +246,20 @@ if __name__ == "__main__": |
205 | mappings_list = pd.read_csv(path_to_seq_data + "realigned/mappings_list.csv", sep=',', index_col=0).to_dict(orient='list') | 246 | mappings_list = pd.read_csv(path_to_seq_data + "realigned/mappings_list.csv", sep=',', index_col=0).to_dict(orient='list') |
206 | for k in mappings_list.keys(): | 247 | for k in mappings_list.keys(): |
207 | mappings_list[k] = [ x for x in mappings_list[k] if str(x) != 'nan' ] | 248 | mappings_list[k] = [ x for x in mappings_list[k] if str(x) != 'nan' ] |
208 | - print(mappings_list) | 249 | + |
209 | - exit() | 250 | + # print("Loading datapoints from file...") |
210 | - | 251 | + # rna_points = [] |
211 | - | 252 | + # filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") if ".log" not in f and ".gz" not in f] |
212 | - print("Loading datapoints from file...") | 253 | + # p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=read_cpu_number()) |
213 | - rna_points = [] | 254 | + # pbar = tqdm(total=len(filelist), desc="RNA files", position=0, leave=True) |
214 | - filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") if ".log" not in f and ".gz" not in f] | 255 | + # for i, rna in enumerate(p.imap_unordered(load_rna_frome_file, filelist)): |
215 | - p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=read_cpu_number()) | 256 | + # rna_points.append(rna) |
216 | - pbar = tqdm(total=len(filelist), desc="RNA files", position=0, leave=True) | 257 | + # pbar.update(1) |
217 | - for i, rna in enumerate(p.imap_unordered(load_rna_frome_file, filelist)): | 258 | + # pbar.close() |
218 | - rna_points.append(rna) | 259 | + # p.close() |
219 | - pbar.update(1) | 260 | + # p.join() |
220 | - pbar.close() | 261 | + # npoints = len(rna_points) |
221 | - p.close() | 262 | + # print(npoints, "RNA files loaded.") |
222 | - p.join() | ||
223 | - npoints = len(rna_points) | ||
224 | - print(npoints, "RNA files loaded.") | ||
225 | - exit() | ||
226 | 263 | ||
227 | ################################################################# | 264 | ################################################################# |
228 | # Define threads for the tasks | 265 | # Define threads for the tasks | ... | ... |
-
Please register or login to post a comment