Louis BECQUEY

parallel computation of distance matrices

...@@ -3,10 +3,8 @@ import os ...@@ -3,10 +3,8 @@ import os
3 import numpy as np 3 import numpy as np
4 import pandas as pd 4 import pandas as pd
5 import threading as th 5 import threading as th
6 -import seaborn as sb
7 import scipy.stats as st 6 import scipy.stats as st
8 import matplotlib.pyplot as plt 7 import matplotlib.pyplot as plt
9 -import pylab
10 import scipy.cluster.hierarchy as sch 8 import scipy.cluster.hierarchy as sch
11 from scipy.spatial.distance import squareform 9 from scipy.spatial.distance import squareform
12 from mpl_toolkits.mplot3d import axes3d 10 from mpl_toolkits.mplot3d import axes3d
...@@ -186,6 +184,8 @@ def stats_len(mappings_list, points): ...@@ -186,6 +184,8 @@ def stats_len(mappings_list, points):
186 plt.savefig("results/full_length_distribs.png") 184 plt.savefig("results/full_length_distribs.png")
187 185
188 def to_dist_matrix(f): 186 def to_dist_matrix(f):
187 + if path.isfile("data/"+f+".npy"):
188 + return 0
189 print(f) 189 print(f)
190 dm = DistanceCalculator('identity') 190 dm = DistanceCalculator('identity')
191 with open(path_to_seq_data+"realigned/"+f+"++.afa") as al_file: 191 with open(path_to_seq_data+"realigned/"+f+"++.afa") as al_file:
...@@ -198,13 +198,21 @@ def to_dist_matrix(f): ...@@ -198,13 +198,21 @@ def to_dist_matrix(f):
198 return 0 198 return 0
199 199
200 def seq_idty(mappings_list): 200 def seq_idty(mappings_list):
201 + # compute distance matrices
202 + p = Pool(processes=8)
203 + pbar = tqdm(total=len(mappings_list.keys()), desc="RNA families", position=0, leave=True)
204 + for i, _ in enumerate(p.imap_unordered(to_dist_matrix, sorted(mappings_list.keys()))):
205 + pbar.update(1)
206 + pbar.close()
207 + p.close()
208 + p.join()
209 +
210 + # load them
201 fam_arrays = [] 211 fam_arrays = []
202 for f in sorted(mappings_list.keys()): 212 for f in sorted(mappings_list.keys()):
203 if path.isfile("data/"+f+".npy"): 213 if path.isfile("data/"+f+".npy"):
204 fam_arrays.append(np.load("data/"+f+".npy")) 214 fam_arrays.append(np.load("data/"+f+".npy"))
205 else: 215 else:
206 - # to_dist_matrix(f)
207 - # fam_arrays.append(np.load("data/"+f+".npy"))
208 fam_arrays.append([]) 216 fam_arrays.append([])
209 217
210 fig, axs = plt.subplots(11,7, figsize=(25,25)) 218 fig, axs = plt.subplots(11,7, figsize=(25,25))
...@@ -289,4 +297,4 @@ if __name__ == "__main__": ...@@ -289,4 +297,4 @@ if __name__ == "__main__":
289 seq_idty(mappings_list) 297 seq_idty(mappings_list)
290 # stats_len(mappings_list, rna_points) 298 # stats_len(mappings_list, rna_points)
291 299
292 -
...\ No newline at end of file ...\ No newline at end of file
300 +
......