script python pour faire des analyses sur le nombre de motifs dans la librairie,…
… le nombre de motifs insérés et le nombre de solutions prédites
Showing
1 changed file
with
315 additions
and
0 deletions
scripts/stats.py
0 → 100644
1 | +import json | ||
2 | +import numpy as np | ||
3 | +import matplotlib.pyplot as plt | ||
4 | +import os.path | ||
5 | + | ||
6 | +# Creates a violin plot of the distribution of the number of 'motifs' in the Isaure pattern library | ||
7 | +def stats_library(): | ||
8 | + with open('/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_28-05-2021.json') as f: | ||
9 | + data = json.load(f) | ||
10 | + | ||
11 | + nb_motifs = json.dumps(data).count("sequence") | ||
12 | + print("nombre de motifs: " + str(nb_motifs)) | ||
13 | + | ||
14 | + tab_seq_length = [] | ||
15 | + tab_seq_length_with_delimiter = [] | ||
16 | + for i in range(1007): | ||
17 | + test = str(i) in data | ||
18 | + if test: | ||
19 | + sequence = data[str(i)]["sequence"] | ||
20 | + count_delimiter = sequence.count('&') | ||
21 | + tab_seq_length.append(len(sequence) - count_delimiter) | ||
22 | + tab_seq_length_with_delimiter.append(len(sequence)) | ||
23 | + | ||
24 | + data_to_plot = [np.array(tab_seq_length), np.array(tab_seq_length_with_delimiter)] | ||
25 | + min1 = np.amin(data_to_plot[0]) | ||
26 | + max1 = np.amax(data_to_plot[0]) | ||
27 | + median1 = np.median(data_to_plot[0]) | ||
28 | + | ||
29 | + min2 = np.amin(data_to_plot[1]) | ||
30 | + max2 = np.amax(data_to_plot[1]) | ||
31 | + median2 = np.median(data_to_plot[1]) | ||
32 | + fig = plt.figure() | ||
33 | + | ||
34 | + ax = fig.add_axes([0, 0, 1, 1]) | ||
35 | + | ||
36 | + label1 = "nombre de nucléotides" + "\n minimum: " + str(min1) + " mediane: " + str( | ||
37 | + median1) + " maximum: " + str(max1) | ||
38 | + label2 = "nombre de nucléotides + nombre de &" + "\n minimum: " + str(min2) + " mediane: " + str( | ||
39 | + median2) + " maximum: " + str(max2) | ||
40 | + labels = [label1, label2] | ||
41 | + ax.set_xticks(np.arange(1, len(labels) + 1)) | ||
42 | + ax.set_xticklabels(labels) | ||
43 | + ax.set_ylabel('longueurs des motifs') | ||
44 | + ax.set_xlabel('motifs') | ||
45 | + | ||
46 | + violins = ax.violinplot(data_to_plot, showmedians=True) | ||
47 | + | ||
48 | + for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'): | ||
49 | + vp = violins[partname] | ||
50 | + vp.set_edgecolor('black') | ||
51 | + vp.set_linewidth(1) | ||
52 | + | ||
53 | + for v in violins['bodies']: | ||
54 | + v.set_facecolor('red') | ||
55 | + plt.title("Répartition des longueurs des motifs dans la bibliothèque (" + str(nb_motifs) + " motifs)") | ||
56 | + plt.savefig("statistiques_motifs_Isaure.png", bbox_inches='tight') | ||
57 | + | ||
58 | +# Returns the list in half | ||
59 | +def get_half(list_name): | ||
60 | + first_half = [] | ||
61 | + second_half = [] | ||
62 | + if len(list_name) % 2 == 0: | ||
63 | + middle = len(list_name) / 2 | ||
64 | + else: | ||
65 | + middle = len(list_name) / 2 + 0.5 | ||
66 | + | ||
67 | + for i in range(int(middle)): | ||
68 | + first_half.append(list_name[i]) | ||
69 | + | ||
70 | + for i in range(int(middle)): | ||
71 | + if i + int(middle) < len(list_name): | ||
72 | + second_half.append(list_name[i + int(middle)]) | ||
73 | + | ||
74 | + return [first_half, second_half] | ||
75 | + | ||
76 | +# Returns the list of name of the sequence in the benchmark.dbn | ||
77 | +def get_list_name_bm(): | ||
78 | + path_file = '/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/benchmark.dbn' | ||
79 | + my_file = open(path_file, "r") | ||
80 | + list_name = [] | ||
81 | + count = 0 | ||
82 | + for line in my_file: | ||
83 | + if count % 4 == 0: | ||
84 | + list_name.append(line[5:len(line) - 2]) | ||
85 | + count = count + 1 | ||
86 | + my_file.close() | ||
87 | + | ||
88 | + return list_name | ||
89 | + | ||
90 | +# Returns a 2d array containing for each sequence of the benchmark the number of 'motifs' inserted of each solution | ||
91 | +def get_nb_motifs_by_seq(type_file): | ||
92 | + list_name = get_list_name_bm() | ||
93 | + tab = [] | ||
94 | + for name in list_name: | ||
95 | + path_file = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/results/test_" + name + type_file | ||
96 | + if os.path.exists(path_file): | ||
97 | + list_nb_motifs = [] | ||
98 | + my_file = open(path_file, "r") | ||
99 | + name = my_file.readline() | ||
100 | + seq = my_file.readline() | ||
101 | + count = 0 | ||
102 | + for line in my_file: | ||
103 | + if count % 2 == 0: | ||
104 | + tab_split = line.split("+") | ||
105 | + list_nb_motifs.append(len(tab_split) - 1) | ||
106 | + count = count + 1 | ||
107 | + my_file.close() | ||
108 | + tab.append(list_nb_motifs) | ||
109 | + return tab | ||
110 | + | ||
111 | +# Creates a violin plot that shows the distribution of the number of patterns per solution for each sequence of the benchmark | ||
112 | +def stats_nb_motifs_in_result(type_file): | ||
113 | + list_name = get_list_name_bm() | ||
114 | + tab = get_nb_motifs_by_seq(type_file) | ||
115 | + | ||
116 | + list_median_str = [] | ||
117 | + for i in range(len(tab)): | ||
118 | + list_median_str.append(np.median(tab[i])) | ||
119 | + | ||
120 | + tab = [x for _, x in sorted(zip(list_median_str, tab))] | ||
121 | + list_name = [x for _, x in sorted(zip(list_median_str, list_name))] | ||
122 | + | ||
123 | + if (len(tab) % 2 == 0): | ||
124 | + absciss = len(tab) / 2 | ||
125 | + else: | ||
126 | + absciss = len(tab) / 2 + 0.5 | ||
127 | + | ||
128 | + divide_name = get_half(list_name) | ||
129 | + divide_tab = get_half(tab) | ||
130 | + | ||
131 | + arr = np.array(tab) | ||
132 | + fig, ax = plt.subplots() | ||
133 | + plt.figure(figsize=(15, 4), dpi=200) | ||
134 | + plt.xticks(rotation=90) | ||
135 | + violins = plt.violinplot(divide_tab[0], showmedians=True) | ||
136 | + for i in range(int(len(divide_tab[0]))): | ||
137 | + y = divide_tab[0][i] | ||
138 | + x = np.random.normal(1 + i, 0.04, size=len(y)) | ||
139 | + plt.scatter(x, y) | ||
140 | + plt.xticks(np.arange(1, len(divide_tab[0]) + 1), divide_name[0]) | ||
141 | + | ||
142 | + plt.xlabel('nom de la séquence') | ||
143 | + plt.ylabel('nombre de motifs insérés dans la structure prédite') | ||
144 | + plt.title("Répartition du nombre de motifs insérés par résultat pour chaque séquence") | ||
145 | + | ||
146 | + for part in ('cbars', 'cmins', 'cmaxes', 'cmedians'): | ||
147 | + vp = violins[part] | ||
148 | + vp.set_edgecolor('black') | ||
149 | + vp.set_linewidth(1) | ||
150 | + | ||
151 | + for v in violins['bodies']: | ||
152 | + v.set_facecolor('black') | ||
153 | + | ||
154 | + plt.savefig("statistiques_nb_motifs_inseres_Isaure_" + type_file[8:] + "_1.png", bbox_inches='tight') | ||
155 | + | ||
156 | + plt.figure(figsize=(15, 4), dpi=200) | ||
157 | + plt.xticks(rotation=90) | ||
158 | + violins = plt.violinplot(divide_tab[1], showmedians=True) | ||
159 | + for i in range(int(len(divide_tab[1]))): | ||
160 | + y = divide_tab[1][i] | ||
161 | + x = np.random.normal(1 + i, 0.04, size=len(y)) | ||
162 | + plt.scatter(x, y) | ||
163 | + plt.xticks(np.arange(1, len(divide_tab[1]) + 1), divide_name[1]) | ||
164 | + | ||
165 | + plt.xlabel('nom de la séquence') | ||
166 | + plt.ylabel('nombre de motifs insérés dans la structure prédite') | ||
167 | + plt.title("Répartition du nombre de motifs insérés par résultat pour chaque séquence (2ème partie)") | ||
168 | + | ||
169 | + for part in ('cbars', 'cmins', 'cmaxes', 'cmedians'): | ||
170 | + vp = violins[part] | ||
171 | + vp.set_edgecolor('black') | ||
172 | + vp.set_linewidth(1) | ||
173 | + | ||
174 | + for v in violins['bodies']: | ||
175 | + v.set_facecolor('black') | ||
176 | + plt.savefig("statistiques_nb_motifs_inseres_Isaure_" + type_file[8:] + "_2.png", bbox_inches='tight') | ||
177 | + | ||
178 | +# Returns the grouping of the number of inserted 'motif' for all solutions of all sequences of the benchmark | ||
179 | +# according to the extension in argument | ||
180 | +def get_all_nb_motifs_by_type(type_file): | ||
181 | + tab = get_nb_motifs_by_seq(type_file) | ||
182 | + tab_all = [] | ||
183 | + for i in range(len(tab)): | ||
184 | + for j in range(len(tab[i])): | ||
185 | + tab_all.append(tab[i][j]) | ||
186 | + return tab_all | ||
187 | + | ||
188 | +# Create a figure containing the violin plot for MEA + function E, MEA + function F, MFE + function E and MFE + function F | ||
189 | +# Each violin plot show the distribution of the number of inserted 'motif' by solution | ||
190 | +def stats_nb_motifs_all(): | ||
191 | + list_name = get_list_name_bm() | ||
192 | + tab_all_E_MEA = get_all_nb_motifs_by_type(".json_pmE_MEA") | ||
193 | + tab_all_F_MEA = get_all_nb_motifs_by_type(".json_pmF_MEA") | ||
194 | + tab_all_E_MFE = get_all_nb_motifs_by_type(".json_pmE_MFE") | ||
195 | + tab_all_F_MFE = get_all_nb_motifs_by_type(".json_pmF_MFE") | ||
196 | + | ||
197 | + data_to_plot = [tab_all_E_MEA, tab_all_F_MEA, tab_all_E_MFE, tab_all_F_MFE] | ||
198 | + fig = plt.figure() | ||
199 | + fig.set_size_inches(6, 3) | ||
200 | + ax = fig.add_axes([0, 0, 1, 1]) | ||
201 | + | ||
202 | + labels = ['MEA + E', 'MEA + F', 'MFE + E', 'MFE + F'] | ||
203 | + ax.set_xticks(np.arange(1, len(labels) + 1)) | ||
204 | + ax.set_xticklabels(labels) | ||
205 | + ax.set_xlabel("Répartition du nombre de motifs insérés par résultat pour chaque séquence") | ||
206 | + ax.set_ylabel("nombre de motifs insérés dans la structure prédite") | ||
207 | + | ||
208 | + violins = ax.violinplot(data_to_plot, showmedians=True) | ||
209 | + | ||
210 | + for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'): | ||
211 | + vp = violins[partname] | ||
212 | + vp.set_edgecolor('black') | ||
213 | + vp.set_linewidth(1) | ||
214 | + | ||
215 | + for v in violins['bodies']: | ||
216 | + v.set_facecolor("black") | ||
217 | + plt.savefig('repartition_nb_motifs.png', dpi=200, bbox_inches='tight') | ||
218 | + | ||
219 | +# Returns the list of the number of solutions and the list of the length of each sequence of the benchmark | ||
220 | +def get_nb_solutions_and_sizes_by_seq(type_file): | ||
221 | + list_name = get_list_name_bm() | ||
222 | + list_nb_solutions = [] | ||
223 | + list_size = [] | ||
224 | + for name in list_name: | ||
225 | + path_file = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/results/test_" + name + type_file | ||
226 | + if os.path.exists(path_file): | ||
227 | + my_file = open(path_file, "r") | ||
228 | + name = my_file.readline() | ||
229 | + seq = my_file.readline() | ||
230 | + list_size.append(len(seq)) | ||
231 | + count = 0 | ||
232 | + nb = 0 | ||
233 | + for line in my_file: | ||
234 | + if count % 2 == 0: | ||
235 | + nb = nb + 1 | ||
236 | + count = count + 1 | ||
237 | + list_nb_solutions.append(nb) | ||
238 | + my_file.close() | ||
239 | + return [list_nb_solutions, list_size] | ||
240 | + | ||
241 | +# Creates 4 violin plots that shows the distribution of the number of solutions for each sequence of the benchmark | ||
242 | +def stats_nb_solutions(): | ||
243 | + list_name = get_list_name_bm() | ||
244 | + tab_all_E_MEA = get_nb_solutions_and_sizes_by_seq(".json_pmE_MEA")[0] | ||
245 | + tab_all_F_MEA = get_nb_solutions_and_sizes_by_seq(".json_pmF_MEA")[0] | ||
246 | + tab_all_E_MFE = get_nb_solutions_and_sizes_by_seq(".json_pmE_MFE")[0] | ||
247 | + tab_all_F_MFE = get_nb_solutions_and_sizes_by_seq(".json_pmF_MFE")[0] | ||
248 | + data_to_plot = [tab_all_E_MEA, tab_all_F_MEA, tab_all_E_MFE, tab_all_F_MFE] | ||
249 | + | ||
250 | + all_data = [] | ||
251 | + for i in range(len(data_to_plot)): | ||
252 | + for j in range(len(data_to_plot[i])): | ||
253 | + all_data.append(data_to_plot[i][j]) | ||
254 | + | ||
255 | + min = np.amin(all_data) | ||
256 | + max = np.amax(all_data) | ||
257 | + median = np.median(all_data) | ||
258 | + | ||
259 | + fig = plt.figure() | ||
260 | + fig.set_size_inches(6, 3) | ||
261 | + ax = fig.add_axes([0, 0, 1, 1]) | ||
262 | + | ||
263 | + labels = ['MEA + E', 'MEA + F', 'MFE + E', 'MFE + F'] | ||
264 | + ax.set_xticks(np.arange(1, len(labels) + 1)) | ||
265 | + ax.set_xticklabels(labels) | ||
266 | + ax.set_xlabel("Répartition du nombre solutions pour chaque séquence du benchmark" + "\n minimum: " + str(min) + " mediane: " + str( | ||
267 | + median) + " maximum: " + str(max) + " (Pour l'ensemble des solutions)") | ||
268 | + ax.set_ylabel("nombre de solutions (structures secondaires prédites)") | ||
269 | + | ||
270 | + violins = ax.violinplot(data_to_plot, showmedians=True) | ||
271 | + | ||
272 | + for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'): | ||
273 | + vp = violins[partname] | ||
274 | + vp.set_edgecolor('blue') | ||
275 | + vp.set_linewidth(1) | ||
276 | + | ||
277 | + for v in violins['bodies']: | ||
278 | + v.set_facecolor("blue") | ||
279 | + plt.savefig('repartition_nb_solutions.png', dpi=200, bbox_inches='tight') | ||
280 | + | ||
281 | +# Create a scatter plot showing the number of solutions according to the length of the sequence in the benchmark | ||
282 | +def stats_nb_solutions_by_seq_length(): | ||
283 | + list_name = get_list_name_bm() | ||
284 | + x = [] | ||
285 | + y = [] | ||
286 | + tab_all_E_MEA = get_nb_solutions_and_sizes_by_seq(".json_pmE_MEA") | ||
287 | + for i in range(len(tab_all_E_MEA[0])): | ||
288 | + x.append(tab_all_E_MEA[1][i]) | ||
289 | + y.append(tab_all_E_MEA[0][i]) | ||
290 | + tab_all_F_MEA = get_nb_solutions_and_sizes_by_seq(".json_pmF_MEA") | ||
291 | + for i in range(len(tab_all_F_MEA[0])): | ||
292 | + x.append(tab_all_F_MEA[1][i]) | ||
293 | + y.append(tab_all_F_MEA[0][i]) | ||
294 | + tab_all_E_MFE = get_nb_solutions_and_sizes_by_seq(".json_pmE_MFE") | ||
295 | + for i in range(len(tab_all_E_MFE[0])): | ||
296 | + x.append(tab_all_E_MFE[1][i]) | ||
297 | + y.append(tab_all_E_MFE[0][i]) | ||
298 | + tab_all_F_MFE = get_nb_solutions_and_sizes_by_seq(".json_pmF_MFE") | ||
299 | + for i in range(len(tab_all_F_MFE[0])): | ||
300 | + x.append(tab_all_F_MFE[1][i]) | ||
301 | + y.append(tab_all_F_MFE[0][i]) | ||
302 | + | ||
303 | + plt.scatter(x, y, s=50, c='blue', marker='o', edgecolors='black') | ||
304 | + plt.ylabel("nombre de solutions") | ||
305 | + plt.xlabel("longueur de la séquence") | ||
306 | + plt.title('nombre de structures prédites en fonction de la longueur de la séquence') | ||
307 | + plt.savefig('nb_solutions_en_fonction_seq.png', dpi=200, bbox_inches='tight') | ||
308 | + | ||
309 | +"""stats_nb_motifs_in_result(".json_pmE_MEA") | ||
310 | +stats_nb_motifs_in_result(".json_pmF_MEA") | ||
311 | +stats_nb_motifs_in_result(".json_pmE_MFE") | ||
312 | +stats_nb_motifs_in_result(".json_pmF_MFE") | ||
313 | +stats_nb_motifs_all() | ||
314 | +stats_nb_solutions()""" | ||
315 | +stats_nb_solutions_by_seq_length() |
-
Please register or login to post a comment