Nathalie BERNARD

script python pour faire des analyses sur le nombre de motifs dans la librairie,…

… le nombre de motifs insérés et le nombre de solutions prédites
1 +import json
2 +import numpy as np
3 +import matplotlib.pyplot as plt
4 +import os.path
5 +
6 +# Creates a violin plot of the distribution of the number of 'motifs' in the Isaure pattern library
7 +def stats_library():
8 + with open('/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/motifs_28-05-2021.json') as f:
9 + data = json.load(f)
10 +
11 + nb_motifs = json.dumps(data).count("sequence")
12 + print("nombre de motifs: " + str(nb_motifs))
13 +
14 + tab_seq_length = []
15 + tab_seq_length_with_delimiter = []
16 + for i in range(1007):
17 + test = str(i) in data
18 + if test:
19 + sequence = data[str(i)]["sequence"]
20 + count_delimiter = sequence.count('&')
21 + tab_seq_length.append(len(sequence) - count_delimiter)
22 + tab_seq_length_with_delimiter.append(len(sequence))
23 +
24 + data_to_plot = [np.array(tab_seq_length), np.array(tab_seq_length_with_delimiter)]
25 + min1 = np.amin(data_to_plot[0])
26 + max1 = np.amax(data_to_plot[0])
27 + median1 = np.median(data_to_plot[0])
28 +
29 + min2 = np.amin(data_to_plot[1])
30 + max2 = np.amax(data_to_plot[1])
31 + median2 = np.median(data_to_plot[1])
32 + fig = plt.figure()
33 +
34 + ax = fig.add_axes([0, 0, 1, 1])
35 +
36 + label1 = "nombre de nucléotides" + "\n minimum: " + str(min1) + " mediane: " + str(
37 + median1) + " maximum: " + str(max1)
38 + label2 = "nombre de nucléotides + nombre de &" + "\n minimum: " + str(min2) + " mediane: " + str(
39 + median2) + " maximum: " + str(max2)
40 + labels = [label1, label2]
41 + ax.set_xticks(np.arange(1, len(labels) + 1))
42 + ax.set_xticklabels(labels)
43 + ax.set_ylabel('longueurs des motifs')
44 + ax.set_xlabel('motifs')
45 +
46 + violins = ax.violinplot(data_to_plot, showmedians=True)
47 +
48 + for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
49 + vp = violins[partname]
50 + vp.set_edgecolor('black')
51 + vp.set_linewidth(1)
52 +
53 + for v in violins['bodies']:
54 + v.set_facecolor('red')
55 + plt.title("Répartition des longueurs des motifs dans la bibliothèque (" + str(nb_motifs) + " motifs)")
56 + plt.savefig("statistiques_motifs_Isaure.png", bbox_inches='tight')
57 +
58 +# Returns the list in half
59 +def get_half(list_name):
60 + first_half = []
61 + second_half = []
62 + if len(list_name) % 2 == 0:
63 + middle = len(list_name) / 2
64 + else:
65 + middle = len(list_name) / 2 + 0.5
66 +
67 + for i in range(int(middle)):
68 + first_half.append(list_name[i])
69 +
70 + for i in range(int(middle)):
71 + if i + int(middle) < len(list_name):
72 + second_half.append(list_name[i + int(middle)])
73 +
74 + return [first_half, second_half]
75 +
76 +# Returns the list of name of the sequence in the benchmark.dbn
77 +def get_list_name_bm():
78 + path_file = '/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/benchmark.dbn'
79 + my_file = open(path_file, "r")
80 + list_name = []
81 + count = 0
82 + for line in my_file:
83 + if count % 4 == 0:
84 + list_name.append(line[5:len(line) - 2])
85 + count = count + 1
86 + my_file.close()
87 +
88 + return list_name
89 +
90 +# Returns a 2d array containing for each sequence of the benchmark the number of 'motifs' inserted of each solution
91 +def get_nb_motifs_by_seq(type_file):
92 + list_name = get_list_name_bm()
93 + tab = []
94 + for name in list_name:
95 + path_file = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/results/test_" + name + type_file
96 + if os.path.exists(path_file):
97 + list_nb_motifs = []
98 + my_file = open(path_file, "r")
99 + name = my_file.readline()
100 + seq = my_file.readline()
101 + count = 0
102 + for line in my_file:
103 + if count % 2 == 0:
104 + tab_split = line.split("+")
105 + list_nb_motifs.append(len(tab_split) - 1)
106 + count = count + 1
107 + my_file.close()
108 + tab.append(list_nb_motifs)
109 + return tab
110 +
111 +# Creates a violin plot that shows the distribution of the number of patterns per solution for each sequence of the benchmark
112 +def stats_nb_motifs_in_result(type_file):
113 + list_name = get_list_name_bm()
114 + tab = get_nb_motifs_by_seq(type_file)
115 +
116 + list_median_str = []
117 + for i in range(len(tab)):
118 + list_median_str.append(np.median(tab[i]))
119 +
120 + tab = [x for _, x in sorted(zip(list_median_str, tab))]
121 + list_name = [x for _, x in sorted(zip(list_median_str, list_name))]
122 +
123 + if (len(tab) % 2 == 0):
124 + absciss = len(tab) / 2
125 + else:
126 + absciss = len(tab) / 2 + 0.5
127 +
128 + divide_name = get_half(list_name)
129 + divide_tab = get_half(tab)
130 +
131 + arr = np.array(tab)
132 + fig, ax = plt.subplots()
133 + plt.figure(figsize=(15, 4), dpi=200)
134 + plt.xticks(rotation=90)
135 + violins = plt.violinplot(divide_tab[0], showmedians=True)
136 + for i in range(int(len(divide_tab[0]))):
137 + y = divide_tab[0][i]
138 + x = np.random.normal(1 + i, 0.04, size=len(y))
139 + plt.scatter(x, y)
140 + plt.xticks(np.arange(1, len(divide_tab[0]) + 1), divide_name[0])
141 +
142 + plt.xlabel('nom de la séquence')
143 + plt.ylabel('nombre de motifs insérés dans la structure prédite')
144 + plt.title("Répartition du nombre de motifs insérés par résultat pour chaque séquence")
145 +
146 + for part in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
147 + vp = violins[part]
148 + vp.set_edgecolor('black')
149 + vp.set_linewidth(1)
150 +
151 + for v in violins['bodies']:
152 + v.set_facecolor('black')
153 +
154 + plt.savefig("statistiques_nb_motifs_inseres_Isaure_" + type_file[8:] + "_1.png", bbox_inches='tight')
155 +
156 + plt.figure(figsize=(15, 4), dpi=200)
157 + plt.xticks(rotation=90)
158 + violins = plt.violinplot(divide_tab[1], showmedians=True)
159 + for i in range(int(len(divide_tab[1]))):
160 + y = divide_tab[1][i]
161 + x = np.random.normal(1 + i, 0.04, size=len(y))
162 + plt.scatter(x, y)
163 + plt.xticks(np.arange(1, len(divide_tab[1]) + 1), divide_name[1])
164 +
165 + plt.xlabel('nom de la séquence')
166 + plt.ylabel('nombre de motifs insérés dans la structure prédite')
167 + plt.title("Répartition du nombre de motifs insérés par résultat pour chaque séquence (2ème partie)")
168 +
169 + for part in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
170 + vp = violins[part]
171 + vp.set_edgecolor('black')
172 + vp.set_linewidth(1)
173 +
174 + for v in violins['bodies']:
175 + v.set_facecolor('black')
176 + plt.savefig("statistiques_nb_motifs_inseres_Isaure_" + type_file[8:] + "_2.png", bbox_inches='tight')
177 +
178 +# Returns the grouping of the number of inserted 'motif' for all solutions of all sequences of the benchmark
179 +# according to the extension in argument
180 +def get_all_nb_motifs_by_type(type_file):
181 + tab = get_nb_motifs_by_seq(type_file)
182 + tab_all = []
183 + for i in range(len(tab)):
184 + for j in range(len(tab[i])):
185 + tab_all.append(tab[i][j])
186 + return tab_all
187 +
188 +# Create a figure containing the violin plot for MEA + function E, MEA + function F, MFE + function E and MFE + function F
189 +# Each violin plot show the distribution of the number of inserted 'motif' by solution
190 +def stats_nb_motifs_all():
191 + list_name = get_list_name_bm()
192 + tab_all_E_MEA = get_all_nb_motifs_by_type(".json_pmE_MEA")
193 + tab_all_F_MEA = get_all_nb_motifs_by_type(".json_pmF_MEA")
194 + tab_all_E_MFE = get_all_nb_motifs_by_type(".json_pmE_MFE")
195 + tab_all_F_MFE = get_all_nb_motifs_by_type(".json_pmF_MFE")
196 +
197 + data_to_plot = [tab_all_E_MEA, tab_all_F_MEA, tab_all_E_MFE, tab_all_F_MFE]
198 + fig = plt.figure()
199 + fig.set_size_inches(6, 3)
200 + ax = fig.add_axes([0, 0, 1, 1])
201 +
202 + labels = ['MEA + E', 'MEA + F', 'MFE + E', 'MFE + F']
203 + ax.set_xticks(np.arange(1, len(labels) + 1))
204 + ax.set_xticklabels(labels)
205 + ax.set_xlabel("Répartition du nombre de motifs insérés par résultat pour chaque séquence")
206 + ax.set_ylabel("nombre de motifs insérés dans la structure prédite")
207 +
208 + violins = ax.violinplot(data_to_plot, showmedians=True)
209 +
210 + for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
211 + vp = violins[partname]
212 + vp.set_edgecolor('black')
213 + vp.set_linewidth(1)
214 +
215 + for v in violins['bodies']:
216 + v.set_facecolor("black")
217 + plt.savefig('repartition_nb_motifs.png', dpi=200, bbox_inches='tight')
218 +
219 +# Returns the list of the number of solutions and the list of the length of each sequence of the benchmark
220 +def get_nb_solutions_and_sizes_by_seq(type_file):
221 + list_name = get_list_name_bm()
222 + list_nb_solutions = []
223 + list_size = []
224 + for name in list_name:
225 + path_file = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/results/test_" + name + type_file
226 + if os.path.exists(path_file):
227 + my_file = open(path_file, "r")
228 + name = my_file.readline()
229 + seq = my_file.readline()
230 + list_size.append(len(seq))
231 + count = 0
232 + nb = 0
233 + for line in my_file:
234 + if count % 2 == 0:
235 + nb = nb + 1
236 + count = count + 1
237 + list_nb_solutions.append(nb)
238 + my_file.close()
239 + return [list_nb_solutions, list_size]
240 +
241 +# Creates 4 violin plots that shows the distribution of the number of solutions for each sequence of the benchmark
242 +def stats_nb_solutions():
243 + list_name = get_list_name_bm()
244 + tab_all_E_MEA = get_nb_solutions_and_sizes_by_seq(".json_pmE_MEA")[0]
245 + tab_all_F_MEA = get_nb_solutions_and_sizes_by_seq(".json_pmF_MEA")[0]
246 + tab_all_E_MFE = get_nb_solutions_and_sizes_by_seq(".json_pmE_MFE")[0]
247 + tab_all_F_MFE = get_nb_solutions_and_sizes_by_seq(".json_pmF_MFE")[0]
248 + data_to_plot = [tab_all_E_MEA, tab_all_F_MEA, tab_all_E_MFE, tab_all_F_MFE]
249 +
250 + all_data = []
251 + for i in range(len(data_to_plot)):
252 + for j in range(len(data_to_plot[i])):
253 + all_data.append(data_to_plot[i][j])
254 +
255 + min = np.amin(all_data)
256 + max = np.amax(all_data)
257 + median = np.median(all_data)
258 +
259 + fig = plt.figure()
260 + fig.set_size_inches(6, 3)
261 + ax = fig.add_axes([0, 0, 1, 1])
262 +
263 + labels = ['MEA + E', 'MEA + F', 'MFE + E', 'MFE + F']
264 + ax.set_xticks(np.arange(1, len(labels) + 1))
265 + ax.set_xticklabels(labels)
266 + ax.set_xlabel("Répartition du nombre solutions pour chaque séquence du benchmark" + "\n minimum: " + str(min) + " mediane: " + str(
267 + median) + " maximum: " + str(max) + " (Pour l'ensemble des solutions)")
268 + ax.set_ylabel("nombre de solutions (structures secondaires prédites)")
269 +
270 + violins = ax.violinplot(data_to_plot, showmedians=True)
271 +
272 + for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
273 + vp = violins[partname]
274 + vp.set_edgecolor('blue')
275 + vp.set_linewidth(1)
276 +
277 + for v in violins['bodies']:
278 + v.set_facecolor("blue")
279 + plt.savefig('repartition_nb_solutions.png', dpi=200, bbox_inches='tight')
280 +
281 +# Create a scatter plot showing the number of solutions according to the length of the sequence in the benchmark
282 +def stats_nb_solutions_by_seq_length():
283 + list_name = get_list_name_bm()
284 + x = []
285 + y = []
286 + tab_all_E_MEA = get_nb_solutions_and_sizes_by_seq(".json_pmE_MEA")
287 + for i in range(len(tab_all_E_MEA[0])):
288 + x.append(tab_all_E_MEA[1][i])
289 + y.append(tab_all_E_MEA[0][i])
290 + tab_all_F_MEA = get_nb_solutions_and_sizes_by_seq(".json_pmF_MEA")
291 + for i in range(len(tab_all_F_MEA[0])):
292 + x.append(tab_all_F_MEA[1][i])
293 + y.append(tab_all_F_MEA[0][i])
294 + tab_all_E_MFE = get_nb_solutions_and_sizes_by_seq(".json_pmE_MFE")
295 + for i in range(len(tab_all_E_MFE[0])):
296 + x.append(tab_all_E_MFE[1][i])
297 + y.append(tab_all_E_MFE[0][i])
298 + tab_all_F_MFE = get_nb_solutions_and_sizes_by_seq(".json_pmF_MFE")
299 + for i in range(len(tab_all_F_MFE[0])):
300 + x.append(tab_all_F_MFE[1][i])
301 + y.append(tab_all_F_MFE[0][i])
302 +
303 + plt.scatter(x, y, s=50, c='blue', marker='o', edgecolors='black')
304 + plt.ylabel("nombre de solutions")
305 + plt.xlabel("longueur de la séquence")
306 + plt.title('nombre de structures prédites en fonction de la longueur de la séquence')
307 + plt.savefig('nb_solutions_en_fonction_seq.png', dpi=200, bbox_inches='tight')
308 +
309 +"""stats_nb_motifs_in_result(".json_pmE_MEA")
310 +stats_nb_motifs_in_result(".json_pmF_MEA")
311 +stats_nb_motifs_in_result(".json_pmE_MFE")
312 +stats_nb_motifs_in_result(".json_pmF_MFE")
313 +stats_nb_motifs_all()
314 +stats_nb_solutions()"""
315 +stats_nb_solutions_by_seq_length()