Nathalie BERNARD

Ajout de fonctions pour faire un boxplot pour les MCC de chaque séquence du benchmark

...@@ -5,6 +5,8 @@ import os.path ...@@ -5,6 +5,8 @@ import os.path
5 from math import sqrt, ceil 5 from math import sqrt, ceil
6 import numpy as np 6 import numpy as np
7 import matplotlib.pyplot as plt 7 import matplotlib.pyplot as plt
8 +import seaborn as sns
9 +import pandas as pd
8 10
9 11
10 log_path = "test.log" 12 log_path = "test.log"
...@@ -22,7 +24,20 @@ def run_test(cmd, log): ...@@ -22,7 +24,20 @@ def run_test(cmd, log):
22 log.flush() 24 log.flush()
23 rc = process.poll() 25 rc = process.poll()
24 26
25 -def create_command(name): 27 +def create_command_E(name):
28 + #cmd = ("python3 /mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/biorseo.py -i " +
29 + cmd = ("python3 /local/local/BiorseoNath/biorseo.py -i " +
30 + "/local/local/BiorseoNath/data/fasta/" +
31 + name + ".fa " +
32 + "-O results/ " +
33 + "--contacts " +
34 + "--patternmatch " +
35 + "--func E --MFE -v " +
36 + "--biorseo-dir /local/local/BiorseoNath " +
37 + "--modules-path /local/local/BiorseoNath/data/modules/ISAURE/Motifs_derniere_version ")
38 + return cmd
39 +
40 +def create_command_F(name):
26 #cmd = ("python3 /mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/biorseo.py -i " + 41 #cmd = ("python3 /mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/biorseo.py -i " +
27 cmd = ("python3 /local/local/BiorseoNath/biorseo.py -i " + 42 cmd = ("python3 /local/local/BiorseoNath/biorseo.py -i " +
28 "/local/local/BiorseoNath/data/fasta/" + 43 "/local/local/BiorseoNath/data/fasta/" +
...@@ -129,8 +144,8 @@ def specificity(tp, tn, fp, fn): ...@@ -129,8 +144,8 @@ def specificity(tp, tn, fp, fn):
129 # ================== Code from Louis Beckey Benchark.py ============================== 144 # ================== Code from Louis Beckey Benchark.py ==============================
130 145
131 def write_mcc_in_file_E(sequence_id, true_contacts, true_structure): 146 def write_mcc_in_file_E(sequence_id, true_contacts, true_structure):
132 - read_prd = open("results/test_" + sequence_id + ".json_pmE_MEA", "r") 147 + read_prd = open("results/test_" + sequence_id + ".json_pmE_MFE", "r")
133 - write = open("results/test_" + sequence_id + ".mcc_E_MEA", "w") 148 + write = open("results/test_" + sequence_id + ".mcc_E_MFE", "w")
134 149
135 max_mcc_str = -1; 150 max_mcc_str = -1;
136 max_mcc_ctc = -1; 151 max_mcc_ctc = -1;
...@@ -223,12 +238,16 @@ def set_axis_style(ax, labels): ...@@ -223,12 +238,16 @@ def set_axis_style(ax, labels):
223 ax.set_xlim(0.25, len(labels) + 0.75) 238 ax.set_xlim(0.25, len(labels) + 0.75)
224 ax.set_xlabel('Sample name') 239 ax.set_xlabel('Sample name')
225 240
226 -def visualization(list_struct2d, list_contacts, function, color, lines_color): 241 +def visualization_best_mcc(list_struct2d, list_contacts, function, color, lines_color):
227 242
228 np_struct2d = np.array(list_struct2d) 243 np_struct2d = np.array(list_struct2d)
229 np_contacts = np.array(list_contacts) 244 np_contacts = np.array(list_contacts)
230 245
231 data_to_plot = [np_struct2d, np_contacts] 246 data_to_plot = [np_struct2d, np_contacts]
247 + median_2d = np.median(np_struct2d)
248 + median_ctc = np.median(np_contacts)
249 + print("mediane 2D: " + str(median_2d) + "\n")
250 + print("mediane ctc: " + str(median_ctc) + "\n")
232 251
233 fig = plt.figure() 252 fig = plt.figure()
234 253
...@@ -249,14 +268,111 @@ def visualization(list_struct2d, list_contacts, function, color, lines_color): ...@@ -249,14 +268,111 @@ def visualization(list_struct2d, list_contacts, function, color, lines_color):
249 268
250 for v in violins['bodies']: 269 for v in violins['bodies']:
251 v.set_facecolor(color) 270 v.set_facecolor(color)
252 - plt.savefig('visualisation' + function + '.png', bbox_inches='tight') 271 + plt.savefig('visualisation_16_06_MFE_' + function + '.png', bbox_inches='tight')
272 +
273 +def get_list_structs_contacts(path_benchmark, estimator, function):
274 + myfile = open(path_benchmark, "r")
275 + list_name = []
276 +
277 + complete_list_struct2d_F = []
278 + complete_list_contacts_F = []
279 +
280 + name = myfile.readline()
281 + contacts = myfile.readline()
282 + seq = myfile.readline()
283 + structure2d = myfile.readline()
284 + count = 0
285 + while seq:
286 + name = name[6:].strip()
287 + count = count + 1
288 + file_path = "results/test_" + name + ".json_pm" + function +"_" + estimator
289 + if os.path.isfile(file_path):
290 + file_result = open(file_path, "r")
291 + list_struct2d_F = []
292 + list_contacts_F = []
293 + list_name.append(name)
294 + title_prd = file_result.readline()
295 + structure_prd = file_result.readline()
296 + sequence = structure_prd
297 + while structure_prd:
298 + structure_prd = file_result.readline()
299 + if (len(structure_prd) != 0):
300 + mcc_tab = compare_two_structures(structure2d, structure_prd[:len(sequence)])
301 + mcc_str = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
302 + list_struct2d_F.append(mcc_str)
303 +
304 + contacts_prd = file_result.readline()
305 + if (len(contacts_prd) == len(contacts)):
306 + mcc_tab = compare_two_contacts(contacts, contacts_prd)
307 + mcc_ctc = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
308 + list_contacts_F.append(mcc_ctc)
309 + complete_list_struct2d_F.append(list_struct2d_F)
310 + complete_list_contacts_F.append(list_contacts_F)
311 + name = myfile.readline()
312 + contacts = myfile.readline()
313 + seq = myfile.readline()
314 + structure2d = myfile.readline()
315 + return [list_name, complete_list_struct2d_F, complete_list_contacts_F]
316 + myfile.close()
317 +
318 +def visualization_all_mcc(path_benchmark, estimator, function, color, lines_color):
319 +
320 + list_name = get_list_structs_contacts(path_benchmark, estimator, function)[0]
321 + tab_struct2d = get_list_structs_contacts(path_benchmark, estimator, function)[1]
322 + tab_contacts = get_list_structs_contacts(path_benchmark, estimator, function)[2]
323 +
324 + np_struct2d = np.array(tab_struct2d)
325 + size = len(tab_struct2d)
326 + list_median_str = []
327 + for i in range(size):
328 + list_median_str.append(np.median(np_struct2d[i]))
329 +
330 + data = [x for _, x in sorted(zip(list_median_str, tab_struct2d))]
331 + boxName = [x for _, x in sorted(zip(list_median_str, list_name))]
332 + absciss = len(data)
333 +
334 + plt.figure(figsize=(25,4),dpi=200)
335 + plt.xticks(rotation=90)
336 + plt.boxplot(data)
337 + for i in range(absciss):
338 + y =data[i]
339 + x = np.random.normal(1 + i, 0.04, size=len(y))
340 + plt.scatter(x, y)
341 + plt.xticks(np.arange(1, absciss + 1), boxName)
342 +
343 + plt.xlabel('nom de la séquence')
344 + plt.ylabel('MCC')
345 + plt.savefig('visualisation_128arn_structure2d_' + estimator + "_" + function + '.png', bbox_inches='tight')
346 +
347 + np_contacts = np.array(tab_contacts)
348 + size = len(tab_contacts)
349 + list_median_ctc = []
350 + for i in range(size):
351 + list_median_ctc.append(np.median(np_contacts[i]))
352 +
353 + data = [x for _, x in sorted(zip(list_median_ctc, tab_contacts))]
354 + boxName = [x for _, x in sorted(zip(list_median_ctc, list_name))]
355 + absciss = len(data)
356 +
357 + plt.figure(figsize=(25, 4), dpi=200)
358 + plt.xticks(rotation=90)
359 + plt.boxplot(data)
360 + for i in range(absciss):
361 + y = data[i]
362 + x = np.random.normal(1 + i, 0.04, size=len(y))
363 + plt.scatter(x, y)
364 + plt.xticks(np.arange(1, absciss + 1), boxName)
365 +
366 + plt.xlabel('nom de la séquence')
367 + plt.ylabel('MCC')
368 + plt.savefig('visualisation_128arn_contacts_' + estimator + "_" + function + '.png', bbox_inches='tight')
253 369
254 #cmd = ("cppsrc/Scripts/create") 370 #cmd = ("cppsrc/Scripts/create")
255 #cmd0 = ("cppsrc/Scripts/addDelimiter") 371 #cmd0 = ("cppsrc/Scripts/addDelimiter")
256 #cmd1 = ("cppsrc/Scripts/countPattern") 372 #cmd1 = ("cppsrc/Scripts/countPattern")
257 #cmd2 = ("cppsrc/Scripts/deletePdb") 373 #cmd2 = ("cppsrc/Scripts/deletePdb")
258 374
259 -myfile = open("data/modules/ISAURE/Motifs_version_initiale/benchmark.txt", "r") 375 +"""myfile = open("data/modules/ISAURE/Motifs_version_initiale/benchmark.txt", "r")
260 name = myfile.readline() 376 name = myfile.readline()
261 contacts = myfile.readline() 377 contacts = myfile.readline()
262 seq = myfile.readline() 378 seq = myfile.readline()
...@@ -266,36 +382,43 @@ list_struct2d_E = [] ...@@ -266,36 +382,43 @@ list_struct2d_E = []
266 list_contacts_E = [] 382 list_contacts_E = []
267 list_struct2d_F = [] 383 list_struct2d_F = []
268 list_contacts_F = [] 384 list_contacts_F = []
269 -count = 0 385 +countE = 0
386 +countF = 0
270 while seq: 387 while seq:
271 name = name[6:].strip() 388 name = name[6:].strip()
272 print(name) 389 print(name)
273 - """
274 run_test(cmd2 + " " + name + ".fa", log) 390 run_test(cmd2 + " " + name + ".fa", log)
275 print(cmd2 + " " + name + ".fa") 391 print(cmd2 + " " + name + ".fa")
276 - """ 392 +
277 - cmd3 = create_command(name) 393 + cmd3 = create_command_E(name)
278 os.system(cmd3) 394 os.system(cmd3)
279 395
280 - """file_path = "results/test_" + name + ".json_pmE_MEA" 396 + file_path = "results/test_" + name + ".json_pmE_MFE"
281 if os.path.isfile(file_path): 397 if os.path.isfile(file_path):
282 tabE = write_mcc_in_file_E(name, contacts, structure2d) 398 tabE = write_mcc_in_file_E(name, contacts, structure2d)
283 list_contacts_E.append(tabE[0]) 399 list_contacts_E.append(tabE[0])
284 - list_struct2d_E.append(tabE[1])""" 400 + list_struct2d_E.append(tabE[1])
401 + countE = countE + 1
402 +
403 + cmd3 = create_command_F(name)
404 + os.system(cmd3)
285 405
286 file_path = "results/test_" + name + ".json_pmF_MFE" 406 file_path = "results/test_" + name + ".json_pmF_MFE"
287 if os.path.isfile(file_path): 407 if os.path.isfile(file_path):
288 tabF = write_mcc_in_file_F(name, contacts, structure2d) 408 tabF = write_mcc_in_file_F(name, contacts, structure2d)
289 list_contacts_F.append(tabF[0]) 409 list_contacts_F.append(tabF[0])
290 list_struct2d_F.append(tabF[1]) 410 list_struct2d_F.append(tabF[1])
291 - count = count + 1 411 + countF = countF + 1
292 412
293 name = myfile.readline() 413 name = myfile.readline()
294 contacts = myfile.readline() 414 contacts = myfile.readline()
295 seq = myfile.readline() 415 seq = myfile.readline()
296 structure2d = myfile.readline() 416 structure2d = myfile.readline()
297 417
298 -"""visualization(list_struct2d_E, list_contacts_E, 'E', 'red', '#900C3F')""" 418 +visualization_best_mcc(list_struct2d_E, list_contacts_E, 'E', 'red', '#900C3F')
299 -visualization(list_struct2d_F, list_contacts_F, 'F', 'blue', '#0900FF') 419 +visualization_best_mcc(list_struct2d_F, list_contacts_F, 'F', 'blue', '#0900FF')
300 -print("count: " + str(count) + "\n") 420 +print("countE: " + str(countE) + "\n")
301 -myfile.close() 421 +print("countF: " + str(countF) + "\n")
422 +myfile.close()"""
423 +path_benchmark = "data/modules/ISAURE/Motifs_version_initiale/benchmark.txt"
424 +visualization_all_mcc(path_benchmark,'MEA', 'F', 'blue', '#0900FF')
...\ No newline at end of file ...\ No newline at end of file
......