Ajout de fonctions pour faire un boxplot pour les MCC de chaque séquence du benchmark

Nathalie BERNARD
Commit 55bb6176ae96b106809bb13e0f25823a02231cca 55bb6176 1 parent c6bdd341
Showing 1 changed file with 140 additions and 17 deletions
Isaure_benchmark.py
--- a/Isaure_benchmark.py
View file @55bb617
+++ b/Isaure_benchmark.py
View file @55bb617
@@ -5,6 +5,8 @@ import os.path
 from math import sqrt, ceil
 import numpy as np
 import matplotlib.pyplot as plt
+ import seaborn as sns
+ import pandas as pd
 
 
 log_path = "test.log"
@@ -22,7 +24,20 @@ def run_test(cmd, log):
         log.flush()
     rc = process.poll()
 
- def create_command(name):
+ def create_command_E(name):
+     #cmd = ("python3 /mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/biorseo.py -i " +
+     cmd = ("python3 /local/local/BiorseoNath/biorseo.py -i " +
+       "/local/local/BiorseoNath/data/fasta/" +
+       name + ".fa  " +
+       "-O results/ " +
+       "--contacts " +
+       "--patternmatch " +
+       "--func E --MFE -v " +
+       "--biorseo-dir /local/local/BiorseoNath " +
+       "--modules-path /local/local/BiorseoNath/data/modules/ISAURE/Motifs_derniere_version ")
+     return cmd
+ 
+ def create_command_F(name):
     #cmd = ("python3 /mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/biorseo.py -i " +
     cmd = ("python3 /local/local/BiorseoNath/biorseo.py -i " +
       "/local/local/BiorseoNath/data/fasta/" +
@@ -129,8 +144,8 @@ def specificity(tp, tn, fp, fn):
 # ================== Code from Louis Beckey Benchark.py ==============================
 
 def write_mcc_in_file_E(sequence_id, true_contacts, true_structure):
-     read_prd = open("results/test_" + sequence_id + ".json_pmE_MEA", "r")
-     write = open("results/test_" + sequence_id + ".mcc_E_MEA", "w")
+     read_prd = open("results/test_" + sequence_id + ".json_pmE_MFE", "r")
+     write = open("results/test_" + sequence_id + ".mcc_E_MFE", "w")
 
     max_mcc_str = -1;
     max_mcc_ctc = -1;
@@ -223,12 +238,16 @@ def set_axis_style(ax, labels):
     ax.set_xlim(0.25, len(labels) + 0.75)
     ax.set_xlabel('Sample name')
 
- def visualization(list_struct2d, list_contacts, function, color, lines_color):
+ def visualization_best_mcc(list_struct2d, list_contacts, function, color, lines_color):
 
     np_struct2d = np.array(list_struct2d)
     np_contacts = np.array(list_contacts)
 
     data_to_plot = [np_struct2d, np_contacts]
+     median_2d = np.median(np_struct2d)
+     median_ctc = np.median(np_contacts)
+     print("mediane 2D: " + str(median_2d) + "\n")
+     print("mediane ctc: " + str(median_ctc) + "\n")
 
     fig = plt.figure()
 
@@ -249,14 +268,111 @@ def visualization(list_struct2d, list_contacts, function, color, lines_color):
 
     for v in violins['bodies']:
         v.set_facecolor(color)
-     plt.savefig('visualisation' + function + '.png', bbox_inches='tight')
+     plt.savefig('visualisation_16_06_MFE_' + function + '.png', bbox_inches='tight')
+ 
+ def get_list_structs_contacts(path_benchmark, estimator, function):
+     myfile = open(path_benchmark, "r")
+     list_name = []
+ 
+     complete_list_struct2d_F = []
+     complete_list_contacts_F = []
+ 
+     name = myfile.readline()
+     contacts = myfile.readline()
+     seq = myfile.readline()
+     structure2d = myfile.readline()
+     count = 0
+     while seq:
+         name = name[6:].strip()
+         count = count + 1
+         file_path = "results/test_" + name + ".json_pm" + function +"_" + estimator
+         if os.path.isfile(file_path):
+             file_result = open(file_path, "r")
+             list_struct2d_F = []
+             list_contacts_F = []
+             list_name.append(name)
+             title_prd = file_result.readline()
+             structure_prd = file_result.readline()
+             sequence = structure_prd
+             while structure_prd:
+                 structure_prd = file_result.readline()
+                 if (len(structure_prd) != 0):
+                     mcc_tab = compare_two_structures(structure2d, structure_prd[:len(sequence)])
+                     mcc_str = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
+                     list_struct2d_F.append(mcc_str)
+ 
+                     contacts_prd = file_result.readline()
+                     if (len(contacts_prd) == len(contacts)):
+                         mcc_tab = compare_two_contacts(contacts, contacts_prd)
+                         mcc_ctc = mattews_corr_coeff(mcc_tab[0], mcc_tab[1], mcc_tab[2], mcc_tab[3])
+                         list_contacts_F.append(mcc_ctc)
+             complete_list_struct2d_F.append(list_struct2d_F)
+             complete_list_contacts_F.append(list_contacts_F)
+         name = myfile.readline()
+         contacts = myfile.readline()
+         seq = myfile.readline()
+         structure2d = myfile.readline()
+     return [list_name, complete_list_struct2d_F, complete_list_contacts_F]
+     myfile.close()
+ 
+ def visualization_all_mcc(path_benchmark, estimator, function, color, lines_color):
+ 
+     list_name = get_list_structs_contacts(path_benchmark, estimator, function)[0]
+     tab_struct2d = get_list_structs_contacts(path_benchmark, estimator, function)[1]
+     tab_contacts = get_list_structs_contacts(path_benchmark, estimator, function)[2]
+ 
+     np_struct2d = np.array(tab_struct2d)
+     size = len(tab_struct2d)
+     list_median_str = []
+     for i in range(size):
+         list_median_str.append(np.median(np_struct2d[i]))
+ 
+     data = [x for _, x in sorted(zip(list_median_str, tab_struct2d))]
+     boxName = [x for _, x in sorted(zip(list_median_str, list_name))]
+     absciss = len(data)
+ 
+     plt.figure(figsize=(25,4),dpi=200)
+     plt.xticks(rotation=90)
+     plt.boxplot(data)
+     for i in range(absciss):
+         y =data[i]
+         x = np.random.normal(1 + i, 0.04, size=len(y))
+         plt.scatter(x, y)
+         plt.xticks(np.arange(1, absciss + 1), boxName)
+ 
+     plt.xlabel('nom de la séquence')
+     plt.ylabel('MCC')
+     plt.savefig('visualisation_128arn_structure2d_' + estimator + "_" + function + '.png', bbox_inches='tight')
+ 
+     np_contacts = np.array(tab_contacts)
+     size = len(tab_contacts)
+     list_median_ctc = []
+     for i in range(size):
+         list_median_ctc.append(np.median(np_contacts[i]))
+ 
+     data = [x for _, x in sorted(zip(list_median_ctc, tab_contacts))]
+     boxName = [x for _, x in sorted(zip(list_median_ctc, list_name))]
+     absciss = len(data)
+ 
+     plt.figure(figsize=(25, 4), dpi=200)
+     plt.xticks(rotation=90)
+     plt.boxplot(data)
+     for i in range(absciss):
+         y = data[i]
+         x = np.random.normal(1 + i, 0.04, size=len(y))
+         plt.scatter(x, y)
+         plt.xticks(np.arange(1, absciss + 1), boxName)
+ 
+     plt.xlabel('nom de la séquence')
+     plt.ylabel('MCC')
+     plt.savefig('visualisation_128arn_contacts_' + estimator + "_" + function + '.png', bbox_inches='tight')
 
 #cmd = ("cppsrc/Scripts/create")
 #cmd0 = ("cppsrc/Scripts/addDelimiter")
 #cmd1 = ("cppsrc/Scripts/countPattern")
 #cmd2 = ("cppsrc/Scripts/deletePdb")
 
- myfile = open("data/modules/ISAURE/Motifs_version_initiale/benchmark.txt", "r")
+ """myfile = open("data/modules/ISAURE/Motifs_version_initiale/benchmark.txt", "r")
 name = myfile.readline()
 contacts = myfile.readline()
 seq = myfile.readline()
@@ -266,36 +382,43 @@ list_struct2d_E = []
 list_contacts_E = []
 list_struct2d_F = []
 list_contacts_F = []
- count = 0
+ countE = 0
+ countF = 0
 while seq:
     name = name[6:].strip()
     print(name)
-     """
     run_test(cmd2 + " " + name + ".fa", log)
     print(cmd2 + " " + name + ".fa")
-     """
-     cmd3 = create_command(name)
+     
+     cmd3 = create_command_E(name)
     os.system(cmd3)
 
-     """file_path = "results/test_" + name + ".json_pmE_MEA"
+     file_path = "results/test_" + name + ".json_pmE_MFE"
     if os.path.isfile(file_path):
         tabE = write_mcc_in_file_E(name, contacts, structure2d)
         list_contacts_E.append(tabE[0])
-         list_struct2d_E.append(tabE[1])"""
+         list_struct2d_E.append(tabE[1])
+         countE = countE + 1
+ 
+     cmd3 = create_command_F(name)
+     os.system(cmd3)
 
     file_path = "results/test_" + name + ".json_pmF_MFE"
     if os.path.isfile(file_path):
         tabF = write_mcc_in_file_F(name, contacts, structure2d)
         list_contacts_F.append(tabF[0])
         list_struct2d_F.append(tabF[1])
-         count = count + 1
+         countF = countF + 1
 
     name = myfile.readline()
     contacts = myfile.readline()
     seq = myfile.readline()
     structure2d = myfile.readline()
 
- """visualization(list_struct2d_E, list_contacts_E, 'E', 'red', '#900C3F')"""
- visualization(list_struct2d_F, list_contacts_F, 'F', 'blue', '#0900FF')
- print("count: " + str(count) + "\n")
- myfile.close()
+ visualization_best_mcc(list_struct2d_E, list_contacts_E, 'E', 'red', '#900C3F')
+ visualization_best_mcc(list_struct2d_F, list_contacts_F, 'F', 'blue', '#0900FF')
+ print("countE: " + str(countE) + "\n")
+ print("countF: " + str(countF) + "\n")
+ myfile.close()"""
+ path_benchmark = "data/modules/ISAURE/Motifs_version_initiale/benchmark.txt"
+ visualization_all_mcc(path_benchmark,'MEA', 'F', 'blue', '#0900FF')
\ No newline at end of file