Nathalie BERNARD

Scripts corrigés pour le matching unique et la suppression des pdbs

......@@ -24,7 +24,7 @@ def run_test(cmd, log):
log.flush()
rc = process.poll()
def create_command_E(name):
def create_command_E(name, estimator):
#cmd = ("python3 /mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/biorseo.py -i " +
cmd = ("python3 /local/local/BiorseoNath/biorseo.py -i " +
"/local/local/BiorseoNath/data/fasta/" +
......@@ -32,12 +32,12 @@ def create_command_E(name):
"-O results/ " +
"--contacts " +
"--patternmatch " +
"--func E --MFE -v " +
"--func E --" + estimator + " -v " +
"--biorseo-dir /local/local/BiorseoNath " +
"--modules-path /local/local/BiorseoNath/data/modules/ISAURE/Motifs_derniere_version ")
return cmd
def create_command_F(name):
def create_command_F(name, estimator):
#cmd = ("python3 /mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/biorseo.py -i " +
cmd = ("python3 /local/local/BiorseoNath/biorseo.py -i " +
"/local/local/BiorseoNath/data/fasta/" +
......@@ -45,7 +45,7 @@ def create_command_F(name):
"-O results/ " +
"--contacts " +
"--patternmatch " +
"--func F --MFE -v " +
"--func F --" + estimator + " -v " +
"--biorseo-dir /local/local/BiorseoNath " +
"--modules-path /local/local/BiorseoNath/data/modules/ISAURE/Motifs_derniere_version ")
return cmd
......@@ -238,7 +238,7 @@ def set_axis_style(ax, labels):
ax.set_xlim(0.25, len(labels) + 0.75)
ax.set_xlabel('Sample name')
def visualization_best_mcc(list_struct2d, list_contacts, function, color, lines_color):
def visualization_best_mcc(list_struct2d, list_contacts, estimator, function, color, lines_color):
np_struct2d = np.array(list_struct2d)
np_contacts = np.array(list_contacts)
......@@ -268,7 +268,7 @@ def visualization_best_mcc(list_struct2d, list_contacts, function, color, lines_
for v in violins['bodies']:
v.set_facecolor(color)
plt.savefig('visualisation_16_06_MFE_' + function + '.png', bbox_inches='tight')
plt.savefig('visualisation_16_06_' + estimator + '_' + function + '.png', bbox_inches='tight')
def get_list_structs_contacts(path_benchmark, estimator, function):
myfile = open(path_benchmark, "r")
......@@ -333,7 +333,7 @@ def visualization_all_mcc(path_benchmark, estimator, function):
plt.figure(figsize=(25,4),dpi=200)
plt.xticks(rotation=90)
plt.boxplot(data)
plt.boxplot(data, medianprops=dict(color='black'))
for i in range(absciss):
y =data[i]
x = np.random.normal(1 + i, 0.04, size=len(y))
......@@ -356,7 +356,7 @@ def visualization_all_mcc(path_benchmark, estimator, function):
plt.figure(figsize=(25, 4), dpi=200)
plt.xticks(rotation=90)
plt.boxplot(data)
plt.boxplot(data, medianprops=dict(color='black'))
for i in range(absciss):
y = data[i]
x = np.random.normal(1 + i, 0.04, size=len(y))
......@@ -372,53 +372,90 @@ def visualization_all_mcc(path_benchmark, estimator, function):
#cmd1 = ("cppsrc/Scripts/countPattern")
#cmd2 = ("cppsrc/Scripts/deletePdb")
"""myfile = open("data/modules/ISAURE/Motifs_version_initiale/benchmark.txt", "r")
myfile = open("data/modules/ISAURE/Motifs_version_initiale/benchmark.txt", "r")
name = myfile.readline()
contacts = myfile.readline()
seq = myfile.readline()
structure2d = myfile.readline()
list_struct2d_E = []
list_contacts_E = []
list_struct2d_F = []
list_contacts_F = []
countE = 0
countF = 0
list_struct2d_E_MFE = []
list_contacts_E_MFE = []
list_struct2d_F_MFE = []
list_contacts_F_MFE = []
list_struct2d_E_MEA = []
list_contacts_E_MEA = []
list_struct2d_F_MEA = []
list_contacts_F_MEA = []
countE_MFE = 0
countF_MFE = 0
countE_MEA = 0
countF_MEA = 0
while seq:
name = name[6:].strip()
print(name)
run_test(cmd2 + " " + name + ".fa", log)
print(cmd2 + " " + name + ".fa")
cmd2 = ("cppsrc/Scripts/deletePdb " + name)
cmd3 = create_command_E(name)
cmd3 = create_command_E(name, 'MFE')
os.system(cmd3)
file_path = "results/test_" + name + ".json_pmE_MFE"
if os.path.isfile(file_path):
tabE = write_mcc_in_file_E(name, contacts, structure2d)
list_contacts_E.append(tabE[0])
list_struct2d_E.append(tabE[1])
countE = countE + 1
tabE_MFE = write_mcc_in_file_E(name, contacts, structure2d)
list_contacts_E_MFE.append(tabE_MFE[0])
list_struct2d_E_MFE.append(tabE_MFE[1])
countE_MFE = countE_MFE + 1
cmd3 = create_command_F(name)
cmd3 = create_command_F(name, 'MFE')
os.system(cmd3)
file_path = "results/test_" + name + ".json_pmF_MFE"
if os.path.isfile(file_path):
tabF = write_mcc_in_file_F(name, contacts, structure2d)
list_contacts_F.append(tabF[0])
list_struct2d_F.append(tabF[1])
countF = countF + 1
tabF_MFE = write_mcc_in_file_F(name, contacts, structure2d)
list_contacts_F_MFE.append(tabF_MFE[0])
list_struct2d_F_MFE.append(tabF_MFE[1])
countF_MFE = countF_MFE + 1
cmd3 = create_command_E(name, 'MEA')
os.system(cmd3)
file_path = "results/test_" + name + ".json_pmE_MEA"
if os.path.isfile(file_path):
tabE_MEA = write_mcc_in_file_E(name, contacts, structure2d)
list_contacts_E_MEA.append(tabE_MEA[0])
list_struct2d_E_MEA.append(tabE_MEA[1])
countE_MEA = countE_MEA + 1
cmd3 = create_command_F(name, 'MEA')
os.system(cmd3)
file_path = "results/test_" + name + ".json_pmF_MEA"
if os.path.isfile(file_path):
tabF_MEA = write_mcc_in_file_F(name, contacts, structure2d)
list_contacts_F_MEA.append(tabF_MEA[0])
list_struct2d_F_MEA.append(tabF_MEA[1])
countF_MEA = countF_MEA + 1
name = myfile.readline()
contacts = myfile.readline()
seq = myfile.readline()
structure2d = myfile.readline()
visualization_best_mcc(list_struct2d_E, list_contacts_E, 'E', 'red', '#900C3F')
visualization_best_mcc(list_struct2d_F, list_contacts_F, 'F', 'blue', '#0900FF')
print("countE: " + str(countE) + "\n")
print("countF: " + str(countF) + "\n")
myfile.close()"""
path_benchmark = "data/modules/ISAURE/Motifs_version_initiale/benchmark.txt"
visualization_all_mcc(path_benchmark,'MEA', 'F')
\ No newline at end of file
visualization_best_mcc(list_struct2d_E_MFE, list_contacts_E_MFE, 'MFE', 'E', 'red', '#900C3F')
visualization_best_mcc(list_struct2d_F_MFE, list_contacts_F_MFE, 'MFE', 'F', 'blue', '#0900FF')
visualization_best_mcc(list_struct2d_E_MEA, list_contacts_E_MEA, 'MEA', 'E', 'red', '#900C3F')
visualization_best_mcc(list_struct2d_F_MEA, list_contacts_F_MEA, 'MEA', 'F', 'blue', '#0900FF')
print("countE_MFE: " + str(countE_MFE) + "\n")
print("countF_MFE: " + str(countF_MFE) + "\n")
print("countE_MEA: " + str(countE_MEA) + "\n")
print("countF_MEA: " + str(countF_MEA) + "\n")
myfile.close()
#path_benchmark = "data/modules/ISAURE/Motifs_version_initiale/benchmark.txt"
#visualization_all_mcc(path_benchmark,'MEA', 'F')
#visualization_all_mcc(path_benchmark,'MEA', 'E')
#visualization_all_mcc(path_benchmark,'MFE', 'E')
#visualization_all_mcc(path_benchmark,'MFE', 'F')
\ No newline at end of file
......
......@@ -12,45 +12,31 @@
using namespace std;
using json = nlohmann::json;
void delete_redundant_pdb(const string& jsonlibrary, const string& fasta, const string& jsonoutfile) {
void delete_redundant_pdb(const string& jsonlibrary, const string& name, const string& jsonoutfile) {
std::ifstream lib(jsonlibrary);
std::ofstream outfile (jsonoutfile);
json new_motif;
json new_id;
json js = json::parse(lib);
std::ifstream file(fasta);
string pdb, seq;
std::getline(file, pdb);
std::getline(file, seq);
for (auto it = js.begin(); it != js.end(); ++it) {
string id = it.key();
vector<string> list_pdbs;
bool is_added = true;
//cout << "id: " << id << endl;
for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
string test = it2.key();
string field = it2.key();
if (!test.compare("pdb")) {
if (!field.compare("pdb")) {
vector<string> tab = it2.value();
list_pdbs = tab;
/*set<set<string>>::iterator iit;
set<string>::iterator iit2;
for(iit = list_pfams.begin(); iit != list_pfams.end(); iit++) {
for (iit2 = iit->begin(); iit2 != iit->end(); ++iit2) {
cout << *iit2 << endl;
}
cout << endl << endl;
}*/
} else {
new_id[test] = it2.value();
new_id[field] = it2.value();
}
}
if (count(list_pdbs.begin(), list_pdbs.end(), pdb.substr(6,pdb.size()))) {
if (count(list_pdbs.begin(), list_pdbs.end(), name.substr(0, name.size()-2))) {
is_added = false;
}
if (is_added) {
......@@ -66,10 +52,9 @@ void delete_redundant_pdb(const string& jsonlibrary, const string& fasta, const
int main(int argc, char** argv)
{
string jsonlibrary = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/motifs_final.json";
string fasta = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/fasta/";
string out = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_derniere_version/motifs_final.json";
fasta = fasta + argv[1];
delete_redundant_pdb(jsonlibrary, fasta, out);
string name = argv[1];
delete_redundant_pdb(jsonlibrary, name, out);
return 0;
}
......
......@@ -15,10 +15,19 @@ using json = nlohmann::json;
That script will remove from the library all the pattern that match ONLY with the sequence from which it comes from.
*/
vector<string> get_list_pdb_benchmark(const string& benchmark) {
struct data {
string pdb;
string seq_pdb;
string id;
string cmp;
};
typedef struct data data;
vector<data> get_list_pdb_benchmark(const string& benchmark) {
fstream bm(benchmark);
vector<string> list_pdb;
vector<data> list_pdb_seq;
if (bm.is_open()) {
string name;
string sequence;
......@@ -26,17 +35,20 @@ vector<string> get_list_pdb_benchmark(const string& benchmark) {
string contacts;
while (getline(bm, name)) {
data d;
int size = name.size();
name = name.substr(5,size-6);
list_pdb.push_back(name);
getline(bm, sequence);
d.pdb = name;
d.seq_pdb = sequence;
list_pdb_seq.push_back(d);
getline(bm, structure);
getline(bm, contacts);
}
bm.close();
}
return list_pdb;
return list_pdb_seq;
}
string trim(string str) {
......@@ -45,101 +57,118 @@ string trim(string str) {
return str;
}
string find_id_pattern(string& pdb_pattern, const string& benchmark) {
vector<string> l = get_list_pdb_benchmark(benchmark);
for (string pdb_bm : l) {
int size = pdb_bm.size();
string cmp = pdb_bm.substr(0, size-2);
data find_id_pattern(string& pdb_pattern, const string& benchmark) {
vector<data> l = get_list_pdb_benchmark(benchmark);
int size = l.size();
for (data d : l) {
string cmp = d.pdb;
cmp = cmp.substr(0, d.pdb.size()-2);
if (!cmp.compare(pdb_pattern)) {
return pdb_bm;
return d;
}
}
return string();
return data();
}
vector<pair<string, string>> find_id(const string& bibli, const string& benchmark) {
vector<data> find_id(const string& bibli, const string& benchmark) {
ifstream lib(bibli);
json js = json::parse(lib);
vector<pair<string, string>> association;
//nam seq_bm et id seq_id
vector<data> association;
for (auto it = js.begin(); it != js.end(); ++it) {
string id = it.key();
data d;
for (auto it2 = js[id].begin(); it2 != js[id].end(); ++it2) {
string field = it2.key();
string seq;
if (!field.compare("pdb")) {
int n = js[id][field].size();
for (int i = 0; i < n ; i++) {
ostringstream stream;
stream << js[id][field][i];
string pdb = trim(stream.str());
string pdb_complete = find_id_pattern(pdb, benchmark);
if (!(pdb_complete.empty())) {
pair<string, string> p;
p.first = pdb_complete;
p.second = id;
association.push_back(p);
}
d = find_id_pattern(pdb, benchmark);
}
}
if (!field.compare("sequence")) {
seq = it2.value();
if (!(d.pdb.empty())) {
d.id = id;
d.cmp = seq;
association.push_back(d);
}
}
}
}
lib.close();
cout << association.size() << endl;
return association;
}
bool does_it_match(const string& result, const string& id_motif) {
ifstream f_res(result);
if (f_res.is_open()) {
string name;
string seq;
string struc;
string contacts;
bool does_it_match(const string& seq, const string& seq_motif) {
size_t found = seq_motif.find("&");
size_t size = seq_motif.size();
vector<string> list_cmp;
if (found != std::string::npos) {
int count = 1;
string cmp = seq_motif.substr(0, found);
list_cmp.push_back(cmp);
while(found != std::string::npos) {
size_t begin = found;
found = seq_motif.find("&", found + 1);
cmp = seq_motif.substr(begin+1, found-begin-1);
list_cmp.push_back(cmp);
count++;
}
getline(f_res, name);
getline(f_res, seq);
while (getline(f_res, struc)) {
string motif_json = "JSON" + id_motif + " +";
if(struc.find(motif_json, 0) != string::npos) {
return true;
}
motif_json = "JSON" + id_motif + "\n";
if(struc.find(motif_json, 0) != string::npos) {
return true;
}
getline(f_res,contacts);
found = seq.find(list_cmp[0]);
int count2 = 1;
while((found != std::string::npos) && (count2 < count)) {
size_t begin = found;
found = seq.find(list_cmp[count2], found + 1);
count2++;
}
if(count == count2) {
return true;
}
} else {
found = seq.find(seq_motif);
if (found != std::string::npos) {
return true;
}
f_res.close();
}
return false;
}
vector<string> select_not_motif(const string& bibli, const string& benchmark) {
vector<string> selection;
vector<pair<string, string>> association = find_id(bibli, benchmark);
vector<string> list_bm = get_list_pdb_benchmark(benchmark);
string path_begin = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/results/test_";
string path_MFE_F = ".json_pmF_MEA";
vector<data> association = find_id(bibli, benchmark);
for (pair<string, string> p : association) {
string id_motif = p.second;
selection.push_back(id_motif);
for (data d : association) {
selection.push_back(d.id);
}
for (pair<string, string> p : association) {
cout << p.first << ", " << p.second << endl;
}
cout << "size: " << association.size() << endl;
for (string pdb : list_bm) {
string path_result = path_begin + pdb + path_MFE_F;
for (pair<string,string> pair : association) {
if (pair.first.substr(0, pair.first.size()-2).compare(pdb.substr(0, pdb.size()-2)) != 0) {
bool test = does_it_match(path_result, pair.second);
for (data d : association) {
for (data d2 : association) {
string seq = d.seq_pdb;
string seq2 = d2.cmp;
bool test = false;
if(d.pdb.substr(0, d.pdb.size()-2) != d2.pdb.substr(0, d2.pdb.size()-2)) {
test = does_it_match(seq, seq2);
if (test) {
//if (!(pair.second.compare("954"))) { cout << "p1: " << pair.first << "pdb: " << pdb << endl;}
auto position = find(selection.begin(), selection.end(), pair.second);
cout << "pdb: " << d.pdb << " vs " << d2.pdb << " " << d2.cmp << " " << d2.id << endl;
auto position = find(selection.begin(), selection.end(), d.id);
if (position != selection.end()) {
int index = position - selection.begin();
selection.erase(selection.begin() + index);
......@@ -161,20 +190,30 @@ int main()
string bibli = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_derniere_version/motifs_final.json";
string benchmark = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/data/modules/ISAURE/Motifs_version_initiale/benchmark.dbn";
/*vector<pair<string, string>> association = find_id(bibli, benchmark);
for (pair<string,string> p : association) {
cout << "<" << p.first << ", " << p.second << ">" << endl;
/*vector<data> v = get_list_pdb_benchmark(benchmark);
for (data d : v) {
cout << d.pdb << ", " << d.seq_pdb << endl;
}*/
/*string name = "1U6P_B";
data d = find_id_pattern(name, benchmark);
cout << "name: " << d.pdb << ", seq: " << d.seq_pdb << endl;*/
/*vector<data> association = find_id(bibli, benchmark);
for (data d : association) {
cout << "<" << d.pdb << ", " << d.seq_pdb << ">, " << "<" << d.id << ", " << d.cmp << ">" << endl;
}*/
/*string seq = "UGCGCUUGGCGUUUUAGAGCUAGAAAUAGCAAGUUAAAAUAAGGCUAGUCCGUUAUCAACUUGAAAAAGUGGCACCGAGUCGGUGCUU";
string seq_motif = "UGCGCUUGGCGUUUUAGAGC&GCAAGUUAAAAUAAGGCUAGUCCGUUAUCAA&UGGCACCGAGUCG&U";
bool test = does_it_match(seq, seq_motif);
cout << test << endl;*/
vector<string> selection = select_not_motif(bibli, benchmark);
for (string str : selection) {
cout << str << ", ";
}
cout << endl;
/*string result = "/mnt/c/Users/natha/Documents/IBISC/biorseo2/biorseo/results/test_1U6P_B.json_pmF_MEA";
bool test = does_it_match(result, "150");
cout << "test : " << test << endl;*/
return 0;
}
\ No newline at end of file
......