Louis BECQUEY

Classe Motif & début de loader pour RNA3dMotif

#include "Motif.h"
#include <boost/algorithm/string.hpp>
#include <boost/filesystem.hpp>
#include <iostream>
#include <regex>
#include <sstream>
using namespace boost::filesystem;
struct recursive_directory_range {
typedef recursive_directory_iterator iterator;
recursive_directory_range(path p) : p_(p) {}
iterator begin() { return recursive_directory_iterator(p_); }
iterator end() { return recursive_directory_iterator(); }
path p_;
};
Motif::Motif(void) {}
void Motif::build_from_desc(const string& descfile)
{
std::ifstream motif;
string line;
string seq;
vector<string> component_sequences;
vector<string> bases;
int last;
PDBID = descfile.substr(0, descfile.find(".desc"));
is_model_ = false;
reversed_ = false;
source_ = RNA3DMOTIF;
motif = std::ifstream(descfile);
std::getline(motif, line); // ignore "id: number"
std::getline(motif, line); // Bases: 866_G 867_G 868_G 869_G 870_U 871_A ...
split(bases, line, boost::is_any_of(" ")); // get a vector of 866_G, 867_G, etc...
seq = bases[1].back();
last = std::stoi(bases[1].substr(0, bases[1].find('_')));
for (vector<string>::iterator b = bases.begin() + 2; b != bases.end(); b++) {
char nt = b->back();
int pos = std::stoi(b->substr(0, b->find('_')));
if (pos - last > 5) { // finish this component and start a new one
component_sequences.push_back(seq);
seq = nt;
} else if (pos - last == 1) { // we are on the same component
seq += nt;
} else if (pos - last == 2) {
seq += '.' + nt;
} else if (pos - last == 3) {
seq += ".." + nt;
} else if (pos - last == 4) {
seq += "..." + nt;
} else if (pos - last == 5) {
seq += "...." + nt;
}
}
// Now component_sequences is a vector of sequences like {AGCGC, CGU..GUUU}
for (string& comp : component_sequences) {
}
}
void Motif::load_from_csv(string csv_line)
{
vector<string> tokens;
split(tokens, csv_line, boost::is_any_of(","));
atlas_id = tokens[0];
score_ = stoi(tokens[2]);
comp.push_back(Component(std::make_pair<int, int>(stoi(tokens[3]), stoi(tokens[4]))));
if (tokens[5] != "-") comp.push_back(Component(std::make_pair<int, int>(stoi(tokens[5]), stoi(tokens[6]))));
reversed_ = (tokens[1] == "True");
is_model_ = true;
PDBID = "";
source_ = RNAMOTIFATLAS;
}
string Motif::pos_string(void) const
{
std::stringstream s;
s << atlas_id << " ( ";
for (auto c : comp) s << c.pos.first << '-' << c.pos.second << ' ';
s << ')';
return s.str();
}
string Motif::get_identifier(void) const
{
switch (source_) {
case RNAMOTIFATLAS: return atlas_id; break;
default: return PDBID;
}
}
vector<Motif> load_desc_folder(const string& path, const string& rna)
{
vector<Motif> posInsertionSites;
if (!exists(path)) {
std::cerr << "Hmh, i can't find that folder: " << path << std::endl;
return posInsertionSites;
}
for (auto it : recursive_directory_range(path)) {
if (is_desc_insertible(it.path().string(), rna)) {
posInsertionSites.push_back(Motif());
posInsertionSites.back().build_from_desc(it.path().string());
}
}
return posInsertionSites;
}
vector<Motif> load_jar3d_output(const string& path)
{
vector<Motif> posInsertionSites;
std::ifstream motifs;
string line;
motifs = std::ifstream(path);
std::getline(motifs, line); // skip header
while (std::getline(motifs, line)) {
posInsertionSites.push_back(Motif());
posInsertionSites.back().load_from_csv(line);
}
return posInsertionSites;
}
bool is_desc_insertible(const string& descfile, const string& rna, bool verbose)
{
std::ifstream motif;
string line;
string seq;
vector<string> bases;
int last;
motif = std::ifstream(descfile);
std::getline(motif, line); // ignore "id: number"
std::getline(motif, line); // Bases: 866_G 867_G 868_G 869_G 870_U 871_A ...
split(bases, line, boost::is_any_of(" ")); // get a vector of 866_G, 867_G, etc...
seq = bases[1].back();
last = std::stoi(bases[1].substr(0, bases[1].find('_')));
for (vector<string>::iterator b = bases.begin() + 2; b != bases.end(); b++) {
char nt = b->back();
int pos = std::stoi(b->substr(0, b->find('_')));
if (pos - last > 5) { // finish this component and start a new one
seq += ".{5,}" + nt;
} else if (pos - last == 1) { // we are on the same component
seq += nt;
} else if (pos - last == 2) {
seq += "." + nt;
} else if (pos - last == 3) {
seq += ".." + nt;
} else if (pos - last == 4) {
seq += "..." + nt;
} else if (pos - last == 5) {
seq += "...." + nt;
}
last = pos;
}
std::smatch m;
std::regex e(seq);
if (std::regex_search(rna, m, e)) {
if (verbose)
std::cout << "Motif " << descfile.substr(0, descfile.find(".desc")) << " " << seq << " can be inserted." << std::endl;
return true;
} else {
if (verbose)
std::cout << "Ignoring motif " << descfile.substr(0, descfile.find(".desc")) << " " << seq << std::endl;
return false;
}
}
bool operator==(const Component& c1, const Component& c2)
{
if (c1.pos.first != c2.pos.first) return false;
if (c1.pos.second != c2.pos.second) return false;
return true;
}
bool operator!=(const Component& c1, const Component& c2) { return not(c1 == c2); }
bool operator==(const Motif& m1, const Motif& m2)
{
if (m1.get_identifier() != m2.get_identifier()) return false;
if (m1.score_ != m2.score_) return false;
if (m1.reversed_ != m2.reversed_) return false;
for (uint i = 0; i < m1.comp.size(); i++)
if (m1.comp[i] != m2.comp[i]) return false;
return true;
}
bool operator!=(const Motif& m1, const Motif& m2) { return not(m1 == m2); }
#ifndef MOTIF_H_
#define MOTIF_H_
#include <string>
#include <vector>
using std::pair;
using std::string;
using std::vector;
typedef struct Comp_ {
pair<uint, uint> pos;
size_t k;
string seq_;
Comp_(pair<int, int> p) : pos(p) { k = 1 + pos.second - pos.first; }
} Component;
class Motif
{
public:
Motif();
void load_from_csv(string csv_line);
void build_from_desc(const string& descfile);
string pos_string(void) const;
string get_origin(void) const;
string get_identifier(void) const;
vector<Component> comp;
double score_;
bool reversed_;
private:
string atlas_id; // if source = RNAMOTIFATLAS
string PDBID; // if source = RNA3DMOTIF
bool is_model_; // Wether the motif is a model or an extracted module from a 3D structure
enum { RNA3DMOTIF = 1, RNAMOTIFATLAS = 2, CARNAVAL = 3 } source_;
};
bool is_desc_insertible(const string& descfile, const string& rna);
vector<Motif> load_desc_folder(const string& path);
vector<Motif> load_jar3d_output(const string& path);
// utilities to compare secondary structures:
bool operator==(const Motif& m1, const Motif& m2);
bool operator!=(const Motif& m1, const Motif& m2);
bool operator==(const Component& c1, const Component& c2);
bool operator!=(const Component& c1, const Component& c2);
#endif // MOTIF_H_
\ No newline at end of file
......@@ -104,7 +104,7 @@ string SecondaryStructure::to_string(void) const
{
string s;
s += to_DBN();
for (const Motif& m : motif_info_) s += " + " + m.atlas_id;
for (const Motif& m : motif_info_) s += " + " + m.get_identifier();
s += "\t" + boost::str(boost::format("%.7f") % objective_scores_[0]) + "\t" +
boost::str(boost::format("%.7f") % objective_scores_[1]);
return s;
......@@ -164,7 +164,7 @@ bool basepair_sorter(pair<uint, uint>& i, pair<uint, uint>& j)
bool motif_sorter(Motif& m1, Motif& m2)
{
if (m1.atlas_id.compare(m2.atlas_id) < 0) return true;
if (m1.get_identifier().compare(m2.get_identifier()) < 0) return true;
return false;
}
......@@ -282,30 +282,6 @@ bool operator<=(const SecondaryStructure& s1, const SecondaryStructure& s2)
return false;
}
bool operator==(const Component& c1, const Component& c2)
{
if (c1.pos.first != c2.pos.first) return false;
if (c1.pos.second != c2.pos.second) return false;
return true;
}
bool operator!=(const Component& c1, const Component& c2) { return not(c1 == c2); }
bool operator==(const Motif& m1, const Motif& m2)
{
if (m1.atlas_id != m2.atlas_id) return false;
if (m1.score != m2.score) return false;
if (m1.reversed != m2.reversed) return false;
for (uint i = 0; i < m1.comp.size(); i++)
if (m1.comp[i] != m2.comp[i]) return false;
return true;
}
bool operator!=(const Motif& m1, const Motif& m2) { return not(m1 == m2); }
bool operator==(const SecondaryStructure& s1, const SecondaryStructure& s2)
{
......
#ifndef __INC_IP_SOL__
#define __INC_IP_SOL__
#define IL_STD
#ifndef SECONDARY_STRUCTURE_
#define SECONDARY_STRUCTURE_
#include "Motif.h"
#include "rna.h"
#include <iostream>
#include <string>
......@@ -12,28 +11,6 @@ using std::pair;
using std::string;
using std::vector;
typedef struct Comp_ {
pair<uint, uint> pos;
size_t k;
Comp_(pair<int, int> p) : pos(p) { k = 1 + pos.second - pos.first; }
} Component;
typedef struct {
string atlas_id;
vector<Component> comp;
bool reversed;
int score;
string pos_string(void) const
{
std::stringstream s;
s << atlas_id << " ( ";
for (auto c : comp) {
s << c.pos.first << '-' << c.pos.second << ' ';
}
s << ')';
return s.str();
}
} Motif;
class SecondaryStructure
{
......@@ -56,11 +33,11 @@ class SecondaryStructure
vector<double> objective_scores_; // values of the different objective functions for that SecondaryStructure
vector<pair<uint, uint>> basepairs_; // values of the decision variable of the integer program
vector<Motif> motif_info_; // information about known motives in this secondary structure and their positions
size_t n_; // length of the RNA
size_t nBP_; // number of basepairs
RNA rna_; // RNA object which is folded
bool is_empty_structure; // Empty structure, returned when the solver does not find solutions anymore
vector<Motif> motif_info_; // information about known motives in this secondary structure and their positions
size_t n_; // length of the RNA
size_t nBP_; // number of basepairs
RNA rna_; // RNA object which is folded
bool is_empty_structure; // Empty structure, returned when the solver does not find solutions anymore
};
// return if this SecondaryStructure s1 dominates s2
......@@ -72,11 +49,7 @@ bool operator<=(const SecondaryStructure& s1, const SecondaryStructure& s2);
// return wether SecondaryStructures are identical or not
bool operator==(const SecondaryStructure& s1, const SecondaryStructure& s2);
bool operator!=(const SecondaryStructure& s1, const SecondaryStructure& s2);
// utilities to compare secondary structures:
bool operator==(const Motif& m1, const Motif& m2);
bool operator!=(const Motif& m1, const Motif& m2);
bool operator==(const Component& c1, const Component& c2);
bool operator!=(const Component& c1, const Component& c2);
bool motif_sorter(Motif& m1, Motif& m2);
bool basepair_sorter(pair<uint, uint>& i, pair<uint, uint>& j);
......@@ -85,4 +58,5 @@ inline void SecondaryStructure::set_objective_score(int i, double s) { objecti
inline uint SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); }
inline uint SecondaryStructure::get_n_bp(void) const { return nBP_; }
#endif
#endif // SECONDARY_STRUCTURE_
\ No newline at end of file
......
......@@ -3,7 +3,6 @@
***/
#include <algorithm>
#include <boost/algorithm/string.hpp>
#include <cstdlib>
#include <iostream>
#include <iterator>
......@@ -12,6 +11,7 @@
#include <vector>
#include "MOIP.h"
#include "Motif.h"
#include "fa.h"
using namespace std;
......@@ -49,19 +49,6 @@ string remove_ext(const char* mystr, char dot, char sep)
return string(retstr);
}
Motif parse_csv_line(string line)
{
vector<string> tokens;
boost::split(tokens, line, boost::is_any_of(","));
Motif m;
m.atlas_id = tokens[0];
m.score = stoi(tokens[2]);
m.comp.push_back(Component(make_pair<int, int>(stoi(tokens[3]), stoi(tokens[4]))));
if (tokens[5] != "-") m.comp.push_back(Component(make_pair<int, int>(stoi(tokens[5]), stoi(tokens[6]))));
m.reversed = (tokens[1] == "True");
return m;
}
int main(int argc, char* argv[])
{
/* ARGUMENT CHECKING */
......@@ -81,8 +68,6 @@ int main(int argc, char* argv[])
string basename = remove_ext(inputName, '.', '/');
float theta_p_threshold = atof(argv[3]);
list<Fasta> f;
string line;
ifstream motifs;
vector<Motif> posInsertionSites;
ofstream outfile;
SecondaryStructure bestSSO1, bestSSO2;
......@@ -108,9 +93,7 @@ int main(int argc, char* argv[])
cerr << csvname << " not found" << endl;
return EXIT_FAILURE;
}
motifs = ifstream(csvname);
getline(motifs, line); // skip header
while (getline(motifs, line)) posInsertionSites.push_back(parse_csv_line(line));
posInsertionSites = load_desc_folder(csvname);
if (verbose)
cout << "\t>" << csvname << " successfuly loaded (" << posInsertionSites.size() << " insertion sites)" << endl;
......