Louis BECQUEY

Classe Motif & début de loader pour RNA3dMotif

1 +#include "Motif.h"
2 +#include <boost/algorithm/string.hpp>
3 +#include <boost/filesystem.hpp>
4 +#include <iostream>
5 +#include <regex>
6 +#include <sstream>
7 +
8 +using namespace boost::filesystem;
9 +
10 +
11 +struct recursive_directory_range {
12 + typedef recursive_directory_iterator iterator;
13 + recursive_directory_range(path p) : p_(p) {}
14 +
15 + iterator begin() { return recursive_directory_iterator(p_); }
16 + iterator end() { return recursive_directory_iterator(); }
17 +
18 + path p_;
19 +};
20 +
21 +
22 +Motif::Motif(void) {}
23 +
24 +
25 +void Motif::build_from_desc(const string& descfile)
26 +{
27 + std::ifstream motif;
28 + string line;
29 + string seq;
30 + vector<string> component_sequences;
31 + vector<string> bases;
32 + int last;
33 +
34 + PDBID = descfile.substr(0, descfile.find(".desc"));
35 + is_model_ = false;
36 + reversed_ = false;
37 + source_ = RNA3DMOTIF;
38 +
39 + motif = std::ifstream(descfile);
40 + std::getline(motif, line); // ignore "id: number"
41 + std::getline(motif, line); // Bases: 866_G 867_G 868_G 869_G 870_U 871_A ...
42 + split(bases, line, boost::is_any_of(" ")); // get a vector of 866_G, 867_G, etc...
43 +
44 + seq = bases[1].back();
45 + last = std::stoi(bases[1].substr(0, bases[1].find('_')));
46 + for (vector<string>::iterator b = bases.begin() + 2; b != bases.end(); b++) {
47 + char nt = b->back();
48 + int pos = std::stoi(b->substr(0, b->find('_')));
49 +
50 + if (pos - last > 5) { // finish this component and start a new one
51 + component_sequences.push_back(seq);
52 + seq = nt;
53 + } else if (pos - last == 1) { // we are on the same component
54 + seq += nt;
55 + } else if (pos - last == 2) {
56 + seq += '.' + nt;
57 + } else if (pos - last == 3) {
58 + seq += ".." + nt;
59 + } else if (pos - last == 4) {
60 + seq += "..." + nt;
61 + } else if (pos - last == 5) {
62 + seq += "...." + nt;
63 + }
64 + }
65 + // Now component_sequences is a vector of sequences like {AGCGC, CGU..GUUU}
66 + for (string& comp : component_sequences) {
67 + }
68 +}
69 +
70 +
71 +void Motif::load_from_csv(string csv_line)
72 +{
73 + vector<string> tokens;
74 + split(tokens, csv_line, boost::is_any_of(","));
75 + atlas_id = tokens[0];
76 + score_ = stoi(tokens[2]);
77 + comp.push_back(Component(std::make_pair<int, int>(stoi(tokens[3]), stoi(tokens[4]))));
78 + if (tokens[5] != "-") comp.push_back(Component(std::make_pair<int, int>(stoi(tokens[5]), stoi(tokens[6]))));
79 + reversed_ = (tokens[1] == "True");
80 + is_model_ = true;
81 + PDBID = "";
82 + source_ = RNAMOTIFATLAS;
83 +}
84 +
85 +
86 +string Motif::pos_string(void) const
87 +{
88 + std::stringstream s;
89 + s << atlas_id << " ( ";
90 + for (auto c : comp) s << c.pos.first << '-' << c.pos.second << ' ';
91 + s << ')';
92 + return s.str();
93 +}
94 +
95 +string Motif::get_identifier(void) const
96 +{
97 + switch (source_) {
98 + case RNAMOTIFATLAS: return atlas_id; break;
99 + default: return PDBID;
100 + }
101 +}
102 +
103 +
104 +
105 +
106 +
107 +
108 +vector<Motif> load_desc_folder(const string& path, const string& rna)
109 +{
110 + vector<Motif> posInsertionSites;
111 +
112 + if (!exists(path)) {
113 + std::cerr << "Hmh, i can't find that folder: " << path << std::endl;
114 + return posInsertionSites;
115 + }
116 +
117 + for (auto it : recursive_directory_range(path)) {
118 + if (is_desc_insertible(it.path().string(), rna)) {
119 + posInsertionSites.push_back(Motif());
120 + posInsertionSites.back().build_from_desc(it.path().string());
121 + }
122 + }
123 + return posInsertionSites;
124 +}
125 +
126 +vector<Motif> load_jar3d_output(const string& path)
127 +{
128 + vector<Motif> posInsertionSites;
129 + std::ifstream motifs;
130 + string line;
131 +
132 + motifs = std::ifstream(path);
133 + std::getline(motifs, line); // skip header
134 + while (std::getline(motifs, line)) {
135 + posInsertionSites.push_back(Motif());
136 + posInsertionSites.back().load_from_csv(line);
137 + }
138 + return posInsertionSites;
139 +}
140 +
// Turns a "Bases: 866_G 867_G 870_U ..." line into a regular expression.
// Consecutive positions give consecutive letters; a hole of 1 to 4 nucleotides
// becomes that many '.'; a jump of more than 5 positions becomes ".{5,}".
// Tokens are read as <position>_<nucleotide>; std::stoi throws if a token does
// not start with a number. Returns an empty string when the line has no base.
static std::string desc_bases_to_regex(const std::string& bases_line)
{
    std::istringstream tokens(bases_line);
    std::string token;
    tokens >> token;  // drop the leading "Bases:" label

    std::string pattern;
    int last = 0;
    while (tokens >> token) {  // operator>> never yields an empty token
        const char nt  = token.back();
        const int  pos = std::stoi(token.substr(0, token.find('_')));
        const int  gap = pos - last;

        if (pattern.empty()) {  // very first base of the motif
            pattern.assign(1, nt);
        } else if (gap > 5) {  // another component starts further away
            pattern += ".{5,}";
            pattern += nt;
        } else if (gap >= 1) {  // same component, gap - 1 unknown nucleotides in between
            pattern.append(gap - 1, '.');
            pattern += nt;
        }
        // gap <= 0: the base is ignored, as before
        last = pos;
    }
    return pattern;
}

// Tells whether the motif described in `descfile` can be placed on `rna`.
//   descfile: path of a ".desc" file; its second line lists the bases.
//   rna:      sequence searched with the regular expression built from the bases.
//   verbose:  when true, prints the decision and the pattern on stdout.
// Returns false when the file cannot be read or lists no base.
bool is_desc_insertible(const std::string& descfile, const std::string& rna, bool verbose)
{
    std::ifstream motif(descfile);
    std::string   line;
    std::getline(motif, line);  // ignore "id: number"
    if (!std::getline(motif, line)) return false;  // Bases: 866_G 867_G 868_G ...

    const std::string seq = desc_bases_to_regex(line);
    if (seq.empty()) return false;  // an empty regex would match any sequence

    const std::regex e(seq);
    std::smatch      m;
    const bool       found = std::regex_search(rna, m, e);

    if (verbose) {
        const std::string label = descfile.substr(0, descfile.find(".desc"));
        if (found)
            std::cout << "Motif " << label << " " << seq << " can be inserted." << std::endl;
        else
            std::cout << "Ignoring motif " << label << " " << seq << std::endl;
    }
    return found;
}
187 +
188 +
189 +bool operator==(const Component& c1, const Component& c2)
190 +{
191 + if (c1.pos.first != c2.pos.first) return false;
192 + if (c1.pos.second != c2.pos.second) return false;
193 + return true;
194 +}
195 +
196 +bool operator!=(const Component& c1, const Component& c2) { return not(c1 == c2); }
197 +
198 +
199 +
200 +bool operator==(const Motif& m1, const Motif& m2)
201 +{
202 + if (m1.get_identifier() != m2.get_identifier()) return false;
203 + if (m1.score_ != m2.score_) return false;
204 + if (m1.reversed_ != m2.reversed_) return false;
205 + for (uint i = 0; i < m1.comp.size(); i++)
206 + if (m1.comp[i] != m2.comp[i]) return false;
207 + return true;
208 +}
209 +
210 +bool operator!=(const Motif& m1, const Motif& m2) { return not(m1 == m2); }
1 +#ifndef MOTIF_H_
2 +#define MOTIF_H_
3 +
4 +#include <string>
5 +#include <vector>
6 +
7 +using std::pair;
8 +using std::string;
9 +using std::vector;
10 +
// One component of a motif: a pair of positions plus derived data.
struct Comp_ {
    // (first, second) positions. `unsigned int` is the same type as the
    // non-standard `uint` typedef used before, without depending on it.
    std::pair<unsigned int, unsigned int> pos;
    // 1 + pos.second - pos.first, computed on the unsigned values.
    std::size_t k;
    // Not set by the constructor; starts empty.
    std::string seq_;

    // Builds the component from a (first, second) pair of ints.
    Comp_(std::pair<int, int> p) : pos(p), k(1 + pos.second - pos.first) {}
};
using Component = Comp_;
17 +
18 +
19 +
20 +class Motif
21 +{
22 + public:
23 + Motif();
24 + void load_from_csv(string csv_line);
25 + void build_from_desc(const string& descfile);
26 + string pos_string(void) const;
27 + string get_origin(void) const;
28 + string get_identifier(void) const;
29 + vector<Component> comp;
30 + double score_;
31 + bool reversed_;
32 +
33 + private:
34 + string atlas_id; // if source = RNAMOTIFATLAS
35 + string PDBID; // if source = RNA3DMOTIF
36 + bool is_model_; // Wether the motif is a model or an extracted module from a 3D structure
37 + enum { RNA3DMOTIF = 1, RNAMOTIFATLAS = 2, CARNAVAL = 3 } source_;
38 +};
39 +
40 +bool is_desc_insertible(const string& descfile, const string& rna);
41 +vector<Motif> load_desc_folder(const string& path);
42 +vector<Motif> load_jar3d_output(const string& path);
43 +
44 +// utilities to compare secondary structures:
45 +bool operator==(const Motif& m1, const Motif& m2);
46 +bool operator!=(const Motif& m1, const Motif& m2);
47 +bool operator==(const Component& c1, const Component& c2);
48 +bool operator!=(const Component& c1, const Component& c2);
49 +
50 +#endif // MOTIF_H_
...\ No newline at end of file ...\ No newline at end of file
...@@ -104,7 +104,7 @@ string SecondaryStructure::to_string(void) const ...@@ -104,7 +104,7 @@ string SecondaryStructure::to_string(void) const
104 { 104 {
105 string s; 105 string s;
106 s += to_DBN(); 106 s += to_DBN();
107 - for (const Motif& m : motif_info_) s += " + " + m.atlas_id; 107 + for (const Motif& m : motif_info_) s += " + " + m.get_identifier();
108 s += "\t" + boost::str(boost::format("%.7f") % objective_scores_[0]) + "\t" + 108 s += "\t" + boost::str(boost::format("%.7f") % objective_scores_[0]) + "\t" +
109 boost::str(boost::format("%.7f") % objective_scores_[1]); 109 boost::str(boost::format("%.7f") % objective_scores_[1]);
110 return s; 110 return s;
...@@ -164,7 +164,7 @@ bool basepair_sorter(pair<uint, uint>& i, pair<uint, uint>& j) ...@@ -164,7 +164,7 @@ bool basepair_sorter(pair<uint, uint>& i, pair<uint, uint>& j)
164 164
165 bool motif_sorter(Motif& m1, Motif& m2) 165 bool motif_sorter(Motif& m1, Motif& m2)
166 { 166 {
167 - if (m1.atlas_id.compare(m2.atlas_id) < 0) return true; 167 + if (m1.get_identifier().compare(m2.get_identifier()) < 0) return true;
168 return false; 168 return false;
169 } 169 }
170 170
...@@ -282,30 +282,6 @@ bool operator<=(const SecondaryStructure& s1, const SecondaryStructure& s2) ...@@ -282,30 +282,6 @@ bool operator<=(const SecondaryStructure& s1, const SecondaryStructure& s2)
282 return false; 282 return false;
283 } 283 }
284 284
285 -bool operator==(const Component& c1, const Component& c2)
286 -{
287 - if (c1.pos.first != c2.pos.first) return false;
288 - if (c1.pos.second != c2.pos.second) return false;
289 - return true;
290 -}
291 -
292 -bool operator!=(const Component& c1, const Component& c2) { return not(c1 == c2); }
293 -
294 -
295 -
296 -bool operator==(const Motif& m1, const Motif& m2)
297 -{
298 - if (m1.atlas_id != m2.atlas_id) return false;
299 - if (m1.score != m2.score) return false;
300 - if (m1.reversed != m2.reversed) return false;
301 - for (uint i = 0; i < m1.comp.size(); i++)
302 - if (m1.comp[i] != m2.comp[i]) return false;
303 - return true;
304 -}
305 -
306 -bool operator!=(const Motif& m1, const Motif& m2) { return not(m1 == m2); }
307 -
308 -
309 285
310 bool operator==(const SecondaryStructure& s1, const SecondaryStructure& s2) 286 bool operator==(const SecondaryStructure& s1, const SecondaryStructure& s2)
311 { 287 {
......
1 -#ifndef __INC_IP_SOL__ 1 +#ifndef SECONDARY_STRUCTURE_
2 -#define __INC_IP_SOL__ 2 +#define SECONDARY_STRUCTURE_
3 -
4 -#define IL_STD
5 3
4 +#include "Motif.h"
6 #include "rna.h" 5 #include "rna.h"
7 #include <iostream> 6 #include <iostream>
8 #include <string> 7 #include <string>
...@@ -12,28 +11,6 @@ using std::pair; ...@@ -12,28 +11,6 @@ using std::pair;
12 using std::string; 11 using std::string;
13 using std::vector; 12 using std::vector;
14 13
15 -typedef struct Comp_ {
16 - pair<uint, uint> pos;
17 - size_t k;
18 - Comp_(pair<int, int> p) : pos(p) { k = 1 + pos.second - pos.first; }
19 -} Component;
20 -
21 -typedef struct {
22 - string atlas_id;
23 - vector<Component> comp;
24 - bool reversed;
25 - int score;
26 - string pos_string(void) const
27 - {
28 - std::stringstream s;
29 - s << atlas_id << " ( ";
30 - for (auto c : comp) {
31 - s << c.pos.first << '-' << c.pos.second << ' ';
32 - }
33 - s << ')';
34 - return s.str();
35 - }
36 -} Motif;
37 14
38 class SecondaryStructure 15 class SecondaryStructure
39 { 16 {
...@@ -56,11 +33,11 @@ class SecondaryStructure ...@@ -56,11 +33,11 @@ class SecondaryStructure
56 33
57 vector<double> objective_scores_; // values of the different objective functions for that SecondaryStructure 34 vector<double> objective_scores_; // values of the different objective functions for that SecondaryStructure
58 vector<pair<uint, uint>> basepairs_; // values of the decision variable of the integer program 35 vector<pair<uint, uint>> basepairs_; // values of the decision variable of the integer program
59 - vector<Motif> motif_info_; // information about known motives in this secondary structure and their positions 36 + vector<Motif> motif_info_; // information about known motives in this secondary structure and their positions
60 - size_t n_; // length of the RNA 37 + size_t n_; // length of the RNA
61 - size_t nBP_; // number of basepairs 38 + size_t nBP_; // number of basepairs
62 - RNA rna_; // RNA object which is folded 39 + RNA rna_; // RNA object which is folded
63 - bool is_empty_structure; // Empty structure, returned when the solver does not find solutions anymore 40 + bool is_empty_structure; // Empty structure, returned when the solver does not find solutions anymore
64 }; 41 };
65 42
66 // return if this SecondaryStructure s1 dominates s2 43 // return if this SecondaryStructure s1 dominates s2
...@@ -72,11 +49,7 @@ bool operator<=(const SecondaryStructure& s1, const SecondaryStructure& s2); ...@@ -72,11 +49,7 @@ bool operator<=(const SecondaryStructure& s1, const SecondaryStructure& s2);
72 // return wether SecondaryStructures are identical or not 49 // return wether SecondaryStructures are identical or not
73 bool operator==(const SecondaryStructure& s1, const SecondaryStructure& s2); 50 bool operator==(const SecondaryStructure& s1, const SecondaryStructure& s2);
74 bool operator!=(const SecondaryStructure& s1, const SecondaryStructure& s2); 51 bool operator!=(const SecondaryStructure& s1, const SecondaryStructure& s2);
75 -// utilities to compare secondary structures: 52 +
76 -bool operator==(const Motif& m1, const Motif& m2);
77 -bool operator!=(const Motif& m1, const Motif& m2);
78 -bool operator==(const Component& c1, const Component& c2);
79 -bool operator!=(const Component& c1, const Component& c2);
80 bool motif_sorter(Motif& m1, Motif& m2); 53 bool motif_sorter(Motif& m1, Motif& m2);
81 bool basepair_sorter(pair<uint, uint>& i, pair<uint, uint>& j); 54 bool basepair_sorter(pair<uint, uint>& i, pair<uint, uint>& j);
82 55
...@@ -85,4 +58,5 @@ inline void SecondaryStructure::set_objective_score(int i, double s) { objecti ...@@ -85,4 +58,5 @@ inline void SecondaryStructure::set_objective_score(int i, double s) { objecti
85 inline uint SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); } 58 inline uint SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); }
86 inline uint SecondaryStructure::get_n_bp(void) const { return nBP_; } 59 inline uint SecondaryStructure::get_n_bp(void) const { return nBP_; }
87 60
88 -#endif 61 +
62 +#endif // SECONDARY_STRUCTURE_
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
3 ***/ 3 ***/
4 4
5 #include <algorithm> 5 #include <algorithm>
6 -#include <boost/algorithm/string.hpp>
7 #include <cstdlib> 6 #include <cstdlib>
8 #include <iostream> 7 #include <iostream>
9 #include <iterator> 8 #include <iterator>
...@@ -12,6 +11,7 @@ ...@@ -12,6 +11,7 @@
12 #include <vector> 11 #include <vector>
13 12
14 #include "MOIP.h" 13 #include "MOIP.h"
14 +#include "Motif.h"
15 #include "fa.h" 15 #include "fa.h"
16 16
17 using namespace std; 17 using namespace std;
...@@ -49,19 +49,6 @@ string remove_ext(const char* mystr, char dot, char sep) ...@@ -49,19 +49,6 @@ string remove_ext(const char* mystr, char dot, char sep)
49 return string(retstr); 49 return string(retstr);
50 } 50 }
51 51
52 -Motif parse_csv_line(string line)
53 -{
54 - vector<string> tokens;
55 - boost::split(tokens, line, boost::is_any_of(","));
56 - Motif m;
57 - m.atlas_id = tokens[0];
58 - m.score = stoi(tokens[2]);
59 - m.comp.push_back(Component(make_pair<int, int>(stoi(tokens[3]), stoi(tokens[4]))));
60 - if (tokens[5] != "-") m.comp.push_back(Component(make_pair<int, int>(stoi(tokens[5]), stoi(tokens[6]))));
61 - m.reversed = (tokens[1] == "True");
62 - return m;
63 -}
64 -
65 int main(int argc, char* argv[]) 52 int main(int argc, char* argv[])
66 { 53 {
67 /* ARGUMENT CHECKING */ 54 /* ARGUMENT CHECKING */
...@@ -81,8 +68,6 @@ int main(int argc, char* argv[]) ...@@ -81,8 +68,6 @@ int main(int argc, char* argv[])
81 string basename = remove_ext(inputName, '.', '/'); 68 string basename = remove_ext(inputName, '.', '/');
82 float theta_p_threshold = atof(argv[3]); 69 float theta_p_threshold = atof(argv[3]);
83 list<Fasta> f; 70 list<Fasta> f;
84 - string line;
85 - ifstream motifs;
86 vector<Motif> posInsertionSites; 71 vector<Motif> posInsertionSites;
87 ofstream outfile; 72 ofstream outfile;
88 SecondaryStructure bestSSO1, bestSSO2; 73 SecondaryStructure bestSSO1, bestSSO2;
...@@ -108,9 +93,7 @@ int main(int argc, char* argv[]) ...@@ -108,9 +93,7 @@ int main(int argc, char* argv[])
108 cerr << csvname << " not found" << endl; 93 cerr << csvname << " not found" << endl;
109 return EXIT_FAILURE; 94 return EXIT_FAILURE;
110 } 95 }
111 - motifs = ifstream(csvname); 96 + posInsertionSites = load_desc_folder(csvname);
112 - getline(motifs, line); // skip header
113 - while (getline(motifs, line)) posInsertionSites.push_back(parse_csv_line(line));
114 if (verbose) 97 if (verbose)
115 cout << "\t>" << csvname << " successfuly loaded (" << posInsertionSites.size() << " insertion sites)" << endl; 98 cout << "\t>" << csvname << " successfuly loaded (" << posInsertionSites.size() << " insertion sites)" << endl;
116 99
......