Showing
5 changed files
with
275 additions
and
82 deletions
cppsrc/Motif.cpp
0 → 100644
1 | +#include "Motif.h" | ||
2 | +#include <boost/algorithm/string.hpp> | ||
3 | +#include <boost/filesystem.hpp> | ||
4 | +#include <iostream> | ||
5 | +#include <regex> | ||
6 | +#include <sstream> | ||
7 | + | ||
8 | +using namespace boost::filesystem; | ||
9 | + | ||
10 | + | ||
11 | +struct recursive_directory_range { | ||
12 | + typedef recursive_directory_iterator iterator; | ||
13 | + recursive_directory_range(path p) : p_(p) {} | ||
14 | + | ||
15 | + iterator begin() { return recursive_directory_iterator(p_); } | ||
16 | + iterator end() { return recursive_directory_iterator(); } | ||
17 | + | ||
18 | + path p_; | ||
19 | +}; | ||
20 | + | ||
21 | + | ||
22 | +Motif::Motif(void) {} | ||
23 | + | ||
24 | + | ||
25 | +void Motif::build_from_desc(const string& descfile) | ||
26 | +{ | ||
27 | + std::ifstream motif; | ||
28 | + string line; | ||
29 | + string seq; | ||
30 | + vector<string> component_sequences; | ||
31 | + vector<string> bases; | ||
32 | + int last; | ||
33 | + | ||
34 | + PDBID = descfile.substr(0, descfile.find(".desc")); | ||
35 | + is_model_ = false; | ||
36 | + reversed_ = false; | ||
37 | + source_ = RNA3DMOTIF; | ||
38 | + | ||
39 | + motif = std::ifstream(descfile); | ||
40 | + std::getline(motif, line); // ignore "id: number" | ||
41 | + std::getline(motif, line); // Bases: 866_G 867_G 868_G 869_G 870_U 871_A ... | ||
42 | + split(bases, line, boost::is_any_of(" ")); // get a vector of 866_G, 867_G, etc... | ||
43 | + | ||
44 | + seq = bases[1].back(); | ||
45 | + last = std::stoi(bases[1].substr(0, bases[1].find('_'))); | ||
46 | + for (vector<string>::iterator b = bases.begin() + 2; b != bases.end(); b++) { | ||
47 | + char nt = b->back(); | ||
48 | + int pos = std::stoi(b->substr(0, b->find('_'))); | ||
49 | + | ||
50 | + if (pos - last > 5) { // finish this component and start a new one | ||
51 | + component_sequences.push_back(seq); | ||
52 | + seq = nt; | ||
53 | + } else if (pos - last == 1) { // we are on the same component | ||
54 | + seq += nt; | ||
55 | + } else if (pos - last == 2) { | ||
56 | + seq += '.' + nt; | ||
57 | + } else if (pos - last == 3) { | ||
58 | + seq += ".." + nt; | ||
59 | + } else if (pos - last == 4) { | ||
60 | + seq += "..." + nt; | ||
61 | + } else if (pos - last == 5) { | ||
62 | + seq += "...." + nt; | ||
63 | + } | ||
64 | + } | ||
65 | + // Now component_sequences is a vector of sequences like {AGCGC, CGU..GUUU} | ||
66 | + for (string& comp : component_sequences) { | ||
67 | + } | ||
68 | +} | ||
69 | + | ||
70 | + | ||
71 | +void Motif::load_from_csv(string csv_line) | ||
72 | +{ | ||
73 | + vector<string> tokens; | ||
74 | + split(tokens, csv_line, boost::is_any_of(",")); | ||
75 | + atlas_id = tokens[0]; | ||
76 | + score_ = stoi(tokens[2]); | ||
77 | + comp.push_back(Component(std::make_pair<int, int>(stoi(tokens[3]), stoi(tokens[4])))); | ||
78 | + if (tokens[5] != "-") comp.push_back(Component(std::make_pair<int, int>(stoi(tokens[5]), stoi(tokens[6])))); | ||
79 | + reversed_ = (tokens[1] == "True"); | ||
80 | + is_model_ = true; | ||
81 | + PDBID = ""; | ||
82 | + source_ = RNAMOTIFATLAS; | ||
83 | +} | ||
84 | + | ||
85 | + | ||
86 | +string Motif::pos_string(void) const | ||
87 | +{ | ||
88 | + std::stringstream s; | ||
89 | + s << atlas_id << " ( "; | ||
90 | + for (auto c : comp) s << c.pos.first << '-' << c.pos.second << ' '; | ||
91 | + s << ')'; | ||
92 | + return s.str(); | ||
93 | +} | ||
94 | + | ||
95 | +string Motif::get_identifier(void) const | ||
96 | +{ | ||
97 | + switch (source_) { | ||
98 | + case RNAMOTIFATLAS: return atlas_id; break; | ||
99 | + default: return PDBID; | ||
100 | + } | ||
101 | +} | ||
102 | + | ||
103 | + | ||
104 | + | ||
105 | + | ||
106 | + | ||
107 | + | ||
108 | +vector<Motif> load_desc_folder(const string& path, const string& rna) | ||
109 | +{ | ||
110 | + vector<Motif> posInsertionSites; | ||
111 | + | ||
112 | + if (!exists(path)) { | ||
113 | + std::cerr << "Hmh, i can't find that folder: " << path << std::endl; | ||
114 | + return posInsertionSites; | ||
115 | + } | ||
116 | + | ||
117 | + for (auto it : recursive_directory_range(path)) { | ||
118 | + if (is_desc_insertible(it.path().string(), rna)) { | ||
119 | + posInsertionSites.push_back(Motif()); | ||
120 | + posInsertionSites.back().build_from_desc(it.path().string()); | ||
121 | + } | ||
122 | + } | ||
123 | + return posInsertionSites; | ||
124 | +} | ||
125 | + | ||
126 | +vector<Motif> load_jar3d_output(const string& path) | ||
127 | +{ | ||
128 | + vector<Motif> posInsertionSites; | ||
129 | + std::ifstream motifs; | ||
130 | + string line; | ||
131 | + | ||
132 | + motifs = std::ifstream(path); | ||
133 | + std::getline(motifs, line); // skip header | ||
134 | + while (std::getline(motifs, line)) { | ||
135 | + posInsertionSites.push_back(Motif()); | ||
136 | + posInsertionSites.back().load_from_csv(line); | ||
137 | + } | ||
138 | + return posInsertionSites; | ||
139 | +} | ||
140 | + | ||
// Tells whether the motif described by a ".desc" file occurs in `rna`.
//
// The second line of the file ("Bases: 866_G 867_G 870_U ...") is turned into
// a regular expression: consecutive positions give consecutive nucleotides, a
// gap of 2 to 5 positions gives 1 to 4 '.' wildcards, and a larger gap gives
// ".{5,}". The expression is then searched in `rna`.
//
// descfile: path of the description file.
// rna:      sequence to search.
// verbose:  when true, the decision and the pattern are printed on std::cout.
// Returns true when the pattern is found; false otherwise, including when
// the file cannot be read or lists no base.
// Throws whatever std::stoi throws when a position is not a number.
bool is_desc_insertible(const string& descfile, const string& rna, bool verbose)
{
    std::ifstream motif(descfile);
    string        line;
    std::getline(motif, line);    // ignore "id: number"
    std::getline(motif, line);    // Bases: 866_G 867_G 868_G 869_G 870_U 871_A ...

    // Split on whitespace so that repeated or trailing blanks do not produce
    // empty tokens.
    std::istringstream fields(line);
    vector<string>     bases;
    for (string token; fields >> token;) bases.push_back(token);

    // bases[0] is the "Bases:" label; without a first real base there is no
    // pattern to look for.
    if (bases.size() < 2) return false;

    string seq(1, bases[1].back());
    int    last = std::stoi(bases[1].substr(0, bases[1].find('_')));
    for (vector<string>::iterator b = bases.begin() + 2; b != bases.end(); ++b) {
        const char nt  = b->back();
        const int  pos = std::stoi(b->substr(0, b->find('_')));
        const int  gap = pos - last;

        // The wildcard part and the nucleotide are appended separately:
        // writing "literal + nt" would offset the literal's pointer by the
        // character code instead of concatenating.
        if (gap > 5) {    // distant bases: at least five unknown positions
            seq += ".{5,}";
        } else if (gap > 1) {    // gap-1 unknown positions inside the component
            seq.append(static_cast<string::size_type>(gap - 1), '.');
        }
        if (gap >= 1) seq += nt;
        last = pos;
    }

    const bool   found = std::regex_search(rna, std::regex(seq));
    const string name  = descfile.substr(0, descfile.find(".desc"));
    if (verbose) {
        if (found)
            std::cout << "Motif " << name << " " << seq << " can be inserted." << std::endl;
        else
            std::cout << "Ignoring motif " << name << " " << seq << std::endl;
    }
    return found;
}
187 | + | ||
188 | + | ||
189 | +bool operator==(const Component& c1, const Component& c2) | ||
190 | +{ | ||
191 | + if (c1.pos.first != c2.pos.first) return false; | ||
192 | + if (c1.pos.second != c2.pos.second) return false; | ||
193 | + return true; | ||
194 | +} | ||
195 | + | ||
196 | +bool operator!=(const Component& c1, const Component& c2) { return not(c1 == c2); } | ||
197 | + | ||
198 | + | ||
199 | + | ||
200 | +bool operator==(const Motif& m1, const Motif& m2) | ||
201 | +{ | ||
202 | + if (m1.get_identifier() != m2.get_identifier()) return false; | ||
203 | + if (m1.score_ != m2.score_) return false; | ||
204 | + if (m1.reversed_ != m2.reversed_) return false; | ||
205 | + for (uint i = 0; i < m1.comp.size(); i++) | ||
206 | + if (m1.comp[i] != m2.comp[i]) return false; | ||
207 | + return true; | ||
208 | +} | ||
209 | + | ||
210 | +bool operator!=(const Motif& m1, const Motif& m2) { return not(m1 == m2); } |
cppsrc/Motif.h
0 → 100644
1 | +#ifndef MOTIF_H_ | ||
2 | +#define MOTIF_H_ | ||
3 | + | ||
4 | +#include <string> | ||
5 | +#include <vector> | ||
6 | + | ||
7 | +using std::pair; | ||
8 | +using std::string; | ||
9 | +using std::vector; | ||
10 | + | ||
// One component of a motif, identified by a pair of positions.
typedef struct Comp_ {
    pair<uint, uint> pos;     // (first, second) positions of the component
    size_t           k;       // 1 + pos.second - pos.first
    string           seq_;    // left empty by the constructor; presumably the component's sequence

    // Stores the bounds and derives k from them.
    Comp_(pair<int, int> p) : pos(p), k(1 + pos.second - pos.first) {}
} Component;
17 | + | ||
18 | + | ||
19 | + | ||
20 | +class Motif | ||
21 | +{ | ||
22 | + public: | ||
23 | + Motif(); | ||
24 | + void load_from_csv(string csv_line); | ||
25 | + void build_from_desc(const string& descfile); | ||
26 | + string pos_string(void) const; | ||
27 | + string get_origin(void) const; | ||
28 | + string get_identifier(void) const; | ||
29 | + vector<Component> comp; | ||
30 | + double score_; | ||
31 | + bool reversed_; | ||
32 | + | ||
33 | + private: | ||
34 | + string atlas_id; // if source = RNAMOTIFATLAS | ||
35 | + string PDBID; // if source = RNA3DMOTIF | ||
36 | + bool is_model_; // Wether the motif is a model or an extracted module from a 3D structure | ||
37 | + enum { RNA3DMOTIF = 1, RNAMOTIFATLAS = 2, CARNAVAL = 3 } source_; | ||
38 | +}; | ||
39 | + | ||
40 | +bool is_desc_insertible(const string& descfile, const string& rna); | ||
41 | +vector<Motif> load_desc_folder(const string& path); | ||
42 | +vector<Motif> load_jar3d_output(const string& path); | ||
43 | + | ||
44 | +// utilities to compare secondary structures: | ||
45 | +bool operator==(const Motif& m1, const Motif& m2); | ||
46 | +bool operator!=(const Motif& m1, const Motif& m2); | ||
47 | +bool operator==(const Component& c1, const Component& c2); | ||
48 | +bool operator!=(const Component& c1, const Component& c2); | ||
49 | + | ||
50 | +#endif // MOTIF_H_ | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
... | @@ -104,7 +104,7 @@ string SecondaryStructure::to_string(void) const | ... | @@ -104,7 +104,7 @@ string SecondaryStructure::to_string(void) const |
104 | { | 104 | { |
105 | string s; | 105 | string s; |
106 | s += to_DBN(); | 106 | s += to_DBN(); |
107 | - for (const Motif& m : motif_info_) s += " + " + m.atlas_id; | 107 | + for (const Motif& m : motif_info_) s += " + " + m.get_identifier(); |
108 | s += "\t" + boost::str(boost::format("%.7f") % objective_scores_[0]) + "\t" + | 108 | s += "\t" + boost::str(boost::format("%.7f") % objective_scores_[0]) + "\t" + |
109 | boost::str(boost::format("%.7f") % objective_scores_[1]); | 109 | boost::str(boost::format("%.7f") % objective_scores_[1]); |
110 | return s; | 110 | return s; |
... | @@ -164,7 +164,7 @@ bool basepair_sorter(pair<uint, uint>& i, pair<uint, uint>& j) | ... | @@ -164,7 +164,7 @@ bool basepair_sorter(pair<uint, uint>& i, pair<uint, uint>& j) |
164 | 164 | ||
165 | bool motif_sorter(Motif& m1, Motif& m2) | 165 | bool motif_sorter(Motif& m1, Motif& m2) |
166 | { | 166 | { |
167 | - if (m1.atlas_id.compare(m2.atlas_id) < 0) return true; | 167 | + if (m1.get_identifier().compare(m2.get_identifier()) < 0) return true; |
168 | return false; | 168 | return false; |
169 | } | 169 | } |
170 | 170 | ||
... | @@ -282,30 +282,6 @@ bool operator<=(const SecondaryStructure& s1, const SecondaryStructure& s2) | ... | @@ -282,30 +282,6 @@ bool operator<=(const SecondaryStructure& s1, const SecondaryStructure& s2) |
282 | return false; | 282 | return false; |
283 | } | 283 | } |
284 | 284 | ||
285 | -bool operator==(const Component& c1, const Component& c2) | ||
286 | -{ | ||
287 | - if (c1.pos.first != c2.pos.first) return false; | ||
288 | - if (c1.pos.second != c2.pos.second) return false; | ||
289 | - return true; | ||
290 | -} | ||
291 | - | ||
292 | -bool operator!=(const Component& c1, const Component& c2) { return not(c1 == c2); } | ||
293 | - | ||
294 | - | ||
295 | - | ||
296 | -bool operator==(const Motif& m1, const Motif& m2) | ||
297 | -{ | ||
298 | - if (m1.atlas_id != m2.atlas_id) return false; | ||
299 | - if (m1.score != m2.score) return false; | ||
300 | - if (m1.reversed != m2.reversed) return false; | ||
301 | - for (uint i = 0; i < m1.comp.size(); i++) | ||
302 | - if (m1.comp[i] != m2.comp[i]) return false; | ||
303 | - return true; | ||
304 | -} | ||
305 | - | ||
306 | -bool operator!=(const Motif& m1, const Motif& m2) { return not(m1 == m2); } | ||
307 | - | ||
308 | - | ||
309 | 285 | ||
310 | bool operator==(const SecondaryStructure& s1, const SecondaryStructure& s2) | 286 | bool operator==(const SecondaryStructure& s1, const SecondaryStructure& s2) |
311 | { | 287 | { | ... | ... |
1 | -#ifndef __INC_IP_SOL__ | 1 | +#ifndef SECONDARY_STRUCTURE_ |
2 | -#define __INC_IP_SOL__ | 2 | +#define SECONDARY_STRUCTURE_ |
3 | - | ||
4 | -#define IL_STD | ||
5 | 3 | ||
4 | +#include "Motif.h" | ||
6 | #include "rna.h" | 5 | #include "rna.h" |
7 | #include <iostream> | 6 | #include <iostream> |
8 | #include <string> | 7 | #include <string> |
... | @@ -12,28 +11,6 @@ using std::pair; | ... | @@ -12,28 +11,6 @@ using std::pair; |
12 | using std::string; | 11 | using std::string; |
13 | using std::vector; | 12 | using std::vector; |
14 | 13 | ||
15 | -typedef struct Comp_ { | ||
16 | - pair<uint, uint> pos; | ||
17 | - size_t k; | ||
18 | - Comp_(pair<int, int> p) : pos(p) { k = 1 + pos.second - pos.first; } | ||
19 | -} Component; | ||
20 | - | ||
21 | -typedef struct { | ||
22 | - string atlas_id; | ||
23 | - vector<Component> comp; | ||
24 | - bool reversed; | ||
25 | - int score; | ||
26 | - string pos_string(void) const | ||
27 | - { | ||
28 | - std::stringstream s; | ||
29 | - s << atlas_id << " ( "; | ||
30 | - for (auto c : comp) { | ||
31 | - s << c.pos.first << '-' << c.pos.second << ' '; | ||
32 | - } | ||
33 | - s << ')'; | ||
34 | - return s.str(); | ||
35 | - } | ||
36 | -} Motif; | ||
37 | 14 | ||
38 | class SecondaryStructure | 15 | class SecondaryStructure |
39 | { | 16 | { |
... | @@ -56,11 +33,11 @@ class SecondaryStructure | ... | @@ -56,11 +33,11 @@ class SecondaryStructure |
56 | 33 | ||
57 | vector<double> objective_scores_; // values of the different objective functions for that SecondaryStructure | 34 | vector<double> objective_scores_; // values of the different objective functions for that SecondaryStructure |
58 | vector<pair<uint, uint>> basepairs_; // values of the decision variable of the integer program | 35 | vector<pair<uint, uint>> basepairs_; // values of the decision variable of the integer program |
59 | - vector<Motif> motif_info_; // information about known motives in this secondary structure and their positions | 36 | + vector<Motif> motif_info_; // information about known motives in this secondary structure and their positions |
60 | - size_t n_; // length of the RNA | 37 | + size_t n_; // length of the RNA |
61 | - size_t nBP_; // number of basepairs | 38 | + size_t nBP_; // number of basepairs |
62 | - RNA rna_; // RNA object which is folded | 39 | + RNA rna_; // RNA object which is folded |
63 | - bool is_empty_structure; // Empty structure, returned when the solver does not find solutions anymore | 40 | + bool is_empty_structure; // Empty structure, returned when the solver does not find solutions anymore |
64 | }; | 41 | }; |
65 | 42 | ||
66 | // return if this SecondaryStructure s1 dominates s2 | 43 | // return if this SecondaryStructure s1 dominates s2 |
... | @@ -72,11 +49,7 @@ bool operator<=(const SecondaryStructure& s1, const SecondaryStructure& s2); | ... | @@ -72,11 +49,7 @@ bool operator<=(const SecondaryStructure& s1, const SecondaryStructure& s2); |
72 | // return wether SecondaryStructures are identical or not | 49 | // return wether SecondaryStructures are identical or not |
73 | bool operator==(const SecondaryStructure& s1, const SecondaryStructure& s2); | 50 | bool operator==(const SecondaryStructure& s1, const SecondaryStructure& s2); |
74 | bool operator!=(const SecondaryStructure& s1, const SecondaryStructure& s2); | 51 | bool operator!=(const SecondaryStructure& s1, const SecondaryStructure& s2); |
75 | -// utilities to compare secondary structures: | 52 | + |
76 | -bool operator==(const Motif& m1, const Motif& m2); | ||
77 | -bool operator!=(const Motif& m1, const Motif& m2); | ||
78 | -bool operator==(const Component& c1, const Component& c2); | ||
79 | -bool operator!=(const Component& c1, const Component& c2); | ||
80 | bool motif_sorter(Motif& m1, Motif& m2); | 53 | bool motif_sorter(Motif& m1, Motif& m2); |
81 | bool basepair_sorter(pair<uint, uint>& i, pair<uint, uint>& j); | 54 | bool basepair_sorter(pair<uint, uint>& i, pair<uint, uint>& j); |
82 | 55 | ||
... | @@ -85,4 +58,5 @@ inline void SecondaryStructure::set_objective_score(int i, double s) { objecti | ... | @@ -85,4 +58,5 @@ inline void SecondaryStructure::set_objective_score(int i, double s) { objecti |
85 | inline uint SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); } | 58 | inline uint SecondaryStructure::get_n_motifs(void) const { return motif_info_.size(); } |
86 | inline uint SecondaryStructure::get_n_bp(void) const { return nBP_; } | 59 | inline uint SecondaryStructure::get_n_bp(void) const { return nBP_; } |
87 | 60 | ||
88 | -#endif | 61 | + |
62 | +#endif // SECONDARY_STRUCTURE_ | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
... | @@ -3,7 +3,6 @@ | ... | @@ -3,7 +3,6 @@ |
3 | ***/ | 3 | ***/ |
4 | 4 | ||
5 | #include <algorithm> | 5 | #include <algorithm> |
6 | -#include <boost/algorithm/string.hpp> | ||
7 | #include <cstdlib> | 6 | #include <cstdlib> |
8 | #include <iostream> | 7 | #include <iostream> |
9 | #include <iterator> | 8 | #include <iterator> |
... | @@ -12,6 +11,7 @@ | ... | @@ -12,6 +11,7 @@ |
12 | #include <vector> | 11 | #include <vector> |
13 | 12 | ||
14 | #include "MOIP.h" | 13 | #include "MOIP.h" |
14 | +#include "Motif.h" | ||
15 | #include "fa.h" | 15 | #include "fa.h" |
16 | 16 | ||
17 | using namespace std; | 17 | using namespace std; |
... | @@ -49,19 +49,6 @@ string remove_ext(const char* mystr, char dot, char sep) | ... | @@ -49,19 +49,6 @@ string remove_ext(const char* mystr, char dot, char sep) |
49 | return string(retstr); | 49 | return string(retstr); |
50 | } | 50 | } |
51 | 51 | ||
52 | -Motif parse_csv_line(string line) | ||
53 | -{ | ||
54 | - vector<string> tokens; | ||
55 | - boost::split(tokens, line, boost::is_any_of(",")); | ||
56 | - Motif m; | ||
57 | - m.atlas_id = tokens[0]; | ||
58 | - m.score = stoi(tokens[2]); | ||
59 | - m.comp.push_back(Component(make_pair<int, int>(stoi(tokens[3]), stoi(tokens[4])))); | ||
60 | - if (tokens[5] != "-") m.comp.push_back(Component(make_pair<int, int>(stoi(tokens[5]), stoi(tokens[6])))); | ||
61 | - m.reversed = (tokens[1] == "True"); | ||
62 | - return m; | ||
63 | -} | ||
64 | - | ||
65 | int main(int argc, char* argv[]) | 52 | int main(int argc, char* argv[]) |
66 | { | 53 | { |
67 | /* ARGUMENT CHECKING */ | 54 | /* ARGUMENT CHECKING */ |
... | @@ -81,8 +68,6 @@ int main(int argc, char* argv[]) | ... | @@ -81,8 +68,6 @@ int main(int argc, char* argv[]) |
81 | string basename = remove_ext(inputName, '.', '/'); | 68 | string basename = remove_ext(inputName, '.', '/'); |
82 | float theta_p_threshold = atof(argv[3]); | 69 | float theta_p_threshold = atof(argv[3]); |
83 | list<Fasta> f; | 70 | list<Fasta> f; |
84 | - string line; | ||
85 | - ifstream motifs; | ||
86 | vector<Motif> posInsertionSites; | 71 | vector<Motif> posInsertionSites; |
87 | ofstream outfile; | 72 | ofstream outfile; |
88 | SecondaryStructure bestSSO1, bestSSO2; | 73 | SecondaryStructure bestSSO1, bestSSO2; |
... | @@ -108,9 +93,7 @@ int main(int argc, char* argv[]) | ... | @@ -108,9 +93,7 @@ int main(int argc, char* argv[]) |
108 | cerr << csvname << " not found" << endl; | 93 | cerr << csvname << " not found" << endl; |
109 | return EXIT_FAILURE; | 94 | return EXIT_FAILURE; |
110 | } | 95 | } |
111 | - motifs = ifstream(csvname); | 96 | + posInsertionSites = load_desc_folder(csvname); |
112 | - getline(motifs, line); // skip header | ||
113 | - while (getline(motifs, line)) posInsertionSites.push_back(parse_csv_line(line)); | ||
114 | if (verbose) | 97 | if (verbose) |
115 | cout << "\t>" << csvname << " successfuly loaded (" << posInsertionSites.size() << " insertion sites)" << endl; | 98 | cout << "\t>" << csvname << " successfuly loaded (" << posInsertionSites.size() << " insertion sites)" << endl; |
116 | 99 | ... | ... |
-
Please register or login to post a comment