Louis BECQUEY

Big cleanup for JSON format support

This diff is collapsed. Click to expand it.
......@@ -40,12 +40,9 @@ re: remove clean all
# clean: remove the object files listed in $(OBJECTS) and the LaTeX by-products
# (.bbl, .blg, .synctex.gz, .log, .aux, converted EPS) of the two documents in doc/.
# $(rm) and $(OBJECTS) are defined outside this excerpt.
.PHONY: clean
clean:
$(rm) $(OBJECTS)
$(rm) doc/supplementary_material.bbl doc/supplementary_material.blg doc/supplementary_material.synctex.gz doc/supplementary_material.log doc/supplementary_material.aux
$(rm) doc/main_bioinformatics.bbl doc/main_bioinformatics.blg doc/main_bioinformatics.synctex.gz doc/main_bioinformatics.log doc/main_bioinformatics.aux doc/OUP_First_SBk_Bot_8401-eps-converted-to.pdf
@echo -e "\033[00;32mCleanup completed.\033[00m"
# remove: delete the built executable $(BINDIR)/$(TARGET) and the two generated PDF
# documents in doc/. The leading '@' keeps make from echoing the commands themselves.
# $(rm), $(BINDIR) and $(TARGET) are defined outside this excerpt.
.PHONY: remove
remove:
@$(rm) $(BINDIR)/$(TARGET)
@$(rm) doc/main_bioinformatics.pdf doc/supplementary_material.pdf
@echo -e "\033[00;32mExecutable and docs removed!\033[00m"
......
This diff is collapsed. Click to expand it.
......@@ -12,12 +12,19 @@
using std::vector;
typedef struct args_ {
typedef struct argsf_ {
path motif_file;
std::mutex& posInsertionSites_mutex;
args_(path motif_file_, mutex& mutex_) : motif_file(motif_file_), posInsertionSites_mutex(mutex_) {}
} args_of_parallel_func;
argsf_(path motif_file_, mutex& mutex_)
: motif_file(motif_file_), posInsertionSites_mutex(mutex_) {}
} file_and_mutex;
typedef struct argsm_ {
json_elem motif;
std::mutex& posInsertionSites_mutex;
argsm_(json_elem& motif_, mutex& mutex_)
: motif(motif_), posInsertionSites_mutex(mutex_) {}
} motif_and_mutex;
class MOIP
{
......@@ -56,9 +63,9 @@ class MOIP
bool exists_vertical_outdated_labels(const SecondaryStructure& s) const;
bool exists_horizontal_outdated_labels(const SecondaryStructure& s) const;
void allowed_motifs_from_desc(args_of_parallel_func arg_struct);
void allowed_motifs_from_rin(args_of_parallel_func arg_struct);
void allowed_motifs_from_json(args_of_parallel_func arg_struct, vector<pair<uint, char>> errors_id);
void allowed_motifs_from_desc(file_and_mutex arg_struct);
void allowed_motifs_from_rin(file_and_mutex arg_struct);
void allowed_motifs_from_json(motif_and_mutex arg_struct);
bool verbose_; // Should we print things ?
......@@ -79,8 +86,7 @@ class MOIP
vector<vector<size_t>> index_of_Cxip_; // Stores the indexes of the Cxip in insertion_dv_
vector<size_t> index_of_first_components; // Stores the indexes of Cx1p in insertion_dv_
vector<vector<size_t>> index_of_yuv_; // Stores the indexes of the y^u_v in basepair_dv_
vector<vector<size_t>> index_of_xij_; //Stores the indexes of the xij variables (BioKop) in stacks_dv_
vector<vector<size_t>> index_of_xij_; //Stores the indexes of the xij variables (BioKop) in stacks_dv_
};
// Returns the number of elements currently held in pareto_.
inline uint MOIP::get_n_solutions(void) const
{
    return static_cast<uint>(pareto_.size());
}
......
This diff is collapsed. Click to expand it.
......@@ -7,6 +7,7 @@
#include <vector>
#include <filesystem>
#include "rna.h"
#include "json.hpp"
using boost::filesystem::path;
using std::pair;
......@@ -14,6 +15,8 @@ using std::string;
using std::vector;
using std::mutex;
typedef enum { RNA3DMOTIF = 1, CSV = 2, CARNAVAL = 3, JSON = 4 } source_type;
typedef nlohmann::detail::iter_impl<nlohmann::basic_json<> > json_elem;
typedef struct Comp_ {
......@@ -39,18 +42,17 @@ typedef struct Link
class Motif
{
public:
public:
Motif(void);
Motif(string csv_line);
Motif(const vector<Component>& v, string PDB);
Motif(const vector<Component>& v, string id, size_t contacts, double tx_occurrences);
Motif(const vector<Component>& v, string name);
Motif(const vector<Component>& v, string name, string& struc);
Motif(const vector<Component>& v, path rinfile, uint id, bool reversed);
// Motif(string path, int id); //full path to biorseo/data/modules/RIN/Subfiles/
Motif(string path, int id); //full path to biorseo/data/modules/RIN/Subfiles/
static char is_valid_RIN(const string& rinfile);
static char is_valid_DESC(const string& descfile);
static vector<pair<uint,char>> is_valid_JSON(const string& jsonfile);
static char is_valid_RIN(const string& rinfile);
static char is_valid_DESC(const string& descfile);
static char is_valid_JSON(const json_elem& i);
string pos_string(void) const;
string sec_struct(void) const;
......@@ -64,39 +66,27 @@ class Motif
double tx_occurrences_;
double score_;
bool reversed_;
private:
string carnaval_id; // if source = CARNAVAL
string atlas_id; // if source = RNAMOTIFATLAS
string PDBID; // if source = RNA3DMOTIF
string contacts_id; // if source = CONTACTS
bool is_model_; // Whether the motif is a model or an extracted module from a 3D structure
enum { RNA3DMOTIF = 1, RNAMOTIFATLAS = 2, CARNAVAL = 3, CONTACTS = 4 } source_;
static uint delay;
// delay is the minimal shift between the end of a component and the beginning of the next.
// For regular loop motifs, it should be at least 5 (because hairpins cannot be of size smaller than 5).
// For the general case, it could be zero, but solutions will look dirty...
// Higher values reduce combinatorial explosion of potential insertion sites.
private:
string id_;
source_type source_;
};
bool is_desc_insertible(const string& descfile, const string& rna);
bool is_rin_insertible(const string& rinfile, const string& rna);
bool is_json_insertible(const string& jsonfile, const string& rna);
bool check_motif_ss(string);
bool check_motif_sequence(string);
vector<Motif> load_txt_folder(const string& path, const string& rna, bool verbose);
vector<Motif> load_desc_folder(const string& path, const string& rna, bool verbose);
vector<Motif> load_csv(const string& path);
vector<Motif> load_json_folder(const string& path, const string& rna, bool verbose);
vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string>& vc);
vector<vector<Component>> json_find_next_ones_in(string rna, uint offset, vector<string>& vc);
// utilities for Json motifs
size_t count_nucleotide(string&);
size_t count_delimiter(string&);
size_t count_contacts(string&);
string check_motif_sequence(string);
bool checkSecondaryStructure(string);
vector<Link> build_motif_pairs(string&, vector<Component>&);
uint find_max_occurrences(string&);
uint find_max_sequence(string&);
vector<string> find_components(string&, string);
vector<uint> find_contacts(vector<string>&, vector<Component>&);
vector<vector<Component>> find_next_ones_in(string rna, uint offset, vector<string> vc);
// utilities to compare secondary structures:
bool operator==(const Motif& m1, const Motif& m2);
......
......@@ -3,16 +3,12 @@
#include <algorithm>
#include <boost/format.hpp>
#define RESET "\033[0m"
#define RED "\033[31m" /* Red */
using std::abs;
using std::cout;
using std::endl;
// Default constructor: every member is default-initialized, nothing else is done.
SecondaryStructure::SecondaryStructure() = default;
SecondaryStructure::SecondaryStructure(const RNA& rna)
: objective_scores_(vector<double>(2)), n_(rna.get_RNA_length()), nBP_(0), rna_(rna)
{
......@@ -21,8 +17,6 @@ SecondaryStructure::SecondaryStructure(const RNA& rna)
// Builds a structure around a default-constructed RNA and records the `empty`
// flag in is_empty_structure.
SecondaryStructure::SecondaryStructure(bool empty)
: rna_(RNA())
{
    this->is_empty_structure = empty;
}
string SecondaryStructure::to_DBN(void) const
{
......@@ -100,26 +94,6 @@ string SecondaryStructure::to_DBN(void) const
return res;
}
// Builds a mask with one character per nucleotide of ss.rna_'s sequence:
// '*' when the nucleotide index appears in pos_contacts of at least one motif
// of ss.motif_info_, '.' otherwise.
string structure_with_contacts(const SecondaryStructure& ss) {
    const string seq = ss.rna_.get_seq();
    string       mask;

    for (uint pos = 0; pos < seq.size(); ++pos) {
        // Does any inserted motif list this index among its contacts?
        bool in_contact = false;
        for (const Motif& motif : ss.motif_info_)
            for (uint k = 0; k < motif.pos_contacts.size(); ++k)
                in_contact = in_contact || (motif.pos_contacts[k] == pos);

        mask += in_contact ? "*" : ".";
    }
    return mask;
}
string SecondaryStructure::to_string(void) const
{
string s;
......@@ -141,35 +115,11 @@ void SecondaryStructure::set_basepair(uint i, uint j)
void SecondaryStructure::insert_motif(const Motif& m) { motif_info_.push_back(m); }
// Writes `sequence` to cout character by character. A character whose index
// appears in pos_contacts of at least one motif of `motif_info_` is wrapped
// between the RED and RESET escape sequences; the others are written as is.
void colored_contacts(string sequence, vector<Motif> motif_info_) {
    for (uint pos = 0; pos < sequence.size(); ++pos) {
        // Does any motif list this index among its contacts?
        bool in_contact = false;
        for (const Motif& motif : motif_info_)
            for (uint k = 0; k < motif.pos_contacts.size(); ++k)
                in_contact = in_contact || (motif.pos_contacts[k] == pos);

        if (!in_contact) {
            cout << sequence[pos];
            continue;
        }
        cout << RED << sequence[pos] << RESET;
    }
}
void SecondaryStructure::print(void) const
{
cout << endl;
cout << '\t';
colored_contacts(rna_.get_seq(), motif_info_);
//rna_.get_seq()
cout << endl;
cout << endl << '\t' << rna_.get_seq() << endl;
string ss = to_string();
cout << '\t';
colored_contacts(ss, motif_info_);
//cout << ss;
cout << endl;
cout << '\t' << ss << endl;
for (const Motif& m : motif_info_) {
uint i = 0;
cout << '\t';
......@@ -324,7 +274,6 @@ bool operator<=(const SecondaryStructure& s1, const SecondaryStructure& s2)
return false;
}
bool operator==(const SecondaryStructure& s1, const SecondaryStructure& s2)
{
// Checks whether the secondary structures are exactly the same, including the inserted motifs.
......
......@@ -57,7 +57,4 @@ inline void SecondaryStructure::set_objective_score(int i, double s) { objecti
// Returns the number of motifs stored in motif_info_.
inline uint SecondaryStructure::get_n_motifs(void) const
{
    return static_cast<uint>(motif_info_.size());
}
// Returns the current value of the nBP_ counter.
inline uint SecondaryStructure::get_n_bp(void) const
{
    return nBP_;
}
string structure_with_contacts(const SecondaryStructure& ss);
#endif // SECONDARY_STRUCTURE_
\ No newline at end of file
......
......@@ -79,12 +79,10 @@ int main(int argc, char* argv[])
("jsonfolder,j", po::value<string>(&motifs_path_name), "A folder containing a custom motif library in .json format")
("pre-placed,x", po::value<string>(&motifs_path_name), "A CSV file providing motif insertion sites obtained with another tool.")
("function,f", po::value<char>(&obj_function_nbr)->default_value('B'),
"(A, B, C, D, E or F) Objective function to score module insertions:\n"
"(A, B, C, or D) Objective function to score module insertions:\n"
" (A) insert big modules\n (B) light, high-order modules\n"
" (C) well-scored modules\n (D) light, high-order, well-scored\n modules\n"
" (E, F) insert big modules with many\n contacts with proteins, different\n ponderations.\n"
" C and D require position scores\n provided by --pre-placed.\n"
" E and F require protein-contact\n information and should be\n used only with --jsonfolder.")
" C and D require position scores\n provided by --pre-placed.\n")
("mfe,E", "Minimize stacking energies\n (leads to MFE extimator)")
("mea,A", "(default) Maximize expected accuracy\n (leads to MEA estimator)")
("first-objective,c", po::value<unsigned int>(&MOIP::obj_to_solve_)->default_value(2),
......@@ -108,8 +106,8 @@ int main(int argc, char* argv[])
<< "Bio-objective integer linear programming framework to predict RNA secondary structures by including known RNA modules." << endl
<< "Developped by Louis Becquey, 2018-2021\nLénaïc Durand, 2019\nNathalie Bernard, 2021" << endl << endl
<< "Usage:\tYou must provide:\n\t1) a FASTA input file with -s," << endl
<< "\t2) a module type with --rna3dmotifs, --carnaval, --contacts or --pre-placed," << endl
<< "\t3) one module-based scoring function with --func A, B, C, D, E or F," << endl
<< "\t2) a module type with --rna3dmotifs, --carnaval, --json or --pre-placed," << endl
<< "\t3) one module-based scoring function with --func A, B, C, or D" << endl
<< "\t4) one energy-based scoring function with --mfe or --mea," << endl
<< "\t5) how to display results: in console (-v), or in a result file (-o)." << endl
<< endl
......@@ -154,19 +152,22 @@ int main(int argc, char* argv[])
return EXIT_FAILURE;
}
/* FIND PARETO SET */
string source;
Motif::delay = 1;
if (vm.count("rinfolder"))
source = "rinfolder";
else if (vm.count("descfolder"))
else if (vm.count("descfolder")) {
source = "descfolder";
Motif::delay = 5;
}
else if (vm.count("jsonfolder"))
source = "jsonfolder";
else if (vm.count("pre-placed"))
source = "csvfile";
else
cerr << "ERR: no source of modules provided !" << endl;
/* FIND PARETO SET */
MOIP myMOIP = MOIP(myRNA, source, motifs_path_name.c_str(), theta_p_threshold, verbose);
double min, max;
......@@ -243,11 +244,8 @@ int main(int argc, char* argv[])
outfile.open(outputName);
outfile << fa->name() << endl << fa->seq() << endl;
for (uint i = 0; i < myMOIP.get_n_solutions(); i++) {
outfile << myMOIP.solution(i).to_string() << endl << structure_with_contacts(myMOIP.solution(i)) << endl;
string str1 = myMOIP.solution(i).to_string();
}
for (uint i = 0; i < myMOIP.get_n_solutions(); i++)
outfile << myMOIP.solution(i).to_string() << endl;
outfile.close();
}
......
No preview for this file type
......@@ -50,6 +50,7 @@ RNA::RNA(string name, string seq, bool verbose)
int max_bp_span = 100;
float cutoff = 1e-6;
vrna_ep_t* results = vrna_pfl_fold(cseq, window_size, max_bp_span, cutoff);
vrna_ep_t* save = results; // keep the pointer to free it later
if (results != NULL)
{
......@@ -95,7 +96,9 @@ RNA::RNA(string name, string seq, bool verbose)
}
}
}
// Free memory allocated by ViennaRNA
free(save);
}
else cerr << "NULL result returned by vrna_pfl_fold" << endl;
......