extract_probing_data.cpp 11.4 KB

#include <fstream>
#include <iostream>
#include <regex>
#include <sys/stat.h>
#include <boost/algorithm/string.hpp>
#include <algorithm>
#include <numeric>
#include <math.h>
#include <float.h>

#include "extract_probing_data.h"

// SHAPE, DMS or PARS data
void extract_probing(const std::string PROBINGfile, std::vector < Rna >& rnaList_)
{
    std::string name = "", indexS = "", dataType = "", react1S = "", react2S = "", line = "";
    char base;
    std::vector< std::string > elements;
    size_t size, j, k, size2;
    int nLine = 0;
    unsigned int index;
    float react1, react2;

    int i; // RNA index

    // Initialize probing data
    std::vector < std::vector < float > > probingData, probingData2;
    for (j = 0, size = rnaList_.size(); j != size; j++) {
        probingData.push_back(std::vector < float > (rnaList_[j].get_n_(), -1.0));
        probingData2.push_back(std::vector < float > (rnaList_[j].get_n_(), -1.0));
    }

    struct stat buf;
    if( (stat(PROBINGfile.c_str(), &buf) == 0)) {
        nLine = 1;
        std::ifstream ifs(PROBINGfile);
        while (std::getline(ifs, line)) {
            if (line[0] != '\n' and line != "") {
                // check the format
                elements.clear();
                boost::split( elements, line, boost::is_any_of(" \t"), boost::token_compress_on );

                /*std::cout << "line " << nLine << ": ";
                for(j = 0, size = elements.size(); j != size; j++)
                    std::cout << elements[j] << " ";
                std::cout << std::endl;*/

                //tab = std::count(line.begin(), line.end(), '\t');
                if(uint(elements.size()) >= 3)
                // Name \space Index \space Base (begin at 1) \space DataType \space Reactivity1 (Shape/S1/DMS) \space Reactivity2 (empty/V1/control)
                {
                    // Check RNA index
                    name = elements[0];
                    i = -1;
                    for (j = 0, size = rnaList_.size(); j != size; j++)
                        if (name.compare(rnaList_[j].get_name_()) == 0)
                            i = j;
                    if (i != -1) { // RNA index is ok
                        indexS = elements[1];
                        base = toupper(elements[2][0]);
                        if(base == 'T')
                            base = 'U';
                        dataType = elements[3];
                        if(dataType.compare("SHAPE") == 0 or dataType.compare("PARS") == 0 or dataType.compare("DMS") == 0){
                            if (dataType.compare("SHAPE") == 0 and uint(elements.size()) >= 5) {
                                react1S = elements[4];
                                try {
                                    index = std::stoul(indexS);
                                    react1 = std::stof(react1S);

                                    //Check index
                                    if (index > rnaList_[i].get_n_()) { // 1-based index
                                        throw (std::string("Wrong index in line " + std::to_string(nLine) + " of probing file."));
                                    } else {
                                        //Check base
                                        if (base != rnaList_[i].get_seq_()[index-1]) {
                                            throw (std::string("Wrong base in line " + std::to_string(nLine) + " of probing file."));
                                        } else {
                                            // Save data
                                            probingData[i][index-1] = react1;
                                        }
                                    }
                                } catch (const std::invalid_argument) {
                                    std::cout << "Warning: wrong probing data in line " << nLine << " of probing file." << std::endl;
                                }
                            } else if ((dataType.compare("PARS") == 0 or dataType.compare("DMS") == 0 ) and uint(elements.size()) >= 6){
                                react1S = elements[4];
                                react2S = elements[5];
                                try {
                                    index = std::stoul(indexS);
                                    react1 = std::stof(react1S);
                                    react2 = std::stof(react2S);

                                    //Check index
                                    if (index > rnaList_[i].get_n_()) { // 1-based index
                                        throw (std::string("Wrong index in line " + std::to_string(nLine) + " of probing file."));
                                    } else {
                                        //Check base
                                        if (base != rnaList_[i].get_seq_()[index-1]) {
                                            throw (std::string("Wrong base in line " + std::to_string(nLine) + " of probing file."));
                                        } else {
                                            // Save data
                                            probingData[i][index-1] = react1;
                                            probingData2[i][index-1] = react2;
                                        }
                                    }
                                } catch (const std::invalid_argument) {
                                    std::cout << "Warning: wrong probing data in line " << nLine << " of probing file." << std::endl;
                                }
                            } else {
                                throw (std::string("The format of the probing file is not correct. Please look at the example inputs."));
                            }
                        } else {
                            throw (std::string(dataType + " is not a correct data type for the probing file, choose between : PARS, DMS or SHAPE."));
                        }
                    } else {
                        throw (std::string("The RNA " + name + " does not exist in the fasta file. Found in probing file line " +
                                std::to_string(nLine) + ". Please homogenize the names."));
                    }
                } else {
                    throw (std::string("The format of the probing file is not correct. Please look at the example inputs."));
                }
            }
            nLine++;
        } // End of reading file

        float min, max, sum1, sum2;
        int l;
        // Normalize data
        for (j = 0, size = probingData.size(); j != size; j++) {
#ifdef _DEBUG
            std::cout << "Probing ARN " << j << std::endl;
#endif
            if (std::accumulate(probingData2[j].begin(), probingData2[j].end(), 0) == -1*int(probingData2[j].size())
                    and std::accumulate(probingData[j].begin(), probingData[j].end(), 0) != -1*int(probingData[j].size())) { // SHAPE data, linear normalization
#ifdef _DEBUG
                std::cout << "shape" << std::endl;
#endif
                min = FLT_MAX;
                max = FLT_MIN;
                for (k = 0, size2 = probingData[j].size(); k != size2; k++) {

#ifdef _DEBUG
                    std::cout << probingData[j][k] << " ";
#endif
                    if (std::abs(probingData[j][k]+1.0) > 0.00001) {
                        if (probingData[j][k] < min)
                            min = probingData[j][k];
                        if (probingData[j][k] > max)
                            max = probingData[j][k];
                    }
                }
#ifdef _DEBUG
                std::cout << std::endl;
#endif
                for (k = 0, size2 = probingData[j].size(); k != size2; k++) {
                    if(std::abs(probingData[j][k]+1.0) > 0.00001)
                        probingData[j][k] = (probingData[j][k] - min) /  (max - min);
                }
            } else if (std::accumulate(probingData[j].begin(), probingData[j].end(), 0) != -1*int(probingData[j].size()))
            { // PARS or DMS data, normalization in function of the abundance and the length of the transcript
#ifdef _DEBUG
                std::cout  << "pars" << std::endl;
#endif
                l = int(rnaList_[j].get_n_());
                sum1 = 0;
                for (k = 0, size2 = probingData[j].size(); k != size2; k++) {
                    if(std::abs(probingData[j][k]+1.0) > 0.00001) {
                        probingData[j][k] = log(probingData[j][k] + 1);
                        sum1 += probingData[j][k];
                    }
                }
                for (k = 0, size2 = probingData[j].size(); k != size2; k++) {
                    if(std::abs(probingData[j][k]+1.0) > 0.00001)
                        probingData[j][k] = probingData[j][k] / (sum1/l);
                }
                sum2 = 0;
                for (k = 0, size2 = probingData2[j].size(); k != size2; k++) {
                    if(std::abs(probingData2[j][k]+1.0) > 0.00001) {
                        probingData2[j][k] = log(probingData2[j][k] + 1);
                        sum2 += probingData2[j][k];
                    }
                }
                for (k = 0, size2 = probingData2[j].size(); k != size2; k++) {
                    if(std::abs(probingData2[j][k]+1.0) > 0.00001)
                        probingData2[j][k] = probingData2[j][k] / (sum2/l);
                }
                for (k = 0, size2 = probingData2[j].size(); k != size2; k++) {
                    if(std::abs(probingData[j][k]+1.0) > 0.00001 and std::abs(probingData2[j][k]+1.0) > 0.00001) {
                        probingData[j][k] = probingData[j][k] - probingData2[j][k];

                    } else if (std::abs(probingData[j][k]+1.0) > 0.00001) {
                        probingData[j][k] = -1;
                    }
#ifdef _DEBUG
                    std::cout << probingData[j][k] << " ";
#endif
                }
#ifdef _DEBUG
                std::cout << std::endl;
#endif

                // Then linear normalization to range the data between 0 and 1
                if (std::accumulate(probingData[j].begin(), probingData[j].end(), 0) != -1*int(probingData[j].size())) {
#ifdef _DEBUG
                    std::cout << "Normalisation linéaire" << std::endl;
#endif
                    min = FLT_MAX;
                    max = FLT_MIN;
                    for (k = 0, size2 = probingData[j].size(); k != size2; k++) {
                        if (std::abs(probingData[j][k]+1.0) > 0.00001) {
                            if (probingData[j][k] < min)
                                min = probingData[j][k];
                            if (probingData[j][k] > max)
                                max = probingData[j][k];
                        }
                    }
                    for (k = 0, size2 = probingData[j].size(); k != size2; k++) {
                        if(std::abs(probingData[j][k]+1.0) > 0.00001)
                            probingData[j][k] = (probingData[j][k] - min) /  (max - min);
                        std::cout << k << " " << probingData[j][k] << "\n";
#ifdef _DEBUG
                        std::cout << probingData[j][k] << " ";
#endif
                    }
                    std::cout << std::endl;
#ifdef _DEBUG
                    std::cout << std::endl;
#endif
                }
            }
        }
        // Add probing data to the RNA
        for(j = 0, size = rnaList_.size(); j != size; j++)
            rnaList_[j].set_probingData_(probingData[j]);
    }
    else
    {
        throw(std::string("The file " + PROBINGfile + " doesn't exist."));
    }
}