translate_FUNCTION.cpp 9.58 KB
#include <fstream>
#include <iostream>
#include <string>
#include "genetic_codes.h" //using namespace std;
#include <getopt.h>
#include <stdlib.h> // system() and atoi()
#include <algorithm>
#include <vector>
#include "split.h"
#include "alternate_codes.h"

// system detection
// docker


/* Global variables */
vector<int> multiFASTA_seqlen;
map<string, string> genetic_code;
vector<string> sCodons;
int minimum_length = 2; // dipeptides


// XXX PROTOTYPES

vector<string> complement_sCodons(vector<string> vec_codons, bool rna){
	vector<string> cvec_codons;
	for (string sCodon : vec_codons){
		string codonCompl;
		for (char& c : sCodon){
			if (c == 'A'){
				(rna) ? codonCompl+='U' : codonCompl+='T';
			}
			if (c == 'C'){
				codonCompl+='G';
			}
			if (c == 'G'){
				codonCompl+='C';
			}
			if (c == 'T'){
				codonCompl+='A';
			}
		}
		cvec_codons.push_back(codonCompl);
	}
	return cvec_codons;
}


void extend_until_print(string &codon, string &protein, string &nucleic, int frame, char posneg, ofstream &outfile, ofstream &outfileDNA, bool backward, int position, int glob_vec_index){
	if (codon.length() == 3){
		if (genetic_code.count(codon)){
			if (genetic_code[codon] != "*"){
				nucleic+=codon;
				protein+=genetic_code[codon];
			}
			else{
				if (protein.length() > minimum_length){
					int adj_pos = 0;
					int longueur = 0;
					if (backward){
						position = multiFASTA_seqlen[glob_vec_index] - 1 - position;
					}

					position-=3; // because the current codon is a stop
					int begin = position - (protein.size() * 3) + 1;

					//reverse( nucleic.begin(), nucleic.end() ); // XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX

					if (!sCodons.empty()){
						if (find(sCodons.begin(), sCodons.end(), nucleic.substr(0, 3)) != sCodons.end()){ // if nucleic codon is in the vector of start codons
							outfile << frame << posneg << "\t" << begin << "-" << position << "\t" << protein << "\n";
							outfileDNA << frame << posneg << "\t" << begin << "-" << position << "\t" << nucleic << "\n";
						}
					}
					else{
						outfile << frame << posneg << "\t" << position << "\t" << protein << "\n";
						outfileDNA << frame << posneg << "\t" << position << "\t" << nucleic << "\n";
					}
				}
				protein = "";
				nucleic = "";
			}
		}
		else{
			protein+="X";
			nucleic+=codon;
		}
		codon = "";
	}
}


void translate_3_frames(string fastaFileName, string outputFileName, string outputFileNameDNA, bool backward){
	ifstream infile(fastaFileName);
	if (infile.good() == false){
		cerr << "Error: cannot open input file: " << fastaFileName << endl;
		exit(-1);
	}
	string str;

	string codon0 = "";
	string codon1 = "N";
	string codon2 = "NN";

	string protein0 = "";
	string protein1 = "";
	string protein2 = "";

	string nucleic0 = "";
	string nucleic1 = "";
	string nucleic2 = "";

	ofstream outfile;
	outfile.open (outputFileName);
	if (outfile.good() == false){
		cerr << "Error: cannot write output file: " << outputFileName << endl;
		exit(-1);
	}
	ofstream outfileDNA;
	outfileDNA.open (outputFileNameDNA);
	if (outfileDNA.good() == false){
		cerr << "Error: cannot write output file: " << outputFileNameDNA << endl;
		exit(-1);
	}

	int position = 0;
	int seqlen = 0;
	int glob_vec_index = 0;

	char posneg = '+';
	if (backward){
		posneg = '-';
	}

	while (getline(infile, str)) {
		if (str.find('>') != std::string::npos){
			if (backward){
				reverse(str.begin(), str.end());
				glob_vec_index++;
			}
			else{
				multiFASTA_seqlen.push_back(seqlen);
			}
			seqlen = 0;
			position = 0;
			outfile << str << "\n";
			outfileDNA << str << "\n";
		}
		else{
			for(char& c : str) { // c will never be a newline character
				c = toupper(c);
				position++;
				seqlen++;
				codon0+=c;
				codon1+=c;
				codon2+=c;

				extend_until_print(codon0, protein0, nucleic0, 0, posneg, outfile, outfileDNA, backward, position, glob_vec_index);
				extend_until_print(codon1, protein1, nucleic1, 1, posneg, outfile, outfileDNA, backward, position, glob_vec_index);
				extend_until_print(codon2, protein2, nucleic2, 2, posneg, outfile, outfileDNA, backward, position, glob_vec_index);
			}
		}
	}
	outfile.close();
	if (!backward){
		multiFASTA_seqlen.push_back(seqlen);
	}
}




int check_file_content(string fastaFileName, bool rna){
	ifstream infile(fastaFileName);
	if (infile.good() == false){
		cerr << "Error: cannot open input file: " << fastaFileName << endl;
		exit(-1);
	}
	string str;
	vector<char> v = {'A', 'G', 'C', 'R', 'Y', 'M', 'K', 'S', 'W', 'H', 'B', 'V', 'D', 'N'};
	(rna) ? v.push_back('U') : v.push_back('T');
	
	while (getline(infile, str)){
		if (str[0] != '>'){
			for(char& c : str){
				if (count(v.begin(), v.end(), c) == 0){
					return 1;
				}
			}
		}
	}
	return 0;
}


// input list of files XXX


int main(int argc, char** argv){
	string optlist =
		"    Usage:\n"
		"\n"
		"    -i              Input file\n"
		"    -o              Output filenames (default: input filename with .pro and .dna/.rna extensions)\n"
		"    -s              Start codon\n"
		"    -m              Minimum ORF length (default: 2)\n"
		"    -a              Alternative genetic code\n"
		"    -r              RNA input\n"
		"    -c              Check FASTA format\n"
		"    -f              Translate sense strand only\n" // f as in forward
		"    -h              Help\n";

	string outFileName;
	string inFileName;
	bool rna;
	bool checkFASTA;
	string alt_gene_code;
	string startCodon;
	bool senseStrand;


	int opt;
	while ((opt = getopt(argc,argv,"hrcfi:o:s:m:a:")) != EOF){
		switch(opt){
			case 'i': inFileName = optarg; break;
			case 'o': outFileName = optarg; break;
			case 's': startCodon = optarg; break;
			case 'm': minimum_length = atoi(optarg); break;
			case 'a': alt_gene_code = optarg; break;
			case 'r': rna = true; break;
			case 'c': checkFASTA = true; break;
			case 'f': senseStrand = true; break;
			case 'h': fprintf(stderr, "%s", optlist.c_str()); return 0;
		}
	}
	if (argc < 2){
		fprintf(stderr, "%s", optlist.c_str());
		return 0;
	}


	/* Check inputs */
	if (inFileName.empty()){
		cerr << "Error: missing input file (-i argument)" << endl;
		return(-1);
	}

	if (outFileName.empty()){
		vector<string> name_elements = split(inFileName, '/');
		outFileName = name_elements.back();
	}

	if (rna){
		puts("RNA sequence option activated");
	}

	if (startCodon.empty()){
		puts("Start codon: no specific amino acid selected");
	}
	else{
		sCodons = split(startCodon, '-');
		// XXX Check alternate
	}

	/* Check input file content */
	if (checkFASTA){
		puts("Checking file content...");
		if(check_file_content(inFileName, rna)){
			string nucleicType = (rna) ? "RNA" : "DNA";
			cerr << "Error: input file is not a valid " << nucleicType << " sequence" << endl;
			return(-1);
		}
		puts("Done");
	}


	/* Modifications to the standard genetic code */
	select_genetic_code(alt_gene_code);
	// XXX Check value


	/* Filenames: temporary AND final output */
	string tmp_reversed_input = "reversed_genome.tmp";
	string tmp_53_pro_file = outFileName+".53pro";
	string tmp_53_dna_file = outFileName+".53dna";
	string tmp_35_pro_file = outFileName+".35pro";
	string tmp_35_dna_file = outFileName+".35dna";
	string output_pro = outFileName+".pro";
	string output_dna = outFileName;
	(rna) ? output_dna+=".rna" : output_dna+=".dna";


	// XXX CHECK IF POSSIBLE TO CREATE..............





	/* Translation 5'-3' */
	puts("5'-3' translation...");
	(rna) ? genetic_code = gc_std : genetic_code = gc_std_DNA;
	translate_3_frames(inFileName, tmp_53_pro_file, tmp_53_dna_file, false);

	// XXX DIRTY CLEANING
	// awk '/^>/ {if (seq != "") {print head""seq;} seq=""; head=$0"\n";} /^[^>]/ {seq=seq""$0"\n";} END {if (seq != "") print head""seq;}' input.fasta > output.fasta
	string command = "awk '/^>/ {if (seq != \"\") {print head\"\"seq;} seq=\"\"; head=$0\"\\n\";} /^[^>]/ {seq=seq\"\"$0\"\\n\";} END {if (seq != \"\") print head\"\"seq;}' "+ tmp_53_pro_file + " > " + output_pro + " && rm *.53pro *.53dna";
	int status = std::system(command.c_str());
	if (status != 0) {
		std::cerr << "Error: system call failed with status " << status << std::endl;
		return 1;
	}
	puts("Done");


	if (not senseStrand){
		// TODO Warning for RNA sequence
		/* Reversion of the input sequence by system call: rev + tac */
		puts("Genome reversion...");
		if ( system( ("rev "+inFileName+" | tac > " + tmp_reversed_input).c_str() ) != 0 ){
			perror("Error while creating reversed genome file");
			return(-1);
		}
		else{
			puts ("Done");
		}
	
		/* Reversion of the (global) vector of lengths */
		reverse(multiFASTA_seqlen.begin(), multiFASTA_seqlen.end());
	
	
		/* Translation 3'-5' */
		puts("3'-5' translation...");
		(rna) ? genetic_code = gc_std_rev : genetic_code = gc_std_DNA_rev;
		sCodons = complement_sCodons(sCodons, rna);
		translate_3_frames(tmp_reversed_input, tmp_35_pro_file, tmp_35_dna_file, true);
		puts("Done");
	
	
		/* Reversion (tac) of the 3'-5' translation, then concatenation */
		puts("Creating result files...");
		if ( system( ("cat "+tmp_53_pro_file+" > "+output_pro+" && tac "+tmp_35_pro_file+" >> "+output_pro+
				" && cat "+tmp_53_dna_file+" > "+output_dna+" && tac "+tmp_35_dna_file+" >> "+output_dna).c_str() ) != 0 ){
			perror("Error while line-reversing and concatenating output files");
			return(-1);
		}
		else{
			puts("Done");
		}
	
		/* Delete the temporary files */
		puts("Cleaning temporary files...");
		if ( system( ("rm "+tmp_reversed_input+" "+tmp_35_pro_file+" "+tmp_35_dna_file+" "+tmp_53_pro_file+" "+tmp_53_dna_file).c_str() ) ){
			perror("Error while deleting temporary files");
			return(-1);
		}
		else{
			puts("Done");
		}
	}



	// XXX XXX DETECT SEQUENCE
	// => output sequence and surrounding



	// FASTA Format for ssearch

	puts("Program ran successfully!");
	return 0;
}