2 * classifyseqscommand.cpp
5 * Created by westcott on 11/2/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "classifyseqscommand.h"
11 #include "sequence.hpp"
13 #include "doTaxonomy.h"
16 //**********************************************************************************************************************
18 ClassifySeqsCommand::ClassifySeqsCommand(string option){
20 // globaldata = GlobalData::getInstance();
23 //allow user to run help
24 if(option == "help") { help(); abort = true; }
28 //valid paramters for this command
29 string AlignArray[] = {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff"};
30 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
32 OptionParser parser(option);
33 map<string, string> parameters = parser.getParameters();
35 ValidParameters validParameter;
37 //check to make sure all parameters are valid for command
38 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
39 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
42 //check for required parameters
43 templateFileName = validParameter.validFile(parameters, "template", true);
44 if (templateFileName == "not found") {
45 mothurOut("template is a required parameter for the classify.seqs command.");
49 else if (templateFileName == "not open") { abort = true; }
51 fastaFileName = validParameter.validFile(parameters, "fasta", true);
52 if (fastaFileName == "not found") {
53 mothurOut("fasta is a required parameter for the classify.seqs command.");
57 else if (fastaFileName == "not open") { abort = true; }
59 taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
60 if (taxonomyFileName == "not found") {
61 mothurOut("taxonomy is a required parameter for the classify.seqs command.");
65 else if (taxonomyFileName == "not open") { abort = true; }
68 //check for optional parameter and set defaults
69 // ...at some point should added some additional type checking...
71 temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ temp = "8"; }
72 convert(temp, kmerSize);
74 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
75 convert(temp, processors);
77 search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; }
79 method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "bayesian"; }
81 temp = validParameter.validFile(parameters, "match", false); if (temp == "not found"){ temp = "1.0"; }
84 temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found"){ temp = "-1.0"; }
85 convert(temp, misMatch);
87 temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-2.0"; }
88 convert(temp, gapOpen);
90 temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-1.0"; }
91 convert(temp, gapExtend);
93 temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found"){ temp = "10"; }
94 convert(temp, numWanted);
96 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0"; }
97 convert(temp, cutoff);
100 if ((method == "bayesian") && (search != "kmer")) {
101 mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); mothurOutEndLine();
107 catch(exception& e) {
108 errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
113 //**********************************************************************************************************************
115 ClassifySeqsCommand::~ClassifySeqsCommand(){
117 if (abort == false) {
118 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
122 //**********************************************************************************************************************
124 void ClassifySeqsCommand::help(){
126 mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n");
127 mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend and numwanted.\n");
128 mothurOut("The template, fasta and taxonomy parameters are required.\n");
129 mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n");
130 mothurOut("The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n");
131 mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n");
132 mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
133 mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
134 mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n");
135 mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
136 mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0.\n");
137 mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n");
138 mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n");
139 mothurOut("The classify.seqs command should be in the following format: \n");
140 mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
141 mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n");
142 mothurOut("The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n");
143 mothurOut("The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n");
144 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
146 catch(exception& e) {
147 errorOut(e, "ClassifySeqsCommand", "help");
153 //**********************************************************************************************************************
155 int ClassifySeqsCommand::execute(){
157 if (abort == true) { return 0; }
161 if(method == "bayesian") { classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff); }
162 else if(method == "knn") { classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted); }
164 mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
166 classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff);
169 int numFastaSeqs = 0;
171 string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy";
172 string tempTaxonomyFile = getRootName(fastaFileName) + "taxonomy.temp";
173 string taxSummary = getRootName(fastaFileName) + "tax.summary";
175 int start = time(NULL);
176 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
179 openInputFile(fastaFileName, inFASTA);
180 numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
183 lines.push_back(new linePair(0, numFastaSeqs));
185 driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
189 vector<int> positions;
190 processIDS.resize(0);
193 openInputFile(fastaFileName, inFASTA);
196 while(!inFASTA.eof()){
197 input = getline(inFASTA);
198 if (input.length() != 0) {
199 if(input[0] == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
204 numFastaSeqs = positions.size();
206 int numSeqsPerProcessor = numFastaSeqs / processors;
208 for (int i = 0; i < processors; i++) {
209 int startPos = positions[ i * numSeqsPerProcessor ];
210 if(i == processors - 1){
211 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
213 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
215 createProcesses(newTaxonomyFile, tempTaxonomyFile);
217 rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str());
218 rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str());
220 for(int i=1;i<processors;i++){
221 appendTaxFiles((newTaxonomyFile + toString(processIDS[i]) + ".temp"), newTaxonomyFile);
222 appendTaxFiles((tempTaxonomyFile + toString(processIDS[i]) + ".temp"), tempTaxonomyFile);
223 remove((newTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
224 remove((tempTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
230 openInputFile(candidateFileName, inFASTA);
231 numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
234 lines.push_back(new linePair(0, numFastaSeqs));
236 driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
241 //make taxonomy tree from new taxonomy file
243 openInputFile(tempTaxonomyFile, inTaxonomy);
245 string accession, taxaList;
246 PhyloTree taxaBrowser;
248 //read in users taxonomy file and add sequences to tree
249 while(!inTaxonomy.eof()){
250 inTaxonomy >> accession >> taxaList;
252 taxaBrowser.addSeqToTree(accession, taxaList);
257 remove(tempTaxonomyFile.c_str());
259 taxaBrowser.assignHeirarchyIDs(0);
262 openOutputFile(taxSummary, outTaxTree);
264 taxaBrowser.print(outTaxTree);
267 mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences.");
273 catch(exception& e) {
274 errorOut(e, "ClassifySeqsCommand", "execute");
278 /**************************************************************************************************/
280 void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile) {
282 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
284 // processIDS.resize(0);
286 //loop through and create all the processes you want
287 while (process != processors) {
291 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
294 driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp");
296 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
299 //force parent to wait until all the processes are done
300 for (int i=0;i<processors;i++) {
301 int temp = processIDS[i];
306 catch(exception& e) {
307 errorOut(e, "ClassifySeqsCommand", "createProcesses");
311 /**************************************************************************************************/
313 void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
318 openOutputFileAppend(filename, output);
319 openInputFile(temp, input);
321 while(char c = input.get()){
322 if(input.eof()) { break; }
323 else { output << c; }
329 catch(exception& e) {
330 errorOut(e, "ClassifySeqsCommand", "appendTaxFiles");
335 //**********************************************************************************************************************
337 int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName){
340 openOutputFile(taxFName, outTax);
342 ofstream outTaxSimple;
343 openOutputFile(tempTFName, outTaxSimple);
346 openInputFile(fastaFileName, inFASTA);
348 inFASTA.seekg(line->start);
352 for(int i=0;i<line->numSeqs;i++){
354 Sequence* candidateSeq = new Sequence(inFASTA);
356 taxonomy = classify->getTaxonomy(candidateSeq);
358 if (taxonomy != "bad seq") {
359 //if (method != "bayesian") {
360 outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
361 outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
363 //vector<string> pTax = classify->parseTax(taxonomy);
364 //map<string, int> confidence = classify->getConfidenceScores();
366 //outTax << candidateSeq->getName() << '\t';
367 //for (int j = 0; j < pTax.size(); j++) {
368 //if (confidence[pTax[j]] > cutoff) {
369 // outTax << pTax[j] << "(" << confidence[pTax[j]] << ");";
379 mothurOut("Classifying sequence " + toString(i)); mothurOutEndLine();
385 outTaxSimple.close();
389 catch(exception& e) {
390 errorOut(e, "ClassifySeqsCommand", "driver");
395 /**************************************************************************************************/