]> git.donarmstrong.com Git - mothur.git/blob - classifyseqscommand.cpp
added citation function to commands
[mothur.git] / classifyseqscommand.cpp
1 /*
2  *  classifyseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/2/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyseqscommand.h"
11 #include "sequence.hpp"
12 #include "bayesian.h"
13 #include "phylotree.h"
14 #include "phylosummary.h"
15 #include "knn.h"
16
17
18 //**********************************************************************************************************************
19 vector<string> ClassifySeqsCommand::setParameters(){    
20         try {
21                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy);
22                 CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate);
23                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
24                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
25                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
26                 CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance", "kmer", "", "", "",false,false); parameters.push_back(psearch);
27                 CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize);
28                 CommandParameter pmethod("method", "Multiple", "bayesian-knn", "bayesian", "", "", "",false,false); parameters.push_back(pmethod);
29                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
30                 CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch);
31                 CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch);
32                 CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen);
33                 CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend);
34                 CommandParameter pcutoff("cutoff", "Number", "", "0", "", "", "",false,true); parameters.push_back(pcutoff);
35                 CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs);
36                 CommandParameter piters("iters", "Number", "", "100", "", "", "",false,true); parameters.push_back(piters);
37                 CommandParameter pnumwanted("numwanted", "Number", "", "10", "", "", "",false,true); parameters.push_back(pnumwanted);
38                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
39                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
40                 
41                 vector<string> myArray;
42                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "ClassifySeqsCommand", "setParameters");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 string ClassifySeqsCommand::getHelpString(){    
52         try {
53                 string helpString = "";
54                 helpString += "The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n";
55                 helpString += "The classify.seqs command parameters are reference, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n";
56                 helpString += "The reference, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n";
57                 helpString += "The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer, blast and distance. The default is kmer.\n";
58                 helpString += "The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n";
59                 helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n";
60                 helpString += "The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n";
61                 helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n";
62                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
63 #ifdef USE_MPI
64                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
65 #endif
66                 helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n";
67                 helpString += "The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n";
68                 helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
69                 helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n";
70                 helpString += "The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n";
71                 helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n";
72                 helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n";
73                 helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method.  The default is 100.\n";
74                 helpString += "The classify.seqs command should be in the following format: \n";
75                 helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n";
76                 helpString += "Example classify.seqs(fasta=amazon.fasta, reference=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n";
77                 helpString += "The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n";
78                 helpString += "The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n";
79                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
80                 return helpString;
81         }
82         catch(exception& e) {
83                 m->errorOut(e, "ClassifySeqsCommand", "getHelpString");
84                 exit(1);
85         }
86 }
87 //**********************************************************************************************************************
88 ClassifySeqsCommand::ClassifySeqsCommand(){     
89         try {
90                 abort = true; calledHelp = true; 
91                 setParameters();
92                 vector<string> tempOutNames;
93                 outputTypes["taxonomy"] = tempOutNames;
94                 outputTypes["taxsummary"] = tempOutNames;
95                 outputTypes["matchdist"] = tempOutNames;
96         }
97         catch(exception& e) {
98                 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
99                 exit(1);
100         }
101 }
102 //**********************************************************************************************************************
103 ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
104         try {
105                 abort = false; calledHelp = false;   
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; calledHelp = true; }
109                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
110                 
111                 else {
112                         vector<string> myArray = setParameters();
113                         
114                         OptionParser parser(option);
115                         map<string, string> parameters = parser.getParameters(); 
116                         
117                         ValidParameters validParameter("classify.seqs");
118                         map<string, string>::iterator it;
119                         
120                         //check to make sure all parameters are valid for command
121                         for (it = parameters.begin(); it != parameters.end(); it++) { 
122                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
123                         }
124                         
125                         //initialize outputTypes
126                         vector<string> tempOutNames;
127                         outputTypes["taxonomy"] = tempOutNames;
128                         outputTypes["taxsummary"] = tempOutNames;
129                         outputTypes["matchdist"] = tempOutNames;
130                         
131                         //if the user changes the output directory command factory will send this info to us in the output parameter 
132                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
133                         
134                         //if the user changes the input directory command factory will send this info to us in the output parameter 
135                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
136                         if (inputDir == "not found"){   inputDir = "";          }
137                         else {
138                                 string path;
139                                 it = parameters.find("reference");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
145                                 }
146                                 
147                                 it = parameters.find("taxonomy");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
153                                 }
154                                 
155                                 it = parameters.find("group");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
161                                 }
162                         }
163
164                         //check for required parameters
165                         templateFileName = validParameter.validFile(parameters, "reference", true);
166                         if (templateFileName == "not found") { 
167                                 m->mothurOut("reference is a required parameter for the classify.seqs command."); 
168                                 m->mothurOutEndLine();
169                                 abort = true; 
170                         }
171                         else if (templateFileName == "not open") { abort = true; }      
172                         
173                                                 
174                         fastaFileName = validParameter.validFile(parameters, "fasta", false);
175                         if (fastaFileName == "not found") {                             
176                                 //if there is a current fasta file, use it
177                                 string filename = m->getFastaFile(); 
178                                 if (filename != "") { fastaFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
179                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
180                         }
181                         else { 
182                                 m->splitAtDash(fastaFileName, fastaFileNames);
183                                 
184                                 //go through files and make sure they are good, if not, then disregard them
185                                 for (int i = 0; i < fastaFileNames.size(); i++) {
186                                         
187                                         bool ignore = false;
188                                         if (fastaFileNames[i] == "current") { 
189                                                 fastaFileNames[i] = m->getFastaFile(); 
190                                                 if (fastaFileNames[i] != "") {  m->mothurOut("Using " + fastaFileNames[i] + " as input file for the fasta parameter where you had given current."); m->mothurOutEndLine(); }
191                                                 else {  
192                                                         m->mothurOut("You have no current fastafile, ignoring current."); m->mothurOutEndLine(); ignore=true; 
193                                                         //erase from file list
194                                                         fastaFileNames.erase(fastaFileNames.begin()+i);
195                                                         i--;
196                                                 }
197                                         }
198                                         
199                                         if (!ignore) {
200                                                 
201                                                 if (inputDir != "") {
202                                                         string path = m->hasPath(fastaFileNames[i]);
203                                                         //if the user has not given a path then, add inputdir. else leave path alone.
204                                                         if (path == "") {       fastaFileNames[i] = inputDir + fastaFileNames[i];               }
205                                                 }
206                                                 
207                                                 int ableToOpen;
208                                                 
209                                                 ifstream in;
210                                                 ableToOpen = m->openInputFile(fastaFileNames[i], in, "noerror");
211                                         
212                                                 //if you can't open it, try default location
213                                                 if (ableToOpen == 1) {
214                                                         if (m->getDefaultPath() != "") { //default path is set
215                                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]);
216                                                                 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
217                                                                 ifstream in2;
218                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
219                                                                 in2.close();
220                                                                 fastaFileNames[i] = tryPath;
221                                                         }
222                                                 }
223                                                 
224                                                 if (ableToOpen == 1) {
225                                                         if (m->getOutputDir() != "") { //default path is set
226                                                                 string tryPath = m->getOutputDir() + m->getSimpleName(fastaFileNames[i]);
227                                                                 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
228                                                                 ifstream in2;
229                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
230                                                                 in2.close();
231                                                                 fastaFileNames[i] = tryPath;
232                                                         }
233                                                 }
234                                                 
235                                                 in.close();
236                                                 
237                                                 if (ableToOpen == 1) { 
238                                                         m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); 
239                                                         //erase from file list
240                                                         fastaFileNames.erase(fastaFileNames.begin()+i);
241                                                         i--;
242                                                 }
243                                         }
244                                         
245                                 }
246                                 
247                                 //make sure there is at least one valid file left
248                                 if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
249                         }
250
251                         
252                         taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
253                         if (taxonomyFileName == "not found") { 
254                                 m->mothurOut("taxonomy is a required parameter for the classify.seqs command."); 
255                                 m->mothurOutEndLine();
256                                 abort = true; 
257                         }
258                         else if (taxonomyFileName == "not open") { abort = true; }      
259                         
260                         
261                         namefile = validParameter.validFile(parameters, "name", false);
262                         if (namefile == "not found") { namefile = "";  }
263
264                         else { 
265                                 m->splitAtDash(namefile, namefileNames);
266                                 
267                                 //go through files and make sure they are good, if not, then disregard them
268                                 for (int i = 0; i < namefileNames.size(); i++) {
269                                         bool ignore = false;
270                                         if (namefileNames[i] == "current") { 
271                                                 namefileNames[i] = m->getNameFile(); 
272                                                 if (namefileNames[i] != "") {  m->mothurOut("Using " + namefileNames[i] + " as input file for the name parameter where you had given current."); m->mothurOutEndLine(); }
273                                                 else {  
274                                                         m->mothurOut("You have no current namefile, ignoring current."); m->mothurOutEndLine(); ignore=true; 
275                                                         //erase from file list
276                                                         namefileNames.erase(namefileNames.begin()+i);
277                                                         i--;
278                                                 }
279                                         }
280                                         
281                                         if (!ignore) {
282                                                 
283                                                 if (inputDir != "") {
284                                                         string path = m->hasPath(namefileNames[i]);
285                                                         //if the user has not given a path then, add inputdir. else leave path alone.
286                                                         if (path == "") {       namefileNames[i] = inputDir + namefileNames[i];         }
287                                                 }
288                                                 int ableToOpen;
289                                                 
290                                                 ifstream in;
291                                                 ableToOpen = m->openInputFile(namefileNames[i], in, "noerror");
292                                         
293                                                 //if you can't open it, try default location
294                                                 if (ableToOpen == 1) {
295                                                         if (m->getDefaultPath() != "") { //default path is set
296                                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(namefileNames[i]);
297                                                                 m->mothurOut("Unable to open " + namefileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
298                                                                 ifstream in2;
299                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
300                                                                 in2.close();
301                                                                 namefileNames[i] = tryPath;
302                                                         }
303                                                 }
304                                                 
305                                                 if (ableToOpen == 1) {
306                                                         if (m->getOutputDir() != "") { //default path is set
307                                                                 string tryPath = m->getOutputDir() + m->getSimpleName(namefileNames[i]);
308                                                                 m->mothurOut("Unable to open " + namefileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
309                                                                 ifstream in2;
310                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
311                                                                 in2.close();
312                                                                 namefileNames[i] = tryPath;
313                                                         }
314                                                 }
315                                                 in.close();
316                                                 
317                                                 if (ableToOpen == 1) { 
318                                                         m->mothurOut("Unable to open " + namefileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();  abort = true;
319                                                         //erase from file list
320                                                         namefileNames.erase(namefileNames.begin()+i);
321                                                         i--;
322                                                 }
323                                         }
324                                 }
325                         }
326
327                         if (namefile != "") {
328                                 if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); }
329                         }
330                         
331                         groupfile = validParameter.validFile(parameters, "group", false);
332                         if (groupfile == "not found") { groupfile = "";  }
333                         else { 
334                                 m->splitAtDash(groupfile, groupfileNames);
335                                 
336                                 //go through files and make sure they are good, if not, then disregard them
337                                 for (int i = 0; i < groupfileNames.size(); i++) {
338                                         if (inputDir != "") {
339                                                 string path = m->hasPath(groupfileNames[i]);
340                                                 //if the user has not given a path then, add inputdir. else leave path alone.
341                                                 if (path == "") {       groupfileNames[i] = inputDir + groupfileNames[i];               }
342                                         }
343                                         int ableToOpen;
344                                         
345                                         ifstream in;
346                                         ableToOpen = m->openInputFile(groupfileNames[i], in, "noerror");
347                                 
348                                         //if you can't open it, try default location
349                                         if (ableToOpen == 1) {
350                                                 if (m->getDefaultPath() != "") { //default path is set
351                                                         string tryPath = m->getDefaultPath() + m->getSimpleName(groupfileNames[i]);
352                                                         m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
353                                                         ifstream in2;
354                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
355                                                         in2.close();
356                                                         groupfileNames[i] = tryPath;
357                                                 }
358                                         }
359                                         
360                                         if (ableToOpen == 1) {
361                                                 if (m->getOutputDir() != "") { //default path is set
362                                                         string tryPath = m->getOutputDir() + m->getSimpleName(groupfileNames[i]);
363                                                         m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
364                                                         ifstream in2;
365                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
366                                                         in2.close();
367                                                         groupfileNames[i] = tryPath;
368                                                 }
369                                         }
370                                         
371                                         in.close();
372                                         
373                                         if (ableToOpen == 1) { 
374                                                 m->mothurOut("Unable to open " + groupfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); groupfileNames[i] = "";
375                                                 //erase from file list
376                                                 groupfileNames.erase(groupfileNames.begin()+i);
377                                                 i--;
378                                         }
379                                 }
380                         }
381
382                         if (groupfile != "") {
383                                 if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); }
384                         }else {
385                                 for (int i = 0; i < fastaFileNames.size(); i++) {  groupfileNames.push_back("");  }
386                         }
387                         
388                         //check for optional parameter and set defaults
389                         // ...at some point should added some additional type checking...
390                         string temp;
391                         temp = validParameter.validFile(parameters, "ksize", false);            if (temp == "not found"){       temp = "8";                             }
392                         convert(temp, kmerSize); 
393                         
394                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
395                         m->setProcessors(temp);
396                         convert(temp, processors); 
397                         
398                         search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
399                         
400                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "bayesian";    }
401                         
402                         temp = validParameter.validFile(parameters, "match", false);            if (temp == "not found"){       temp = "1.0";                   }
403                         convert(temp, match);  
404                         
405                         temp = validParameter.validFile(parameters, "mismatch", false);         if (temp == "not found"){       temp = "-1.0";                  }
406                         convert(temp, misMatch);  
407                         
408                         temp = validParameter.validFile(parameters, "gapopen", false);          if (temp == "not found"){       temp = "-2.0";                  }
409                         convert(temp, gapOpen);  
410                         
411                         temp = validParameter.validFile(parameters, "gapextend", false);        if (temp == "not found"){       temp = "-1.0";                  }
412                         convert(temp, gapExtend); 
413                         
414                         temp = validParameter.validFile(parameters, "numwanted", false);        if (temp == "not found"){       temp = "10";                    }
415                         convert(temp, numWanted);
416                         
417                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found"){       temp = "0";                             }
418                         convert(temp, cutoff);
419                         
420                         temp = validParameter.validFile(parameters, "probs", false);            if (temp == "not found"){       temp = "true";                  }
421                         probs = m->isTrue(temp);
422                         
423                         temp = validParameter.validFile(parameters, "iters", false);            if (temp == "not found") { temp = "100";                        }
424                         convert(temp, iters); 
425
426
427                         
428                         if ((method == "bayesian") && (search != "kmer"))  { 
429                                 m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine();
430                                 search = "kmer";
431                         }
432                 }
433                 
434         }
435         catch(exception& e) {
436                 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
437                 exit(1);
438         }
439 }
440
441 //**********************************************************************************************************************
442 ClassifySeqsCommand::~ClassifySeqsCommand(){    
443         if (abort == false) {
444                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
445         }
446 }
447 //**********************************************************************************************************************
448
449 int ClassifySeqsCommand::execute(){
450         try {
451                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
452                 
453                 if(method == "bayesian"){       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);           }
454                 else if(method == "knn"){       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted);                               }
455                 else {
456                         m->mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
457                         m->mothurOutEndLine();
458                         classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);   
459                 }
460                 
461                 if (m->control_pressed) { delete classify; return 0; }
462                 
463                                 
464                 for (int s = 0; s < fastaFileNames.size(); s++) {
465                 
466                         m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
467                         
468                         string RippedTaxName = m->getRootName(m->getSimpleName(taxonomyFileName));
469                         RippedTaxName = m->getExtension(RippedTaxName.substr(0, RippedTaxName.length()-1));
470                         if (RippedTaxName[0] == '.') { RippedTaxName = RippedTaxName.substr(1, RippedTaxName.length()); }
471                         RippedTaxName +=  "."; 
472                 
473                         if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); }
474                         string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "taxonomy";
475                         string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp";
476                         string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "tax.summary";
477                         
478                         if ((method == "knn") && (search == "distance")) { 
479                                 string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "match.dist";
480                                 classify->setDistName(DistName);  outputNames.push_back(DistName); outputTypes["matchdist"].push_back(DistName);
481                         }
482                         
483                         outputNames.push_back(newTaxonomyFile); outputTypes["taxonomy"].push_back(newTaxonomyFile);
484                         outputNames.push_back(taxSummary);      outputTypes["taxsummary"].push_back(taxSummary);
485                         
486                         int start = time(NULL);
487                         int numFastaSeqs = 0;
488                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
489                         
490 #ifdef USE_MPI  
491                                 int pid, numSeqsPerProcessor; 
492                                 int tag = 2001;
493                                 vector<unsigned long int> MPIPos;
494                                 
495                                 MPI_Status status; 
496                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
497                                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
498
499                                 MPI_File inMPI;
500                                 MPI_File outMPINewTax;
501                                 MPI_File outMPITempTax;
502                                                         
503                                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
504                                 int inMode=MPI_MODE_RDONLY; 
505                                 
506                                 char outNewTax[1024];
507                                 strcpy(outNewTax, newTaxonomyFile.c_str());
508                                 
509                                 char outTempTax[1024];
510                                 strcpy(outTempTax, tempTaxonomyFile.c_str());
511                                 
512                                 char inFileName[1024];
513                                 strcpy(inFileName, fastaFileNames[s].c_str());
514
515                                 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
516                                 MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax);
517                                 MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax);
518                                 
519                                 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  delete classify; return 0;  }
520                                 
521                                 if (pid == 0) { //you are the root process 
522                                         
523                                         MPIPos = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs); //fills MPIPos, returns numSeqs
524                                         
525                                         //send file positions to all processes
526                                         for(int i = 1; i < processors; i++) { 
527                                                 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
528                                                 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
529                                         }
530                                         
531                                         //figure out how many sequences you have to align
532                                         numSeqsPerProcessor = numFastaSeqs / processors;
533                                         int startIndex =  pid * numSeqsPerProcessor;
534                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
535                                         
536                                 
537                                         //align your part
538                                         driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos);
539                                         
540                                         if (m->control_pressed) {  outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } delete classify; return 0;  }
541                                         
542                                         for (int i = 1; i < processors; i++) {
543                                                 int done;
544                                                 MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
545                                         }
546                                 }else{ //you are a child process
547                                         MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
548                                         MPIPos.resize(numFastaSeqs+1);
549                                         MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
550                                         
551                                         //figure out how many sequences you have to align
552                                         numSeqsPerProcessor = numFastaSeqs / processors;
553                                         int startIndex =  pid * numSeqsPerProcessor;
554                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
555                                         
556                                         
557                                         //align your part
558                                         driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos);
559                                         
560                                         if (m->control_pressed) {  outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  delete classify; return 0;  }
561
562                                         int done = 0;
563                                         MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); 
564                                 }
565                                 
566                                 //close files 
567                                 MPI_File_close(&inMPI);
568                                 MPI_File_close(&outMPINewTax);
569                                 MPI_File_close(&outMPITempTax);
570                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
571                                 
572 #else
573                 
574                         vector<unsigned long int> positions = m->divideFile(fastaFileNames[s], processors);
575                                 
576                         for (int i = 0; i < (positions.size()-1); i++) {
577                                 lines.push_back(new linePair(positions[i], positions[(i+1)]));
578                         }       
579                         
580                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
581                         if(processors == 1){
582                                 numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
583                         }
584                         else{
585                                 processIDS.resize(0);
586                                 
587                                 numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); 
588                                 
589                         }
590         #else
591                         numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
592         #endif  
593 #endif
594
595                 m->mothurOutEndLine();
596                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
597                 start = time(NULL);
598
599
600                 #ifdef USE_MPI  
601                         if (pid == 0) {  //this part does not need to be paralellized
602                         
603                                 if(namefile != "") { m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush();  MPIReadNamesFile(namefileNames[s]);  m->mothurOut("  Done."); m->mothurOutEndLine(); }
604                 #else
605                         //read namefile
606                         if(namefile != "") {
607                         
608                             m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush();
609                                 
610                                 nameMap.clear(); //remove old names
611                                 
612                                 ifstream inNames;
613                                 m->openInputFile(namefileNames[s], inNames);
614                                 
615                                 string firstCol, secondCol;
616                                 while(!inNames.eof()) {
617                                         inNames >> firstCol >> secondCol; m->gobble(inNames);
618                                         
619                                         vector<string> temp;
620                                         m->splitAtComma(secondCol, temp);
621                         
622                                         nameMap[firstCol] = temp;  
623                                 }
624                                 inNames.close();
625                                 
626                                 m->mothurOut("  Done."); m->mothurOutEndLine();
627                         }
628                 #endif
629
630                         string group = "";
631                         if (groupfile != "") {  group = groupfileNames[s]; }
632                         
633                         PhyloSummary taxaSum(taxonomyFileName, group);
634                         
635                         if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } delete classify; return 0; }
636                 
637                         if (namefile == "") {  taxaSum.summarize(tempTaxonomyFile);  }
638                         else {
639                                 ifstream in;
640                                 m->openInputFile(tempTaxonomyFile, in);
641                                 
642                                 //read in users taxonomy file and add sequences to tree
643                                 string name, taxon;
644                                 
645                                 while(!in.eof()){
646                                         in >> name >> taxon; m->gobble(in);
647                                         
648                                         itNames = nameMap.find(name);
649                 
650                                         if (itNames == nameMap.end()) { 
651                                                 m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
652                                         }else{
653                                                 for (int i = 0; i < itNames->second.size(); i++) { 
654                                                         taxaSum.addSeqToTree(itNames->second[i], taxon);  //add it as many times as there are identical seqs
655                                                 }
656                                                 itNames->second.clear();
657                                                 nameMap.erase(itNames->first);
658                                         }
659                                 }
660                                 in.close();
661                         }
662                         remove(tempTaxonomyFile.c_str());
663                         
664                         if (m->control_pressed) {  outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } delete classify; return 0; }
665                         
666                         //print summary file
667                         ofstream outTaxTree;
668                         m->openOutputFile(taxSummary, outTaxTree);
669                         taxaSum.print(outTaxTree);
670                         outTaxTree.close();
671                         
672                         //output taxonomy with the unclassified bins added
673                         ifstream inTax;
674                         m->openInputFile(newTaxonomyFile, inTax);
675                         
676                         ofstream outTax;
677                         string unclass = newTaxonomyFile + ".unclass.temp";
678                         m->openOutputFile(unclass, outTax);
679                         
680                         //get maxLevel from phylotree so you know how many 'unclassified's to add
681                         int maxLevel = taxaSum.getMaxLevel();
682                                                         
683                         //read taxfile - this reading and rewriting is done to preserve the confidence scores.
684                         string name, taxon;
685                         while (!inTax.eof()) {
686                                 if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; }
687
688                                 inTax >> name >> taxon; m->gobble(inTax);
689                                 
690                                 string newTax = addUnclassifieds(taxon, maxLevel);
691                                 
692                                 outTax << name << '\t' << newTax << endl;
693                         }
694                         inTax.close();  
695                         outTax.close();
696                         
697                         remove(newTaxonomyFile.c_str());
698                         rename(unclass.c_str(), newTaxonomyFile.c_str());
699                         
700                         m->mothurOutEndLine();
701                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
702                         
703                         #ifdef USE_MPI  
704                                 }
705                         #endif
706
707                         m->mothurOutEndLine();
708                         m->mothurOut("Output File Names: "); m->mothurOutEndLine();
709                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
710                         m->mothurOutEndLine();
711                 }
712                 
713                 //set taxonomy file as new current taxonomyfile
714                 string current = "";
715                 itTypes = outputTypes.find("taxonomy");
716                 if (itTypes != outputTypes.end()) {
717                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
718                 }
719                 
720                 delete classify;
721                 return 0;
722         }
723         catch(exception& e) {
724                 m->errorOut(e, "ClassifySeqsCommand", "execute");
725                 exit(1);
726         }
727 }
728
729 /**************************************************************************************************/
730 string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) {
731         try{
732                 string newTax, taxon;
733                 int level = 0;
734                 
735                 //keep what you have counting the levels
736                 while (tax.find_first_of(';') != -1) {
737                         //get taxon
738                         taxon = tax.substr(0,tax.find_first_of(';'))+';';
739                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
740                         newTax += taxon;
741                         level++;
742                 }
743                 
744                 //add "unclassified" until you reach maxLevel
745                 while (level < maxlevel) {
746                         newTax += "unclassified;";
747                         level++;
748                 }
749                 
750                 return newTax;
751         }
752         catch(exception& e) {
753                 m->errorOut(e, "ClassifySeqsCommand", "addUnclassifieds");
754                 exit(1);
755         }
756 }
757
758 /**************************************************************************************************/
759
760 int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) {
761         try {
762 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
763                 int process = 1;
764                 int num = 0;
765                 
766                 //loop through and create all the processes you want
767                 while (process != processors) {
768                         int pid = fork();
769                         
770                         if (pid > 0) {
771                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
772                                 process++;
773                         }else if (pid == 0){
774                                 num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename);
775                                 
776                                 //pass numSeqs to parent
777                                 ofstream out;
778                                 string tempFile = filename + toString(getpid()) + ".num.temp";
779                                 m->openOutputFile(tempFile, out);
780                                 out << num << endl;
781                                 out.close();
782
783                                 exit(0);
784                         }else { 
785                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
786                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
787                                 exit(0);
788                         }
789                 }
790                 
791                 //parent does its part
792                 num = driver(lines[0], taxFileName, tempTaxFile, filename);
793                 
794                 //force parent to wait until all the processes are done
795                 for (int i=0;i<processIDS.size();i++) { 
796                         int temp = processIDS[i];
797                         wait(&temp);
798                 }
799                 
800                 for (int i = 0; i < processIDS.size(); i++) {
801                         ifstream in;
802                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
803                         m->openInputFile(tempFile, in);
804                         if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
805                         in.close(); remove(tempFile.c_str());
806                 }
807                 
808                 for(int i=0;i<processIDS.size();i++){
809                         appendTaxFiles((taxFileName + toString(processIDS[i]) + ".temp"), taxFileName);
810                         appendTaxFiles((tempTaxFile + toString(processIDS[i]) + ".temp"), tempTaxFile);
811                         remove((taxFileName + toString(processIDS[i]) + ".temp").c_str());
812                         remove((tempTaxFile + toString(processIDS[i]) + ".temp").c_str());
813                 }
814                 
815                 return num;
816 #endif          
817         }
818         catch(exception& e) {
819                 m->errorOut(e, "ClassifySeqsCommand", "createProcesses");
820                 exit(1);
821         }
822 }
823 /**************************************************************************************************/
824
825 void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
826         try{
827                 
828                 ofstream output;
829                 ifstream input;
830                 m->openOutputFileAppend(filename, output);
831                 m->openInputFile(temp, input);
832                 
833                 while(char c = input.get()){
834                         if(input.eof())         {       break;                  }
835                         else                            {       output << c;    }
836                 }
837                 
838                 input.close();
839                 output.close();
840         }
841         catch(exception& e) {
842                 m->errorOut(e, "ClassifySeqsCommand", "appendTaxFiles");
843                 exit(1);
844         }
845 }
846
847 //**********************************************************************************************************************
848
849 int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string filename){
850         try {
851                 ofstream outTax;
852                 m->openOutputFile(taxFName, outTax);
853                 
854                 ofstream outTaxSimple;
855                 m->openOutputFile(tempTFName, outTaxSimple);
856         
857                 ifstream inFASTA;
858                 m->openInputFile(filename, inFASTA);
859                 
860                 string taxonomy;
861
862                 inFASTA.seekg(filePos->start);
863
864                 bool done = false;
865                 int count = 0;
866                 
867                 while (!done) {
868                         if (m->control_pressed) { return 0; }
869                 
870                         Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA);
871                         
872                         if (candidateSeq->getName() != "") {
873                         
874                                 taxonomy = classify->getTaxonomy(candidateSeq);
875                                 
876                                 if (m->control_pressed) { delete candidateSeq; return 0; }
877
878                                 if (taxonomy != "bad seq") {
879                                         //output confidence scores or not
880                                         if (probs) {
881                                                 outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
882                                         }else{
883                                                 outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
884                                         }
885                                         
886                                         outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
887                                 }
888                                 count++;
889                         }
890                         delete candidateSeq;
891                         
892                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
893                                 unsigned long int pos = inFASTA.tellg();
894                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
895                         #else
896                                 if (inFASTA.eof()) { break; }
897                         #endif
898                         
899                         //report progress
900                         if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
901                         
902                 }
903                 //report progress
904                 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
905                         
906                 inFASTA.close();
907                 outTax.close();
908                 outTaxSimple.close();
909                 
910                 return count;
911         }
912         catch(exception& e) {
913                 m->errorOut(e, "ClassifySeqsCommand", "driver");
914                 exit(1);
915         }
916 }
917 //**********************************************************************************************************************
918 #ifdef USE_MPI
919 int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector<unsigned long int>& MPIPos){
920         try {
921                 MPI_Status statusNew; 
922                 MPI_Status statusTemp; 
923                 MPI_Status status; 
924                 
925                 int pid;
926                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
927         
928                 string taxonomy;
929                 string outputString;
930
931                 for(int i=0;i<num;i++){
932                 
933                         if (m->control_pressed) { return 0; }
934                 
935                         //read next sequence
936                         int length = MPIPos[start+i+1] - MPIPos[start+i];
937                         char* buf4 = new char[length];
938                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
939                         
940                         string tempBuf = buf4;
941                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
942                         istringstream iss (tempBuf,istringstream::in);
943                         delete buf4;
944
945                         Sequence* candidateSeq = new Sequence(iss);
946                         
947                         if (candidateSeq->getName() != "") {
948                                 taxonomy = classify->getTaxonomy(candidateSeq);
949                                 
950                                 if (taxonomy != "bad seq") {
951                                         //output confidence scores or not
952                                         if (probs) {
953                                                 outputString =  candidateSeq->getName() + "\t" + taxonomy + "\n";
954                                         }else{
955                                                 outputString =  candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
956                                         }
957                                         
958                                         int length = outputString.length();
959                                         char* buf2 = new char[length];
960                                         memcpy(buf2, outputString.c_str(), length);
961                                 
962                                         MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew);
963                                         delete buf2;
964
965                                         outputString =  candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
966                                         length = outputString.length();
967                                         char* buf = new char[length];
968                                         memcpy(buf, outputString.c_str(), length);
969                                 
970                                         MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp);
971                                         delete buf;
972                                 }
973                         }                               
974                         delete candidateSeq;
975                         
976                         if((i+1) % 100 == 0){   cout << "Classifying sequence " << (i+1) << endl;       }
977                 }
978                 
979                 if(num % 100 != 0){     cout << "Classifying sequence " << (num) << endl;       }
980                 
981                 
982                 return 1;
983         }
984         catch(exception& e) {
985                 m->errorOut(e, "ClassifySeqsCommand", "driverMPI");
986                 exit(1);
987         }
988 }
989
990 //**********************************************************************************************************************
991 int ClassifySeqsCommand::MPIReadNamesFile(string nameFilename){
992         try {
993         
994                 nameMap.clear(); //remove old names
995                 
996                 MPI_File inMPI;
997                 MPI_Offset size;
998                 MPI_Status status;
999
1000                 //char* inFileName = new char[nameFilename.length()];
1001                 //memcpy(inFileName, nameFilename.c_str(), nameFilename.length());
1002                 
1003                 char inFileName[1024];
1004                 strcpy(inFileName, nameFilename.c_str());
1005
1006                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
1007                 MPI_File_get_size(inMPI, &size);
1008                 //delete inFileName;
1009
1010                 char* buffer = new char[size];
1011                 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
1012
1013                 string tempBuf = buffer;
1014                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
1015                 istringstream iss (tempBuf,istringstream::in);
1016                 delete buffer;
1017                 
1018                 string firstCol, secondCol;
1019                 while(!iss.eof()) {
1020                         iss >> firstCol >> secondCol; m->gobble(iss);
1021                         
1022                         vector<string> temp;
1023                         m->splitAtComma(secondCol, temp);
1024                         
1025                         nameMap[firstCol] = temp;  
1026                 }
1027         
1028                 MPI_File_close(&inMPI);
1029                 
1030                 return 1;
1031         }
1032         catch(exception& e) {
1033                 m->errorOut(e, "ClassifySeqsCommand", "MPIReadNamesFile");
1034                 exit(1);
1035         }
1036 }
1037 #endif
1038 /**************************************************************************************************/