]> git.donarmstrong.com Git - mothur.git/blob - classifyseqscommand.cpp
added current as option is lists of file names, processors now outputted with current...
[mothur.git] / classifyseqscommand.cpp
1 /*
2  *  classifyseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/2/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyseqscommand.h"
11 #include "sequence.hpp"
12 #include "bayesian.h"
13 #include "phylotree.h"
14 #include "phylosummary.h"
15 #include "knn.h"
16
17
18 //**********************************************************************************************************************
19 vector<string> ClassifySeqsCommand::setParameters(){    
20         try {
21                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy);
22                 CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate);
23                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
24                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
25                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
26                 CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance", "kmer", "", "", "",false,false); parameters.push_back(psearch);
27                 CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize);
28                 CommandParameter pmethod("method", "Multiple", "bayesian-knn", "bayesian", "", "", "",false,false); parameters.push_back(pmethod);
29                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
30                 CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch);
31                 CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch);
32                 CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen);
33                 CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend);
34                 CommandParameter pcutoff("cutoff", "Number", "", "0", "", "", "",false,true); parameters.push_back(pcutoff);
35                 CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs);
36                 CommandParameter piters("iters", "Number", "", "100", "", "", "",false,true); parameters.push_back(piters);
37                 CommandParameter pnumwanted("numwanted", "Number", "", "10", "", "", "",false,true); parameters.push_back(pnumwanted);
38                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
39                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
40                 
41                 vector<string> myArray;
42                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "ClassifySeqsCommand", "setParameters");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 string ClassifySeqsCommand::getHelpString(){    
52         try {
53                 string helpString = "";
54                 helpString += "The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n";
55                 helpString += "The classify.seqs command parameters are reference, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n";
56                 helpString += "The reference, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n";
57                 helpString += "The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer, blast and distance. The default is kmer.\n";
58                 helpString += "The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n";
59                 helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n";
60                 helpString += "The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n";
61                 helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n";
62                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
63 #ifdef USE_MPI
64                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
65 #endif
66                 helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n";
67                 helpString += "The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n";
68                 helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
69                 helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n";
70                 helpString += "The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n";
71                 helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n";
72                 helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n";
73                 helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method.  The default is 100.\n";
74                 helpString += "The classify.seqs command should be in the following format: \n";
75                 helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n";
76                 helpString += "Example classify.seqs(fasta=amazon.fasta, reference=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n";
77                 helpString += "The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n";
78                 helpString += "The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n";
79                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
80                 return helpString;
81         }
82         catch(exception& e) {
83                 m->errorOut(e, "ClassifySeqsCommand", "getHelpString");
84                 exit(1);
85         }
86 }
87 //**********************************************************************************************************************
88 ClassifySeqsCommand::ClassifySeqsCommand(){     
89         try {
90                 abort = true; calledHelp = true; 
91                 setParameters();
92                 vector<string> tempOutNames;
93                 outputTypes["taxonomy"] = tempOutNames;
94                 outputTypes["taxsummary"] = tempOutNames;
95                 outputTypes["matchdist"] = tempOutNames;
96         }
97         catch(exception& e) {
98                 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
99                 exit(1);
100         }
101 }
102 //**********************************************************************************************************************
103 ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
104         try {
105                 abort = false; calledHelp = false;   
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; calledHelp = true; }
109                 
110                 else {
111                         vector<string> myArray = setParameters();
112                         
113                         OptionParser parser(option);
114                         map<string, string> parameters = parser.getParameters(); 
115                         
116                         ValidParameters validParameter("classify.seqs");
117                         map<string, string>::iterator it;
118                         
119                         //check to make sure all parameters are valid for command
120                         for (it = parameters.begin(); it != parameters.end(); it++) { 
121                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
122                         }
123                         
124                         //initialize outputTypes
125                         vector<string> tempOutNames;
126                         outputTypes["taxonomy"] = tempOutNames;
127                         outputTypes["taxsummary"] = tempOutNames;
128                         outputTypes["matchdist"] = tempOutNames;
129                         
130                         //if the user changes the output directory command factory will send this info to us in the output parameter 
131                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
132                         
133                         //if the user changes the input directory command factory will send this info to us in the output parameter 
134                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
135                         if (inputDir == "not found"){   inputDir = "";          }
136                         else {
137                                 string path;
138                                 it = parameters.find("reference");
139                                 //user has given a template file
140                                 if(it != parameters.end()){ 
141                                         path = m->hasPath(it->second);
142                                         //if the user has not given a path then, add inputdir. else leave path alone.
143                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
144                                 }
145                                 
146                                 it = parameters.find("taxonomy");
147                                 //user has given a template file
148                                 if(it != parameters.end()){ 
149                                         path = m->hasPath(it->second);
150                                         //if the user has not given a path then, add inputdir. else leave path alone.
151                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
152                                 }
153                                 
154                                 it = parameters.find("group");
155                                 //user has given a template file
156                                 if(it != parameters.end()){ 
157                                         path = m->hasPath(it->second);
158                                         //if the user has not given a path then, add inputdir. else leave path alone.
159                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
160                                 }
161                         }
162
163                         //check for required parameters
164                         templateFileName = validParameter.validFile(parameters, "reference", true);
165                         if (templateFileName == "not found") { 
166                                 m->mothurOut("reference is a required parameter for the classify.seqs command."); 
167                                 m->mothurOutEndLine();
168                                 abort = true; 
169                         }
170                         else if (templateFileName == "not open") { abort = true; }      
171                         
172                                                 
173                         fastaFileName = validParameter.validFile(parameters, "fasta", false);
174                         if (fastaFileName == "not found") {                             
175                                 //if there is a current fasta file, use it
176                                 string filename = m->getFastaFile(); 
177                                 if (filename != "") { fastaFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
178                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
179                         }
180                         else { 
181                                 m->splitAtDash(fastaFileName, fastaFileNames);
182                                 
183                                 //go through files and make sure they are good, if not, then disregard them
184                                 for (int i = 0; i < fastaFileNames.size(); i++) {
185                                         
186                                         bool ignore = false;
187                                         if (fastaFileNames[i] == "current") { 
188                                                 fastaFileNames[i] = m->getFastaFile(); 
189                                                 if (fastaFileNames[i] != "") {  m->mothurOut("Using " + fastaFileNames[i] + " as input file for the fasta parameter where you had given current."); m->mothurOutEndLine(); }
190                                                 else {  
191                                                         m->mothurOut("You have no current fastafile, ignoring current."); m->mothurOutEndLine(); ignore=true; 
192                                                         //erase from file list
193                                                         fastaFileNames.erase(fastaFileNames.begin()+i);
194                                                         i--;
195                                                 }
196                                         }
197                                         
198                                         if (!ignore) {
199                                                 
200                                                 if (inputDir != "") {
201                                                         string path = m->hasPath(fastaFileNames[i]);
202                                                         //if the user has not given a path then, add inputdir. else leave path alone.
203                                                         if (path == "") {       fastaFileNames[i] = inputDir + fastaFileNames[i];               }
204                                                 }
205                                                 
206                                                 int ableToOpen;
207                                                 
208                                                 ifstream in;
209                                                 ableToOpen = m->openInputFile(fastaFileNames[i], in, "noerror");
210                                         
211                                                 //if you can't open it, try default location
212                                                 if (ableToOpen == 1) {
213                                                         if (m->getDefaultPath() != "") { //default path is set
214                                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]);
215                                                                 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
216                                                                 ifstream in2;
217                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
218                                                                 in2.close();
219                                                                 fastaFileNames[i] = tryPath;
220                                                         }
221                                                 }
222                                                 
223                                                 if (ableToOpen == 1) {
224                                                         if (m->getOutputDir() != "") { //default path is set
225                                                                 string tryPath = m->getOutputDir() + m->getSimpleName(fastaFileNames[i]);
226                                                                 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
227                                                                 ifstream in2;
228                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
229                                                                 in2.close();
230                                                                 fastaFileNames[i] = tryPath;
231                                                         }
232                                                 }
233                                                 
234                                                 in.close();
235                                                 
236                                                 if (ableToOpen == 1) { 
237                                                         m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); 
238                                                         //erase from file list
239                                                         fastaFileNames.erase(fastaFileNames.begin()+i);
240                                                         i--;
241                                                 }
242                                         }
243                                         
244                                 }
245                                 
246                                 //make sure there is at least one valid file left
247                                 if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
248                         }
249
250                         
251                         taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
252                         if (taxonomyFileName == "not found") { 
253                                 m->mothurOut("taxonomy is a required parameter for the classify.seqs command."); 
254                                 m->mothurOutEndLine();
255                                 abort = true; 
256                         }
257                         else if (taxonomyFileName == "not open") { abort = true; }      
258                         
259                         
260                         namefile = validParameter.validFile(parameters, "name", false);
261                         if (namefile == "not found") { namefile = "";  }
262
263                         else { 
264                                 m->splitAtDash(namefile, namefileNames);
265                                 
266                                 //go through files and make sure they are good, if not, then disregard them
267                                 for (int i = 0; i < namefileNames.size(); i++) {
268                                         bool ignore = false;
269                                         if (namefileNames[i] == "current") { 
270                                                 namefileNames[i] = m->getNameFile(); 
271                                                 if (namefileNames[i] != "") {  m->mothurOut("Using " + namefileNames[i] + " as input file for the name parameter where you had given current."); m->mothurOutEndLine(); }
272                                                 else {  
273                                                         m->mothurOut("You have no current namefile, ignoring current."); m->mothurOutEndLine(); ignore=true; 
274                                                         //erase from file list
275                                                         namefileNames.erase(namefileNames.begin()+i);
276                                                         i--;
277                                                 }
278                                         }
279                                         
280                                         if (!ignore) {
281                                                 
282                                                 if (inputDir != "") {
283                                                         string path = m->hasPath(namefileNames[i]);
284                                                         //if the user has not given a path then, add inputdir. else leave path alone.
285                                                         if (path == "") {       namefileNames[i] = inputDir + namefileNames[i];         }
286                                                 }
287                                                 int ableToOpen;
288                                                 
289                                                 ifstream in;
290                                                 ableToOpen = m->openInputFile(namefileNames[i], in, "noerror");
291                                         
292                                                 //if you can't open it, try default location
293                                                 if (ableToOpen == 1) {
294                                                         if (m->getDefaultPath() != "") { //default path is set
295                                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(namefileNames[i]);
296                                                                 m->mothurOut("Unable to open " + namefileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
297                                                                 ifstream in2;
298                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
299                                                                 in2.close();
300                                                                 namefileNames[i] = tryPath;
301                                                         }
302                                                 }
303                                                 
304                                                 if (ableToOpen == 1) {
305                                                         if (m->getOutputDir() != "") { //default path is set
306                                                                 string tryPath = m->getOutputDir() + m->getSimpleName(namefileNames[i]);
307                                                                 m->mothurOut("Unable to open " + namefileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
308                                                                 ifstream in2;
309                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
310                                                                 in2.close();
311                                                                 namefileNames[i] = tryPath;
312                                                         }
313                                                 }
314                                                 in.close();
315                                                 
316                                                 if (ableToOpen == 1) { 
317                                                         m->mothurOut("Unable to open " + namefileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();  abort = true;
318                                                         //erase from file list
319                                                         namefileNames.erase(namefileNames.begin()+i);
320                                                         i--;
321                                                 }
322                                         }
323                                 }
324                         }
325
326                         if (namefile != "") {
327                                 if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); }
328                         }
329                         
330                         groupfile = validParameter.validFile(parameters, "group", false);
331                         if (groupfile == "not found") { groupfile = "";  }
332                         else { 
333                                 m->splitAtDash(groupfile, groupfileNames);
334                                 
335                                 //go through files and make sure they are good, if not, then disregard them
336                                 for (int i = 0; i < groupfileNames.size(); i++) {
337                                         if (inputDir != "") {
338                                                 string path = m->hasPath(groupfileNames[i]);
339                                                 //if the user has not given a path then, add inputdir. else leave path alone.
340                                                 if (path == "") {       groupfileNames[i] = inputDir + groupfileNames[i];               }
341                                         }
342                                         int ableToOpen;
343                                         
344                                         ifstream in;
345                                         ableToOpen = m->openInputFile(groupfileNames[i], in, "noerror");
346                                 
347                                         //if you can't open it, try default location
348                                         if (ableToOpen == 1) {
349                                                 if (m->getDefaultPath() != "") { //default path is set
350                                                         string tryPath = m->getDefaultPath() + m->getSimpleName(groupfileNames[i]);
351                                                         m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
352                                                         ifstream in2;
353                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
354                                                         in2.close();
355                                                         groupfileNames[i] = tryPath;
356                                                 }
357                                         }
358                                         
359                                         if (ableToOpen == 1) {
360                                                 if (m->getOutputDir() != "") { //default path is set
361                                                         string tryPath = m->getOutputDir() + m->getSimpleName(groupfileNames[i]);
362                                                         m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
363                                                         ifstream in2;
364                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
365                                                         in2.close();
366                                                         groupfileNames[i] = tryPath;
367                                                 }
368                                         }
369                                         
370                                         in.close();
371                                         
372                                         if (ableToOpen == 1) { 
373                                                 m->mothurOut("Unable to open " + groupfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); groupfileNames[i] = "";
374                                                 //erase from file list
375                                                 groupfileNames.erase(groupfileNames.begin()+i);
376                                                 i--;
377                                         }
378                                 }
379                         }
380
381                         if (groupfile != "") {
382                                 if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); }
383                         }else {
384                                 for (int i = 0; i < fastaFileNames.size(); i++) {  groupfileNames.push_back("");  }
385                         }
386                         
387                         //check for optional parameter and set defaults
388                         // ...at some point should added some additional type checking...
389                         string temp;
390                         temp = validParameter.validFile(parameters, "ksize", false);            if (temp == "not found"){       temp = "8";                             }
391                         convert(temp, kmerSize); 
392                         
393                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
394                         m->setProcessors(temp);
395                         convert(temp, processors); 
396                         
397                         search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
398                         
399                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "bayesian";    }
400                         
401                         temp = validParameter.validFile(parameters, "match", false);            if (temp == "not found"){       temp = "1.0";                   }
402                         convert(temp, match);  
403                         
404                         temp = validParameter.validFile(parameters, "mismatch", false);         if (temp == "not found"){       temp = "-1.0";                  }
405                         convert(temp, misMatch);  
406                         
407                         temp = validParameter.validFile(parameters, "gapopen", false);          if (temp == "not found"){       temp = "-2.0";                  }
408                         convert(temp, gapOpen);  
409                         
410                         temp = validParameter.validFile(parameters, "gapextend", false);        if (temp == "not found"){       temp = "-1.0";                  }
411                         convert(temp, gapExtend); 
412                         
413                         temp = validParameter.validFile(parameters, "numwanted", false);        if (temp == "not found"){       temp = "10";                    }
414                         convert(temp, numWanted);
415                         
416                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found"){       temp = "0";                             }
417                         convert(temp, cutoff);
418                         
419                         temp = validParameter.validFile(parameters, "probs", false);            if (temp == "not found"){       temp = "true";                  }
420                         probs = m->isTrue(temp);
421                         
422                         temp = validParameter.validFile(parameters, "iters", false);            if (temp == "not found") { temp = "100";                        }
423                         convert(temp, iters); 
424
425
426                         
427                         if ((method == "bayesian") && (search != "kmer"))  { 
428                                 m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine();
429                                 search = "kmer";
430                         }
431                 }
432                 
433         }
434         catch(exception& e) {
435                 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
436                 exit(1);
437         }
438 }
439
440 //**********************************************************************************************************************
441 ClassifySeqsCommand::~ClassifySeqsCommand(){    
442         if (abort == false) {
443                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
444         }
445 }
446 //**********************************************************************************************************************
447
448 int ClassifySeqsCommand::execute(){
449         try {
450                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
451                 
452                 if(method == "bayesian"){       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);           }
453                 else if(method == "knn"){       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted);                               }
454                 else {
455                         m->mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
456                         m->mothurOutEndLine();
457                         classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);   
458                 }
459                 
460                 if (m->control_pressed) { delete classify; return 0; }
461                 
462                                 
463                 for (int s = 0; s < fastaFileNames.size(); s++) {
464                 
465                         m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
466                         
467                         string RippedTaxName = m->getRootName(m->getSimpleName(taxonomyFileName));
468                         RippedTaxName = m->getExtension(RippedTaxName.substr(0, RippedTaxName.length()-1));
469                         if (RippedTaxName[0] == '.') { RippedTaxName = RippedTaxName.substr(1, RippedTaxName.length()); }
470                         RippedTaxName +=  "."; 
471                 
472                         if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); }
473                         string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "taxonomy";
474                         string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp";
475                         string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "tax.summary";
476                         
477                         if ((method == "knn") && (search == "distance")) { 
478                                 string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "match.dist";
479                                 classify->setDistName(DistName);  outputNames.push_back(DistName); outputTypes["matchdist"].push_back(DistName);
480                         }
481                         
482                         outputNames.push_back(newTaxonomyFile); outputTypes["taxonomy"].push_back(newTaxonomyFile);
483                         outputNames.push_back(taxSummary);      outputTypes["taxsummary"].push_back(taxSummary);
484                         
485                         int start = time(NULL);
486                         int numFastaSeqs = 0;
487                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
488                         
489 #ifdef USE_MPI  
490                                 int pid, numSeqsPerProcessor; 
491                                 int tag = 2001;
492                                 vector<unsigned long int> MPIPos;
493                                 
494                                 MPI_Status status; 
495                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
496                                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
497
498                                 MPI_File inMPI;
499                                 MPI_File outMPINewTax;
500                                 MPI_File outMPITempTax;
501                                                         
502                                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
503                                 int inMode=MPI_MODE_RDONLY; 
504                                 
505                                 char outNewTax[1024];
506                                 strcpy(outNewTax, newTaxonomyFile.c_str());
507                                 
508                                 char outTempTax[1024];
509                                 strcpy(outTempTax, tempTaxonomyFile.c_str());
510                                 
511                                 char inFileName[1024];
512                                 strcpy(inFileName, fastaFileNames[s].c_str());
513
514                                 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
515                                 MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax);
516                                 MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax);
517                                 
518                                 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  delete classify; return 0;  }
519                                 
520                                 if (pid == 0) { //you are the root process 
521                                         
522                                         MPIPos = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs); //fills MPIPos, returns numSeqs
523                                         
524                                         //send file positions to all processes
525                                         for(int i = 1; i < processors; i++) { 
526                                                 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
527                                                 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
528                                         }
529                                         
530                                         //figure out how many sequences you have to align
531                                         numSeqsPerProcessor = numFastaSeqs / processors;
532                                         int startIndex =  pid * numSeqsPerProcessor;
533                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
534                                         
535                                 
536                                         //align your part
537                                         driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos);
538                                         
539                                         if (m->control_pressed) {  outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } delete classify; return 0;  }
540                                         
541                                         for (int i = 1; i < processors; i++) {
542                                                 int done;
543                                                 MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
544                                         }
545                                 }else{ //you are a child process
546                                         MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
547                                         MPIPos.resize(numFastaSeqs+1);
548                                         MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
549                                         
550                                         //figure out how many sequences you have to align
551                                         numSeqsPerProcessor = numFastaSeqs / processors;
552                                         int startIndex =  pid * numSeqsPerProcessor;
553                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
554                                         
555                                         
556                                         //align your part
557                                         driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos);
558                                         
559                                         if (m->control_pressed) {  outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  delete classify; return 0;  }
560
561                                         int done = 0;
562                                         MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); 
563                                 }
564                                 
565                                 //close files 
566                                 MPI_File_close(&inMPI);
567                                 MPI_File_close(&outMPINewTax);
568                                 MPI_File_close(&outMPITempTax);
569                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
570                                 
571 #else
572                 
573                         vector<unsigned long int> positions = m->divideFile(fastaFileNames[s], processors);
574                                 
575                         for (int i = 0; i < (positions.size()-1); i++) {
576                                 lines.push_back(new linePair(positions[i], positions[(i+1)]));
577                         }       
578                         
579                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
580                         if(processors == 1){
581                                 numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
582                         }
583                         else{
584                                 processIDS.resize(0);
585                                 
586                                 numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); 
587                                 
588                         }
589         #else
590                         numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
591         #endif  
592 #endif
593
594                 m->mothurOutEndLine();
595                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
596                 start = time(NULL);
597
598
599                 #ifdef USE_MPI  
600                         if (pid == 0) {  //this part does not need to be paralellized
601                         
602                                 if(namefile != "") { m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush();  MPIReadNamesFile(namefileNames[s]);  m->mothurOut("  Done."); m->mothurOutEndLine(); }
603                 #else
604                         //read namefile
605                         if(namefile != "") {
606                         
607                             m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush();
608                                 
609                                 nameMap.clear(); //remove old names
610                                 
611                                 ifstream inNames;
612                                 m->openInputFile(namefileNames[s], inNames);
613                                 
614                                 string firstCol, secondCol;
615                                 while(!inNames.eof()) {
616                                         inNames >> firstCol >> secondCol; m->gobble(inNames);
617                                         
618                                         vector<string> temp;
619                                         m->splitAtComma(secondCol, temp);
620                         
621                                         nameMap[firstCol] = temp;  
622                                 }
623                                 inNames.close();
624                                 
625                                 m->mothurOut("  Done."); m->mothurOutEndLine();
626                         }
627                 #endif
628
629                         string group = "";
630                         if (groupfile != "") {  group = groupfileNames[s]; }
631                         
632                         PhyloSummary taxaSum(taxonomyFileName, group);
633                         
634                         if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } delete classify; return 0; }
635                 
636                         if (namefile == "") {  taxaSum.summarize(tempTaxonomyFile);  }
637                         else {
638                                 ifstream in;
639                                 m->openInputFile(tempTaxonomyFile, in);
640                                 
641                                 //read in users taxonomy file and add sequences to tree
642                                 string name, taxon;
643                                 
644                                 while(!in.eof()){
645                                         in >> name >> taxon; m->gobble(in);
646                                         
647                                         itNames = nameMap.find(name);
648                 
649                                         if (itNames == nameMap.end()) { 
650                                                 m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
651                                         }else{
652                                                 for (int i = 0; i < itNames->second.size(); i++) { 
653                                                         taxaSum.addSeqToTree(itNames->second[i], taxon);  //add it as many times as there are identical seqs
654                                                 }
655                                                 itNames->second.clear();
656                                                 nameMap.erase(itNames->first);
657                                         }
658                                 }
659                                 in.close();
660                         }
661                         remove(tempTaxonomyFile.c_str());
662                         
663                         if (m->control_pressed) {  outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } delete classify; return 0; }
664                         
665                         //print summary file
666                         ofstream outTaxTree;
667                         m->openOutputFile(taxSummary, outTaxTree);
668                         taxaSum.print(outTaxTree);
669                         outTaxTree.close();
670                         
671                         //output taxonomy with the unclassified bins added
672                         ifstream inTax;
673                         m->openInputFile(newTaxonomyFile, inTax);
674                         
675                         ofstream outTax;
676                         string unclass = newTaxonomyFile + ".unclass.temp";
677                         m->openOutputFile(unclass, outTax);
678                         
679                         //get maxLevel from phylotree so you know how many 'unclassified's to add
680                         int maxLevel = taxaSum.getMaxLevel();
681                                                         
682                         //read taxfile - this reading and rewriting is done to preserve the confidence scores.
683                         string name, taxon;
684                         while (!inTax.eof()) {
685                                 if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; }
686
687                                 inTax >> name >> taxon; m->gobble(inTax);
688                                 
689                                 string newTax = addUnclassifieds(taxon, maxLevel);
690                                 
691                                 outTax << name << '\t' << newTax << endl;
692                         }
693                         inTax.close();  
694                         outTax.close();
695                         
696                         remove(newTaxonomyFile.c_str());
697                         rename(unclass.c_str(), newTaxonomyFile.c_str());
698                         
699                         m->mothurOutEndLine();
700                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
701                         
702                         #ifdef USE_MPI  
703                                 }
704                         #endif
705
706                         m->mothurOutEndLine();
707                         m->mothurOut("Output File Names: "); m->mothurOutEndLine();
708                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
709                         m->mothurOutEndLine();
710                 }
711                 
712                 //set taxonomy file as new current taxonomyfile
713                 string current = "";
714                 itTypes = outputTypes.find("taxonomy");
715                 if (itTypes != outputTypes.end()) {
716                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
717                 }
718                 
719                 delete classify;
720                 return 0;
721         }
722         catch(exception& e) {
723                 m->errorOut(e, "ClassifySeqsCommand", "execute");
724                 exit(1);
725         }
726 }
727
728 /**************************************************************************************************/
729 string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) {
730         try{
731                 string newTax, taxon;
732                 int level = 0;
733                 
734                 //keep what you have counting the levels
735                 while (tax.find_first_of(';') != -1) {
736                         //get taxon
737                         taxon = tax.substr(0,tax.find_first_of(';'))+';';
738                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
739                         newTax += taxon;
740                         level++;
741                 }
742                 
743                 //add "unclassified" until you reach maxLevel
744                 while (level < maxlevel) {
745                         newTax += "unclassified;";
746                         level++;
747                 }
748                 
749                 return newTax;
750         }
751         catch(exception& e) {
752                 m->errorOut(e, "ClassifySeqsCommand", "addUnclassifieds");
753                 exit(1);
754         }
755 }
756
757 /**************************************************************************************************/
758
759 int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) {
760         try {
761 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
762                 int process = 1;
763                 int num = 0;
764                 
765                 //loop through and create all the processes you want
766                 while (process != processors) {
767                         int pid = fork();
768                         
769                         if (pid > 0) {
770                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
771                                 process++;
772                         }else if (pid == 0){
773                                 num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename);
774                                 
775                                 //pass numSeqs to parent
776                                 ofstream out;
777                                 string tempFile = filename + toString(getpid()) + ".num.temp";
778                                 m->openOutputFile(tempFile, out);
779                                 out << num << endl;
780                                 out.close();
781
782                                 exit(0);
783                         }else { 
784                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
785                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
786                                 exit(0);
787                         }
788                 }
789                 
790                 //parent does its part
791                 num = driver(lines[0], taxFileName, tempTaxFile, filename);
792                 
793                 //force parent to wait until all the processes are done
794                 for (int i=0;i<processIDS.size();i++) { 
795                         int temp = processIDS[i];
796                         wait(&temp);
797                 }
798                 
799                 for (int i = 0; i < processIDS.size(); i++) {
800                         ifstream in;
801                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
802                         m->openInputFile(tempFile, in);
803                         if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
804                         in.close(); remove(tempFile.c_str());
805                 }
806                 
807                 for(int i=0;i<processIDS.size();i++){
808                         appendTaxFiles((taxFileName + toString(processIDS[i]) + ".temp"), taxFileName);
809                         appendTaxFiles((tempTaxFile + toString(processIDS[i]) + ".temp"), tempTaxFile);
810                         remove((taxFileName + toString(processIDS[i]) + ".temp").c_str());
811                         remove((tempTaxFile + toString(processIDS[i]) + ".temp").c_str());
812                 }
813                 
814                 return num;
815 #endif          
816         }
817         catch(exception& e) {
818                 m->errorOut(e, "ClassifySeqsCommand", "createProcesses");
819                 exit(1);
820         }
821 }
822 /**************************************************************************************************/
823
824 void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
825         try{
826                 
827                 ofstream output;
828                 ifstream input;
829                 m->openOutputFileAppend(filename, output);
830                 m->openInputFile(temp, input);
831                 
832                 while(char c = input.get()){
833                         if(input.eof())         {       break;                  }
834                         else                            {       output << c;    }
835                 }
836                 
837                 input.close();
838                 output.close();
839         }
840         catch(exception& e) {
841                 m->errorOut(e, "ClassifySeqsCommand", "appendTaxFiles");
842                 exit(1);
843         }
844 }
845
846 //**********************************************************************************************************************
847
848 int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string filename){
849         try {
850                 ofstream outTax;
851                 m->openOutputFile(taxFName, outTax);
852                 
853                 ofstream outTaxSimple;
854                 m->openOutputFile(tempTFName, outTaxSimple);
855         
856                 ifstream inFASTA;
857                 m->openInputFile(filename, inFASTA);
858                 
859                 string taxonomy;
860
861                 inFASTA.seekg(filePos->start);
862
863                 bool done = false;
864                 int count = 0;
865                 
866                 while (!done) {
867                         if (m->control_pressed) { return 0; }
868                 
869                         Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA);
870                         
871                         if (candidateSeq->getName() != "") {
872                         
873                                 taxonomy = classify->getTaxonomy(candidateSeq);
874                                 
875                                 if (m->control_pressed) { delete candidateSeq; return 0; }
876
877                                 if (taxonomy != "bad seq") {
878                                         //output confidence scores or not
879                                         if (probs) {
880                                                 outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
881                                         }else{
882                                                 outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
883                                         }
884                                         
885                                         outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
886                                 }
887                                 count++;
888                         }
889                         delete candidateSeq;
890                         
891                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
892                                 unsigned long int pos = inFASTA.tellg();
893                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
894                         #else
895                                 if (inFASTA.eof()) { break; }
896                         #endif
897                         
898                         //report progress
899                         if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
900                         
901                 }
902                 //report progress
903                 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
904                         
905                 inFASTA.close();
906                 outTax.close();
907                 outTaxSimple.close();
908                 
909                 return count;
910         }
911         catch(exception& e) {
912                 m->errorOut(e, "ClassifySeqsCommand", "driver");
913                 exit(1);
914         }
915 }
916 //**********************************************************************************************************************
917 #ifdef USE_MPI
918 int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector<unsigned long int>& MPIPos){
919         try {
920                 MPI_Status statusNew; 
921                 MPI_Status statusTemp; 
922                 MPI_Status status; 
923                 
924                 int pid;
925                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
926         
927                 string taxonomy;
928                 string outputString;
929
930                 for(int i=0;i<num;i++){
931                 
932                         if (m->control_pressed) { return 0; }
933                 
934                         //read next sequence
935                         int length = MPIPos[start+i+1] - MPIPos[start+i];
936                         char* buf4 = new char[length];
937                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
938                         
939                         string tempBuf = buf4;
940                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
941                         istringstream iss (tempBuf,istringstream::in);
942                         delete buf4;
943
944                         Sequence* candidateSeq = new Sequence(iss);
945                         
946                         if (candidateSeq->getName() != "") {
947                                 taxonomy = classify->getTaxonomy(candidateSeq);
948                                 
949                                 if (taxonomy != "bad seq") {
950                                         //output confidence scores or not
951                                         if (probs) {
952                                                 outputString =  candidateSeq->getName() + "\t" + taxonomy + "\n";
953                                         }else{
954                                                 outputString =  candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
955                                         }
956                                         
957                                         int length = outputString.length();
958                                         char* buf2 = new char[length];
959                                         memcpy(buf2, outputString.c_str(), length);
960                                 
961                                         MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew);
962                                         delete buf2;
963
964                                         outputString =  candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
965                                         length = outputString.length();
966                                         char* buf = new char[length];
967                                         memcpy(buf, outputString.c_str(), length);
968                                 
969                                         MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp);
970                                         delete buf;
971                                 }
972                         }                               
973                         delete candidateSeq;
974                         
975                         if((i+1) % 100 == 0){   cout << "Classifying sequence " << (i+1) << endl;       }
976                 }
977                 
978                 if(num % 100 != 0){     cout << "Classifying sequence " << (num) << endl;       }
979                 
980                 
981                 return 1;
982         }
983         catch(exception& e) {
984                 m->errorOut(e, "ClassifySeqsCommand", "driverMPI");
985                 exit(1);
986         }
987 }
988
989 //**********************************************************************************************************************
990 int ClassifySeqsCommand::MPIReadNamesFile(string nameFilename){
991         try {
992         
993                 nameMap.clear(); //remove old names
994                 
995                 MPI_File inMPI;
996                 MPI_Offset size;
997                 MPI_Status status;
998
999                 //char* inFileName = new char[nameFilename.length()];
1000                 //memcpy(inFileName, nameFilename.c_str(), nameFilename.length());
1001                 
1002                 char inFileName[1024];
1003                 strcpy(inFileName, nameFilename.c_str());
1004
1005                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
1006                 MPI_File_get_size(inMPI, &size);
1007                 //delete inFileName;
1008
1009                 char* buffer = new char[size];
1010                 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
1011
1012                 string tempBuf = buffer;
1013                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
1014                 istringstream iss (tempBuf,istringstream::in);
1015                 delete buffer;
1016                 
1017                 string firstCol, secondCol;
1018                 while(!iss.eof()) {
1019                         iss >> firstCol >> secondCol; m->gobble(iss);
1020                         
1021                         vector<string> temp;
1022                         m->splitAtComma(secondCol, temp);
1023                         
1024                         nameMap[firstCol] = temp;  
1025                 }
1026         
1027                 MPI_File_close(&inMPI);
1028                 
1029                 return 1;
1030         }
1031         catch(exception& e) {
1032                 m->errorOut(e, "ClassifySeqsCommand", "MPIReadNamesFile");
1033                 exit(1);
1034         }
1035 }
1036 #endif
1037 /**************************************************************************************************/