]> git.donarmstrong.com Git - mothur.git/blob - classifyseqscommand.cpp
added getCommandInfoCommand for gui
[mothur.git] / classifyseqscommand.cpp
1 /*
2  *  classifyseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/2/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyseqscommand.h"
11 #include "sequence.hpp"
12 #include "bayesian.h"
13 #include "phylotree.h"
14 #include "phylosummary.h"
15 #include "knn.h"
16
17
18 //**********************************************************************************************************************
19 vector<string> ClassifySeqsCommand::setParameters(){    
20         try {
21                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy);
22                 CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate);
23                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
24                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
25                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
26                 CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance", "kmer", "", "", "",false,false); parameters.push_back(psearch);
27                 CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize);
28                 CommandParameter pmethod("method", "Multiple", "bayesian-knn", "bayesian", "", "", "",false,false); parameters.push_back(pmethod);
29                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
30                 CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch);
31                 CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch);
32                 CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen);
33                 CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend);
34                 CommandParameter pcutoff("cutoff", "Number", "", "0", "", "", "",false,true); parameters.push_back(pcutoff);
35                 CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs);
36                 CommandParameter piters("iters", "Number", "", "100", "", "", "",false,true); parameters.push_back(piters);
37                 CommandParameter pnumwanted("numwanted", "Number", "", "10", "", "", "",false,true); parameters.push_back(pnumwanted);
38                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
39                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
40                 
41                 vector<string> myArray;
42                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "ClassifySeqsCommand", "setParameters");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 string ClassifySeqsCommand::getHelpString(){    
52         try {
53                 string helpString = "";
54                 helpString += "The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n";
55                 helpString += "The classify.seqs command parameters are reference, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n";
56                 helpString += "The reference, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n";
57                 helpString += "The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer, blast and distance. The default is kmer.\n";
58                 helpString += "The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n";
59                 helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n";
60                 helpString += "The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n";
61                 helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n";
62                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
63 #ifdef USE_MPI
64                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
65 #endif
66                 helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n";
67                 helpString += "The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n";
68                 helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
69                 helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n";
70                 helpString += "The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n";
71                 helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n";
72                 helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n";
73                 helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method.  The default is 100.\n";
74                 helpString += "The classify.seqs command should be in the following format: \n";
75                 helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n";
76                 helpString += "Example classify.seqs(fasta=amazon.fasta, reference=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n";
77                 helpString += "The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n";
78                 helpString += "The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n";
79                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
80                 return helpString;
81         }
82         catch(exception& e) {
83                 m->errorOut(e, "ClassifySeqsCommand", "getHelpString");
84                 exit(1);
85         }
86 }
87 //**********************************************************************************************************************
88 ClassifySeqsCommand::ClassifySeqsCommand(){     
89         try {
90                 abort = true; calledHelp = true; 
91                 setParameters();
92                 vector<string> tempOutNames;
93                 outputTypes["taxonomy"] = tempOutNames;
94                 outputTypes["taxsummary"] = tempOutNames;
95                 outputTypes["matchdist"] = tempOutNames;
96         }
97         catch(exception& e) {
98                 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
99                 exit(1);
100         }
101 }
102 //**********************************************************************************************************************
103 ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
104         try {
105                 abort = false; calledHelp = false;   
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; calledHelp = true; }
109                 
110                 else {
111                         vector<string> myArray = setParameters();
112                         
113                         OptionParser parser(option);
114                         map<string, string> parameters = parser.getParameters(); 
115                         
116                         ValidParameters validParameter("classify.seqs");
117                         map<string, string>::iterator it;
118                         
119                         //check to make sure all parameters are valid for command
120                         for (it = parameters.begin(); it != parameters.end(); it++) { 
121                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
122                         }
123                         
124                         //initialize outputTypes
125                         vector<string> tempOutNames;
126                         outputTypes["taxonomy"] = tempOutNames;
127                         outputTypes["taxsummary"] = tempOutNames;
128                         outputTypes["matchdist"] = tempOutNames;
129                         
130                         //if the user changes the output directory command factory will send this info to us in the output parameter 
131                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
132                         
133                         //if the user changes the input directory command factory will send this info to us in the output parameter 
134                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
135                         if (inputDir == "not found"){   inputDir = "";          }
136                         else {
137                                 string path;
138                                 it = parameters.find("reference");
139                                 //user has given a template file
140                                 if(it != parameters.end()){ 
141                                         path = m->hasPath(it->second);
142                                         //if the user has not given a path then, add inputdir. else leave path alone.
143                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
144                                 }
145                                 
146                                 it = parameters.find("taxonomy");
147                                 //user has given a template file
148                                 if(it != parameters.end()){ 
149                                         path = m->hasPath(it->second);
150                                         //if the user has not given a path then, add inputdir. else leave path alone.
151                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
152                                 }
153                                 
154                                 it = parameters.find("group");
155                                 //user has given a template file
156                                 if(it != parameters.end()){ 
157                                         path = m->hasPath(it->second);
158                                         //if the user has not given a path then, add inputdir. else leave path alone.
159                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
160                                 }
161                         }
162
163                         //check for required parameters
164                         templateFileName = validParameter.validFile(parameters, "reference", true);
165                         if (templateFileName == "not found") { 
166                                 m->mothurOut("reference is a required parameter for the classify.seqs command."); 
167                                 m->mothurOutEndLine();
168                                 abort = true; 
169                         }
170                         else if (templateFileName == "not open") { abort = true; }      
171                         
172                                                 
173                         fastaFileName = validParameter.validFile(parameters, "fasta", false);
174                         if (fastaFileName == "not found") {                             
175                                 //if there is a current fasta file, use it
176                                 string filename = m->getFastaFile(); 
177                                 if (filename != "") { fastaFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
178                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
179                         }
180                         else { 
181                                 m->splitAtDash(fastaFileName, fastaFileNames);
182                                 
183                                 //go through files and make sure they are good, if not, then disregard them
184                                 for (int i = 0; i < fastaFileNames.size(); i++) {
185                                         if (inputDir != "") {
186                                                 string path = m->hasPath(fastaFileNames[i]);
187                                                 //if the user has not given a path then, add inputdir. else leave path alone.
188                                                 if (path == "") {       fastaFileNames[i] = inputDir + fastaFileNames[i];               }
189                                         }
190                                         
191                                         int ableToOpen;
192                                         
193                                         ifstream in;
194                                         ableToOpen = m->openInputFile(fastaFileNames[i], in, "noerror");
195                                 
196                                         //if you can't open it, try default location
197                                         if (ableToOpen == 1) {
198                                                 if (m->getDefaultPath() != "") { //default path is set
199                                                         string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]);
200                                                         m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
201                                                         ifstream in2;
202                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
203                                                         in2.close();
204                                                         fastaFileNames[i] = tryPath;
205                                                 }
206                                         }
207                                         
208                                         if (ableToOpen == 1) {
209                                                 if (m->getOutputDir() != "") { //default path is set
210                                                         string tryPath = m->getOutputDir() + m->getSimpleName(fastaFileNames[i]);
211                                                         m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
212                                                         ifstream in2;
213                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
214                                                         in2.close();
215                                                         fastaFileNames[i] = tryPath;
216                                                 }
217                                         }
218                                         
219                                         in.close();
220                                         
221                                         if (ableToOpen == 1) { 
222                                                 m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); 
223                                                 //erase from file list
224                                                 fastaFileNames.erase(fastaFileNames.begin()+i);
225                                                 i--;
226                                         }
227                                         
228                                 }
229                                 
230                                 //make sure there is at least one valid file left
231                                 if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
232                         }
233
234                         
235                         taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
236                         if (taxonomyFileName == "not found") { 
237                                 m->mothurOut("taxonomy is a required parameter for the classify.seqs command."); 
238                                 m->mothurOutEndLine();
239                                 abort = true; 
240                         }
241                         else if (taxonomyFileName == "not open") { abort = true; }      
242                         
243                         
244                         namefile = validParameter.validFile(parameters, "name", false);
245                         if (namefile == "not found") { namefile = "";  }
246
247                         else { 
248                                 m->splitAtDash(namefile, namefileNames);
249                                 
250                                 //go through files and make sure they are good, if not, then disregard them
251                                 for (int i = 0; i < namefileNames.size(); i++) {
252                                         if (inputDir != "") {
253                                                 string path = m->hasPath(namefileNames[i]);
254                                                 //if the user has not given a path then, add inputdir. else leave path alone.
255                                                 if (path == "") {       namefileNames[i] = inputDir + namefileNames[i];         }
256                                         }
257                                         int ableToOpen;
258                                         
259                                         ifstream in;
260                                         ableToOpen = m->openInputFile(namefileNames[i], in, "noerror");
261                                 
262                                         //if you can't open it, try default location
263                                         if (ableToOpen == 1) {
264                                                 if (m->getDefaultPath() != "") { //default path is set
265                                                         string tryPath = m->getDefaultPath() + m->getSimpleName(namefileNames[i]);
266                                                         m->mothurOut("Unable to open " + namefileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
267                                                         ifstream in2;
268                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
269                                                         in2.close();
270                                                         namefileNames[i] = tryPath;
271                                                 }
272                                         }
273                                         
274                                         if (ableToOpen == 1) {
275                                                 if (m->getOutputDir() != "") { //default path is set
276                                                         string tryPath = m->getOutputDir() + m->getSimpleName(namefileNames[i]);
277                                                         m->mothurOut("Unable to open " + namefileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
278                                                         ifstream in2;
279                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
280                                                         in2.close();
281                                                         namefileNames[i] = tryPath;
282                                                 }
283                                         }
284                                         in.close();
285                                         
286                                         if (ableToOpen == 1) { 
287                                                 m->mothurOut("Unable to open " + namefileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();  abort = true;
288                                                 //erase from file list
289                                                 namefileNames.erase(namefileNames.begin()+i);
290                                                 i--;
291                                         }
292
293                                 }
294                         }
295
296                         if (namefile != "") {
297                                 if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); }
298                         }
299                         
300                         groupfile = validParameter.validFile(parameters, "group", false);
301                         if (groupfile == "not found") { groupfile = "";  }
302                         else { 
303                                 m->splitAtDash(groupfile, groupfileNames);
304                                 
305                                 //go through files and make sure they are good, if not, then disregard them
306                                 for (int i = 0; i < groupfileNames.size(); i++) {
307                                         if (inputDir != "") {
308                                                 string path = m->hasPath(groupfileNames[i]);
309                                                 //if the user has not given a path then, add inputdir. else leave path alone.
310                                                 if (path == "") {       groupfileNames[i] = inputDir + groupfileNames[i];               }
311                                         }
312                                         int ableToOpen;
313                                         
314                                         ifstream in;
315                                         ableToOpen = m->openInputFile(groupfileNames[i], in, "noerror");
316                                 
317                                         //if you can't open it, try default location
318                                         if (ableToOpen == 1) {
319                                                 if (m->getDefaultPath() != "") { //default path is set
320                                                         string tryPath = m->getDefaultPath() + m->getSimpleName(groupfileNames[i]);
321                                                         m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
322                                                         ifstream in2;
323                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
324                                                         in2.close();
325                                                         groupfileNames[i] = tryPath;
326                                                 }
327                                         }
328                                         
329                                         if (ableToOpen == 1) {
330                                                 if (m->getOutputDir() != "") { //default path is set
331                                                         string tryPath = m->getOutputDir() + m->getSimpleName(groupfileNames[i]);
332                                                         m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
333                                                         ifstream in2;
334                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
335                                                         in2.close();
336                                                         groupfileNames[i] = tryPath;
337                                                 }
338                                         }
339                                         
340                                         in.close();
341                                         
342                                         if (ableToOpen == 1) { 
343                                                 m->mothurOut("Unable to open " + groupfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); groupfileNames[i] = "";
344                                                 //erase from file list
345                                                 groupfileNames.erase(groupfileNames.begin()+i);
346                                                 i--;
347                                         }
348                                 }
349                         }
350
351                         if (groupfile != "") {
352                                 if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); }
353                         }else {
354                                 for (int i = 0; i < fastaFileNames.size(); i++) {  groupfileNames.push_back("");  }
355                         }
356                         
357                         //check for optional parameter and set defaults
358                         // ...at some point should added some additional type checking...
359                         string temp;
360                         temp = validParameter.validFile(parameters, "ksize", false);            if (temp == "not found"){       temp = "8";                             }
361                         convert(temp, kmerSize); 
362                         
363                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
364                         m->setProcessors(temp);
365                         convert(temp, processors); 
366                         
367                         search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
368                         
369                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "bayesian";    }
370                         
371                         temp = validParameter.validFile(parameters, "match", false);            if (temp == "not found"){       temp = "1.0";                   }
372                         convert(temp, match);  
373                         
374                         temp = validParameter.validFile(parameters, "mismatch", false);         if (temp == "not found"){       temp = "-1.0";                  }
375                         convert(temp, misMatch);  
376                         
377                         temp = validParameter.validFile(parameters, "gapopen", false);          if (temp == "not found"){       temp = "-2.0";                  }
378                         convert(temp, gapOpen);  
379                         
380                         temp = validParameter.validFile(parameters, "gapextend", false);        if (temp == "not found"){       temp = "-1.0";                  }
381                         convert(temp, gapExtend); 
382                         
383                         temp = validParameter.validFile(parameters, "numwanted", false);        if (temp == "not found"){       temp = "10";                    }
384                         convert(temp, numWanted);
385                         
386                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found"){       temp = "0";                             }
387                         convert(temp, cutoff);
388                         
389                         temp = validParameter.validFile(parameters, "probs", false);            if (temp == "not found"){       temp = "true";                  }
390                         probs = m->isTrue(temp);
391                         
392                         temp = validParameter.validFile(parameters, "iters", false);            if (temp == "not found") { temp = "100";                        }
393                         convert(temp, iters); 
394
395
396                         
397                         if ((method == "bayesian") && (search != "kmer"))  { 
398                                 m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine();
399                                 search = "kmer";
400                         }
401                 }
402                 
403         }
404         catch(exception& e) {
405                 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
406                 exit(1);
407         }
408 }
409
410 //**********************************************************************************************************************
411 ClassifySeqsCommand::~ClassifySeqsCommand(){    
412         if (abort == false) {
413                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
414         }
415 }
416 //**********************************************************************************************************************
417
418 int ClassifySeqsCommand::execute(){
419         try {
420                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
421                 
422                 if(method == "bayesian"){       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);           }
423                 else if(method == "knn"){       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted);                               }
424                 else {
425                         m->mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
426                         m->mothurOutEndLine();
427                         classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);   
428                 }
429                 
430                 if (m->control_pressed) { delete classify; return 0; }
431                 
432                                 
433                 for (int s = 0; s < fastaFileNames.size(); s++) {
434                 
435                         m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
436                         
437                         string RippedTaxName = m->getRootName(m->getSimpleName(taxonomyFileName));
438                         RippedTaxName = m->getExtension(RippedTaxName.substr(0, RippedTaxName.length()-1));
439                         if (RippedTaxName[0] == '.') { RippedTaxName = RippedTaxName.substr(1, RippedTaxName.length()); }
440                         RippedTaxName +=  "."; 
441                 
442                         if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); }
443                         string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "taxonomy";
444                         string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp";
445                         string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "tax.summary";
446                         
447                         if ((method == "knn") && (search == "distance")) { 
448                                 string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "match.dist";
449                                 classify->setDistName(DistName);  outputNames.push_back(DistName); outputTypes["matchdist"].push_back(DistName);
450                         }
451                         
452                         outputNames.push_back(newTaxonomyFile); outputTypes["taxonomy"].push_back(newTaxonomyFile);
453                         outputNames.push_back(taxSummary);      outputTypes["taxsummary"].push_back(taxSummary);
454                         
455                         int start = time(NULL);
456                         int numFastaSeqs = 0;
457                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
458                         
459 #ifdef USE_MPI  
460                                 int pid, numSeqsPerProcessor; 
461                                 int tag = 2001;
462                                 vector<unsigned long int> MPIPos;
463                                 
464                                 MPI_Status status; 
465                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
466                                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
467
468                                 MPI_File inMPI;
469                                 MPI_File outMPINewTax;
470                                 MPI_File outMPITempTax;
471                                                         
472                                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
473                                 int inMode=MPI_MODE_RDONLY; 
474                                 
475                                 char outNewTax[1024];
476                                 strcpy(outNewTax, newTaxonomyFile.c_str());
477                                 
478                                 char outTempTax[1024];
479                                 strcpy(outTempTax, tempTaxonomyFile.c_str());
480                                 
481                                 char inFileName[1024];
482                                 strcpy(inFileName, fastaFileNames[s].c_str());
483
484                                 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
485                                 MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax);
486                                 MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax);
487                                 
488                                 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  delete classify; return 0;  }
489                                 
490                                 if (pid == 0) { //you are the root process 
491                                         
492                                         MPIPos = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs); //fills MPIPos, returns numSeqs
493                                         
494                                         //send file positions to all processes
495                                         for(int i = 1; i < processors; i++) { 
496                                                 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
497                                                 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
498                                         }
499                                         
500                                         //figure out how many sequences you have to align
501                                         numSeqsPerProcessor = numFastaSeqs / processors;
502                                         int startIndex =  pid * numSeqsPerProcessor;
503                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
504                                         
505                                 
506                                         //align your part
507                                         driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos);
508                                         
509                                         if (m->control_pressed) {  outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } delete classify; return 0;  }
510                                         
511                                         for (int i = 1; i < processors; i++) {
512                                                 int done;
513                                                 MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
514                                         }
515                                 }else{ //you are a child process
516                                         MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
517                                         MPIPos.resize(numFastaSeqs+1);
518                                         MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
519                                         
520                                         //figure out how many sequences you have to align
521                                         numSeqsPerProcessor = numFastaSeqs / processors;
522                                         int startIndex =  pid * numSeqsPerProcessor;
523                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
524                                         
525                                         
526                                         //align your part
527                                         driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos);
528                                         
529                                         if (m->control_pressed) {  outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  delete classify; return 0;  }
530
531                                         int done = 0;
532                                         MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); 
533                                 }
534                                 
535                                 //close files 
536                                 MPI_File_close(&inMPI);
537                                 MPI_File_close(&outMPINewTax);
538                                 MPI_File_close(&outMPITempTax);
539                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
540                                 
541 #else
542                 
543                         vector<unsigned long int> positions = m->divideFile(fastaFileNames[s], processors);
544                                 
545                         for (int i = 0; i < (positions.size()-1); i++) {
546                                 lines.push_back(new linePair(positions[i], positions[(i+1)]));
547                         }       
548                         
549                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
550                         if(processors == 1){
551                                 numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
552                         }
553                         else{
554                                 processIDS.resize(0);
555                                 
556                                 numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); 
557                                 
558                         }
559         #else
560                         numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
561         #endif  
562 #endif
563
564                 m->mothurOutEndLine();
565                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
566                 start = time(NULL);
567
568
569                 #ifdef USE_MPI  
570                         if (pid == 0) {  //this part does not need to be paralellized
571                         
572                                 if(namefile != "") { m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush();  MPIReadNamesFile(namefileNames[s]);  m->mothurOut("  Done."); m->mothurOutEndLine(); }
573                 #else
574                         //read namefile
575                         if(namefile != "") {
576                         
577                             m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush();
578                                 
579                                 nameMap.clear(); //remove old names
580                                 
581                                 ifstream inNames;
582                                 m->openInputFile(namefileNames[s], inNames);
583                                 
584                                 string firstCol, secondCol;
585                                 while(!inNames.eof()) {
586                                         inNames >> firstCol >> secondCol; m->gobble(inNames);
587                                         
588                                         vector<string> temp;
589                                         m->splitAtComma(secondCol, temp);
590                         
591                                         nameMap[firstCol] = temp;  
592                                 }
593                                 inNames.close();
594                                 
595                                 m->mothurOut("  Done."); m->mothurOutEndLine();
596                         }
597                 #endif
598
599                         string group = "";
600                         if (groupfile != "") {  group = groupfileNames[s]; }
601                         
602                         PhyloSummary taxaSum(taxonomyFileName, group);
603                         
604                         if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } delete classify; return 0; }
605                 
606                         if (namefile == "") {  taxaSum.summarize(tempTaxonomyFile);  }
607                         else {
608                                 ifstream in;
609                                 m->openInputFile(tempTaxonomyFile, in);
610                                 
611                                 //read in users taxonomy file and add sequences to tree
612                                 string name, taxon;
613                                 
614                                 while(!in.eof()){
615                                         in >> name >> taxon; m->gobble(in);
616                                         
617                                         itNames = nameMap.find(name);
618                 
619                                         if (itNames == nameMap.end()) { 
620                                                 m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
621                                         }else{
622                                                 for (int i = 0; i < itNames->second.size(); i++) { 
623                                                         taxaSum.addSeqToTree(itNames->second[i], taxon);  //add it as many times as there are identical seqs
624                                                 }
625                                                 itNames->second.clear();
626                                                 nameMap.erase(itNames->first);
627                                         }
628                                 }
629                                 in.close();
630                         }
631                         remove(tempTaxonomyFile.c_str());
632                         
633                         if (m->control_pressed) {  outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } delete classify; return 0; }
634                         
635                         //print summary file
636                         ofstream outTaxTree;
637                         m->openOutputFile(taxSummary, outTaxTree);
638                         taxaSum.print(outTaxTree);
639                         outTaxTree.close();
640                         
641                         //output taxonomy with the unclassified bins added
642                         ifstream inTax;
643                         m->openInputFile(newTaxonomyFile, inTax);
644                         
645                         ofstream outTax;
646                         string unclass = newTaxonomyFile + ".unclass.temp";
647                         m->openOutputFile(unclass, outTax);
648                         
649                         //get maxLevel from phylotree so you know how many 'unclassified's to add
650                         int maxLevel = taxaSum.getMaxLevel();
651                                                         
652                         //read taxfile - this reading and rewriting is done to preserve the confidence scores.
653                         string name, taxon;
654                         while (!inTax.eof()) {
655                                 if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; }
656
657                                 inTax >> name >> taxon; m->gobble(inTax);
658                                 
659                                 string newTax = addUnclassifieds(taxon, maxLevel);
660                                 
661                                 outTax << name << '\t' << newTax << endl;
662                         }
663                         inTax.close();  
664                         outTax.close();
665                         
666                         remove(newTaxonomyFile.c_str());
667                         rename(unclass.c_str(), newTaxonomyFile.c_str());
668                         
669                         m->mothurOutEndLine();
670                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
671                         
672                         #ifdef USE_MPI  
673                                 }
674                         #endif
675
676                         m->mothurOutEndLine();
677                         m->mothurOut("Output File Names: "); m->mothurOutEndLine();
678                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
679                         m->mothurOutEndLine();
680                 }
681                 
682                 //set taxonomy file as new current taxonomyfile
683                 string current = "";
684                 itTypes = outputTypes.find("taxonomy");
685                 if (itTypes != outputTypes.end()) {
686                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
687                 }
688                 
689                 delete classify;
690                 return 0;
691         }
692         catch(exception& e) {
693                 m->errorOut(e, "ClassifySeqsCommand", "execute");
694                 exit(1);
695         }
696 }
697
698 /**************************************************************************************************/
699 string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) {
700         try{
701                 string newTax, taxon;
702                 int level = 0;
703                 
704                 //keep what you have counting the levels
705                 while (tax.find_first_of(';') != -1) {
706                         //get taxon
707                         taxon = tax.substr(0,tax.find_first_of(';'))+';';
708                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
709                         newTax += taxon;
710                         level++;
711                 }
712                 
713                 //add "unclassified" until you reach maxLevel
714                 while (level < maxlevel) {
715                         newTax += "unclassified;";
716                         level++;
717                 }
718                 
719                 return newTax;
720         }
721         catch(exception& e) {
722                 m->errorOut(e, "ClassifySeqsCommand", "addUnclassifieds");
723                 exit(1);
724         }
725 }
726
727 /**************************************************************************************************/
728
729 int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) {
730         try {
731 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
732                 int process = 1;
733                 int num = 0;
734                 
735                 //loop through and create all the processes you want
736                 while (process != processors) {
737                         int pid = fork();
738                         
739                         if (pid > 0) {
740                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
741                                 process++;
742                         }else if (pid == 0){
743                                 num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename);
744                                 
745                                 //pass numSeqs to parent
746                                 ofstream out;
747                                 string tempFile = filename + toString(getpid()) + ".num.temp";
748                                 m->openOutputFile(tempFile, out);
749                                 out << num << endl;
750                                 out.close();
751
752                                 exit(0);
753                         }else { 
754                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
755                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
756                                 exit(0);
757                         }
758                 }
759                 
760                 //parent does its part
761                 num = driver(lines[0], taxFileName, tempTaxFile, filename);
762                 
763                 //force parent to wait until all the processes are done
764                 for (int i=0;i<processIDS.size();i++) { 
765                         int temp = processIDS[i];
766                         wait(&temp);
767                 }
768                 
769                 for (int i = 0; i < processIDS.size(); i++) {
770                         ifstream in;
771                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
772                         m->openInputFile(tempFile, in);
773                         if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
774                         in.close(); remove(tempFile.c_str());
775                 }
776                 
777                 for(int i=0;i<processIDS.size();i++){
778                         appendTaxFiles((taxFileName + toString(processIDS[i]) + ".temp"), taxFileName);
779                         appendTaxFiles((tempTaxFile + toString(processIDS[i]) + ".temp"), tempTaxFile);
780                         remove((taxFileName + toString(processIDS[i]) + ".temp").c_str());
781                         remove((tempTaxFile + toString(processIDS[i]) + ".temp").c_str());
782                 }
783                 
784                 return num;
785 #endif          
786         }
787         catch(exception& e) {
788                 m->errorOut(e, "ClassifySeqsCommand", "createProcesses");
789                 exit(1);
790         }
791 }
792 /**************************************************************************************************/
793
794 void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
795         try{
796                 
797                 ofstream output;
798                 ifstream input;
799                 m->openOutputFileAppend(filename, output);
800                 m->openInputFile(temp, input);
801                 
802                 while(char c = input.get()){
803                         if(input.eof())         {       break;                  }
804                         else                            {       output << c;    }
805                 }
806                 
807                 input.close();
808                 output.close();
809         }
810         catch(exception& e) {
811                 m->errorOut(e, "ClassifySeqsCommand", "appendTaxFiles");
812                 exit(1);
813         }
814 }
815
816 //**********************************************************************************************************************
817
818 int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string filename){
819         try {
820                 ofstream outTax;
821                 m->openOutputFile(taxFName, outTax);
822                 
823                 ofstream outTaxSimple;
824                 m->openOutputFile(tempTFName, outTaxSimple);
825         
826                 ifstream inFASTA;
827                 m->openInputFile(filename, inFASTA);
828                 
829                 string taxonomy;
830
831                 inFASTA.seekg(filePos->start);
832
833                 bool done = false;
834                 int count = 0;
835                 
836                 while (!done) {
837                         if (m->control_pressed) { return 0; }
838                 
839                         Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA);
840                         
841                         if (candidateSeq->getName() != "") {
842                         
843                                 taxonomy = classify->getTaxonomy(candidateSeq);
844                                 
845                                 if (m->control_pressed) { delete candidateSeq; return 0; }
846
847                                 if (taxonomy != "bad seq") {
848                                         //output confidence scores or not
849                                         if (probs) {
850                                                 outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
851                                         }else{
852                                                 outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
853                                         }
854                                         
855                                         outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
856                                 }
857                                 count++;
858                         }
859                         delete candidateSeq;
860                         
861                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
862                                 unsigned long int pos = inFASTA.tellg();
863                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
864                         #else
865                                 if (inFASTA.eof()) { break; }
866                         #endif
867                         
868                         //report progress
869                         if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
870                         
871                 }
872                 //report progress
873                 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
874                         
875                 inFASTA.close();
876                 outTax.close();
877                 outTaxSimple.close();
878                 
879                 return count;
880         }
881         catch(exception& e) {
882                 m->errorOut(e, "ClassifySeqsCommand", "driver");
883                 exit(1);
884         }
885 }
886 //**********************************************************************************************************************
887 #ifdef USE_MPI
888 int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector<unsigned long int>& MPIPos){
889         try {
890                 MPI_Status statusNew; 
891                 MPI_Status statusTemp; 
892                 MPI_Status status; 
893                 
894                 int pid;
895                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
896         
897                 string taxonomy;
898                 string outputString;
899
900                 for(int i=0;i<num;i++){
901                 
902                         if (m->control_pressed) { return 0; }
903                 
904                         //read next sequence
905                         int length = MPIPos[start+i+1] - MPIPos[start+i];
906                         char* buf4 = new char[length];
907                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
908                         
909                         string tempBuf = buf4;
910                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
911                         istringstream iss (tempBuf,istringstream::in);
912                         delete buf4;
913
914                         Sequence* candidateSeq = new Sequence(iss);
915                         
916                         if (candidateSeq->getName() != "") {
917                                 taxonomy = classify->getTaxonomy(candidateSeq);
918                                 
919                                 if (taxonomy != "bad seq") {
920                                         //output confidence scores or not
921                                         if (probs) {
922                                                 outputString =  candidateSeq->getName() + "\t" + taxonomy + "\n";
923                                         }else{
924                                                 outputString =  candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
925                                         }
926                                         
927                                         int length = outputString.length();
928                                         char* buf2 = new char[length];
929                                         memcpy(buf2, outputString.c_str(), length);
930                                 
931                                         MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew);
932                                         delete buf2;
933
934                                         outputString =  candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
935                                         length = outputString.length();
936                                         char* buf = new char[length];
937                                         memcpy(buf, outputString.c_str(), length);
938                                 
939                                         MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp);
940                                         delete buf;
941                                 }
942                         }                               
943                         delete candidateSeq;
944                         
945                         if((i+1) % 100 == 0){   cout << "Classifying sequence " << (i+1) << endl;       }
946                 }
947                 
948                 if(num % 100 != 0){     cout << "Classifying sequence " << (num) << endl;       }
949                 
950                 
951                 return 1;
952         }
953         catch(exception& e) {
954                 m->errorOut(e, "ClassifySeqsCommand", "driverMPI");
955                 exit(1);
956         }
957 }
958
959 //**********************************************************************************************************************
960 int ClassifySeqsCommand::MPIReadNamesFile(string nameFilename){
961         try {
962         
963                 nameMap.clear(); //remove old names
964                 
965                 MPI_File inMPI;
966                 MPI_Offset size;
967                 MPI_Status status;
968
969                 //char* inFileName = new char[nameFilename.length()];
970                 //memcpy(inFileName, nameFilename.c_str(), nameFilename.length());
971                 
972                 char inFileName[1024];
973                 strcpy(inFileName, nameFilename.c_str());
974
975                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
976                 MPI_File_get_size(inMPI, &size);
977                 //delete inFileName;
978
979                 char* buffer = new char[size];
980                 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
981
982                 string tempBuf = buffer;
983                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
984                 istringstream iss (tempBuf,istringstream::in);
985                 delete buffer;
986                 
987                 string firstCol, secondCol;
988                 while(!iss.eof()) {
989                         iss >> firstCol >> secondCol; m->gobble(iss);
990                         
991                         vector<string> temp;
992                         m->splitAtComma(secondCol, temp);
993                         
994                         nameMap[firstCol] = temp;  
995                 }
996         
997                 MPI_File_close(&inMPI);
998                 
999                 return 1;
1000         }
1001         catch(exception& e) {
1002                 m->errorOut(e, "ClassifySeqsCommand", "MPIReadNamesFile");
1003                 exit(1);
1004         }
1005 }
1006 #endif
1007 /**************************************************************************************************/