]> git.donarmstrong.com Git - mothur.git/blob - classifyseqscommand.cpp
fixed classify.seqs output file name - had issue if reference taxonomy file did not...
[mothur.git] / classifyseqscommand.cpp
1 /*
2  *  classifyseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/2/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyseqscommand.h"
11
12
13
14 //**********************************************************************************************************************
15 vector<string> ClassifySeqsCommand::setParameters(){    
16         try {
17                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy);
18                 CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate);
19                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
20                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
21                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
22                 CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance", "kmer", "", "", "",false,false); parameters.push_back(psearch);
23                 CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize);
24                 CommandParameter pmethod("method", "Multiple", "bayesian-knn", "bayesian", "", "", "",false,false); parameters.push_back(pmethod);
25                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
26                 CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch);
27                 CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch);
28                 CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen);
29                 CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend);
30                 //CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
31                 CommandParameter pcutoff("cutoff", "Number", "", "0", "", "", "",false,true); parameters.push_back(pcutoff);
32                 CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs);
33                 CommandParameter piters("iters", "Number", "", "100", "", "", "",false,true); parameters.push_back(piters);
34                 CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave);
35                 CommandParameter pnumwanted("numwanted", "Number", "", "10", "", "", "",false,true); parameters.push_back(pnumwanted);
36                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
37                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
38                 
39                 vector<string> myArray;
40                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
41                 return myArray;
42         }
43         catch(exception& e) {
44                 m->errorOut(e, "ClassifySeqsCommand", "setParameters");
45                 exit(1);
46         }
47 }
48 //**********************************************************************************************************************
49 string ClassifySeqsCommand::getHelpString(){    
50         try {
51                 string helpString = "";
52                 helpString += "The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n";
53                 helpString += "The classify.seqs command parameters are reference, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n";
54                 helpString += "The reference, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n";
55                 helpString += "The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer, blast and distance. The default is kmer.\n";
56                 helpString += "The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n";
57                 helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n";
58                 helpString += "The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n";
59                 helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n";
60                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
61 #ifdef USE_MPI
62                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
63 #endif
64                 helpString += "If the save parameter is set to true the reference sequences will be saved in memory, to clear them later you can use the clear.memory command. Default=f.";
65                 helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n";
66                 helpString += "The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n";
67                 helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
68                 helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n";
69                 helpString += "The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n";
70                 helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n";
71                 helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n";
72                 helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method.  The default is 100.\n";
73                 //helpString += "The flip parameter allows you shut off mothur's   The default is T.\n";
74                 helpString += "The classify.seqs command should be in the following format: \n";
75                 helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n";
76                 helpString += "Example classify.seqs(fasta=amazon.fasta, reference=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n";
77                 helpString += "The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n";
78                 helpString += "The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n";
79                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
80                 return helpString;
81         }
82         catch(exception& e) {
83                 m->errorOut(e, "ClassifySeqsCommand", "getHelpString");
84                 exit(1);
85         }
86 }
87 //**********************************************************************************************************************
88 ClassifySeqsCommand::ClassifySeqsCommand(){     
89         try {
90                 abort = true; calledHelp = true; 
91                 setParameters();
92                 vector<string> tempOutNames;
93                 outputTypes["taxonomy"] = tempOutNames;
94                 outputTypes["accnos"] = tempOutNames;
95                 outputTypes["taxsummary"] = tempOutNames;
96                 outputTypes["matchdist"] = tempOutNames;
97         }
98         catch(exception& e) {
99                 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
100                 exit(1);
101         }
102 }
103 //**********************************************************************************************************************
104 ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
105         try {
106                 abort = false; calledHelp = false;   
107                 rdb = ReferenceDB::getInstance();
108                 
109                 //allow user to run help
110                 if(option == "help") { help(); abort = true; calledHelp = true; }
111                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
112                 
113                 else {
114                         vector<string> myArray = setParameters();
115                         
116                         OptionParser parser(option);
117                         map<string, string> parameters = parser.getParameters(); 
118                         
119                         ValidParameters validParameter("classify.seqs");
120                         map<string, string>::iterator it;
121                         
122                         //check to make sure all parameters are valid for command
123                         for (it = parameters.begin(); it != parameters.end(); it++) { 
124                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
125                         }
126                         
127                         //initialize outputTypes
128                         vector<string> tempOutNames;
129                         outputTypes["taxonomy"] = tempOutNames;
130                         outputTypes["taxsummary"] = tempOutNames;
131                         outputTypes["matchdist"] = tempOutNames;
132                         outputTypes["accnos"] = tempOutNames;
133                         
134                         //if the user changes the output directory command factory will send this info to us in the output parameter 
135                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
136                         
137                         //if the user changes the input directory command factory will send this info to us in the output parameter 
138                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
139                         if (inputDir == "not found"){   inputDir = "";          }
140                         else {
141                                 string path;
142                                 it = parameters.find("reference");
143                                 //user has given a template file
144                                 if(it != parameters.end()){ 
145                                         path = m->hasPath(it->second);
146                                         //if the user has not given a path then, add inputdir. else leave path alone.
147                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
148                                 }
149                                 
150                                 it = parameters.find("taxonomy");
151                                 //user has given a template file
152                                 if(it != parameters.end()){ 
153                                         path = m->hasPath(it->second);
154                                         //if the user has not given a path then, add inputdir. else leave path alone.
155                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
156                                 }
157                                 
158                                 it = parameters.find("group");
159                                 //user has given a template file
160                                 if(it != parameters.end()){ 
161                                         path = m->hasPath(it->second);
162                                         //if the user has not given a path then, add inputdir. else leave path alone.
163                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
164                                 }
165                         }
166
167                         fastaFileName = validParameter.validFile(parameters, "fasta", false);
168                         if (fastaFileName == "not found") {                             
169                                 //if there is a current fasta file, use it
170                                 string filename = m->getFastaFile(); 
171                                 if (filename != "") { fastaFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
172                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
173                         }
174                         else { 
175                                 m->splitAtDash(fastaFileName, fastaFileNames);
176                                 
177                                 //go through files and make sure they are good, if not, then disregard them
178                                 for (int i = 0; i < fastaFileNames.size(); i++) {
179                                         
180                                         bool ignore = false;
181                                         if (fastaFileNames[i] == "current") { 
182                                                 fastaFileNames[i] = m->getFastaFile(); 
183                                                 if (fastaFileNames[i] != "") {  m->mothurOut("Using " + fastaFileNames[i] + " as input file for the fasta parameter where you had given current."); m->mothurOutEndLine(); }
184                                                 else {  
185                                                         m->mothurOut("You have no current fastafile, ignoring current."); m->mothurOutEndLine(); ignore=true; 
186                                                         //erase from file list
187                                                         fastaFileNames.erase(fastaFileNames.begin()+i);
188                                                         i--;
189                                                 }
190                                         }
191                                         
192                                         if (!ignore) {
193                                                 
194                                                 if (inputDir != "") {
195                                                         string path = m->hasPath(fastaFileNames[i]);
196                                                         //if the user has not given a path then, add inputdir. else leave path alone.
197                                                         if (path == "") {       fastaFileNames[i] = inputDir + fastaFileNames[i];               }
198                                                 }
199                                                 
200                                                 int ableToOpen;
201                                                 
202                                                 ifstream in;
203                                                 ableToOpen = m->openInputFile(fastaFileNames[i], in, "noerror");
204                                         
205                                                 //if you can't open it, try default location
206                                                 if (ableToOpen == 1) {
207                                                         if (m->getDefaultPath() != "") { //default path is set
208                                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]);
209                                                                 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
210                                                                 ifstream in2;
211                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
212                                                                 in2.close();
213                                                                 fastaFileNames[i] = tryPath;
214                                                         }
215                                                 }
216                                                 
217                                                 if (ableToOpen == 1) {
218                                                         if (m->getOutputDir() != "") { //default path is set
219                                                                 string tryPath = m->getOutputDir() + m->getSimpleName(fastaFileNames[i]);
220                                                                 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
221                                                                 ifstream in2;
222                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
223                                                                 in2.close();
224                                                                 fastaFileNames[i] = tryPath;
225                                                         }
226                                                 }
227                                                 
228                                                 in.close();
229                                                 
230                                                 if (ableToOpen == 1) { 
231                                                         m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); 
232                                                         //erase from file list
233                                                         fastaFileNames.erase(fastaFileNames.begin()+i);
234                                                         i--;
235                                                 }else {
236                                                         m->setFastaFile(fastaFileNames[i]);
237                                                 }
238                                         }
239                                         
240                                 }
241                                 
242                                 //make sure there is at least one valid file left
243                                 if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
244                         }
245
246                         namefile = validParameter.validFile(parameters, "name", false);
247                         if (namefile == "not found") { namefile = "";  }
248
249                         else { 
250                                 m->splitAtDash(namefile, namefileNames);
251                                 
252                                 //go through files and make sure they are good, if not, then disregard them
253                                 for (int i = 0; i < namefileNames.size(); i++) {
254                                         bool ignore = false;
255                                         if (namefileNames[i] == "current") { 
256                                                 namefileNames[i] = m->getNameFile(); 
257                                                 if (namefileNames[i] != "") {  m->mothurOut("Using " + namefileNames[i] + " as input file for the name parameter where you had given current."); m->mothurOutEndLine(); }
258                                                 else {  
259                                                         m->mothurOut("You have no current namefile, ignoring current."); m->mothurOutEndLine(); ignore=true; 
260                                                         //erase from file list
261                                                         namefileNames.erase(namefileNames.begin()+i);
262                                                         i--;
263                                                 }
264                                         }
265                                         
266                                         if (!ignore) {
267                                                 
268                                                 if (inputDir != "") {
269                                                         string path = m->hasPath(namefileNames[i]);
270                                                         //if the user has not given a path then, add inputdir. else leave path alone.
271                                                         if (path == "") {       namefileNames[i] = inputDir + namefileNames[i];         }
272                                                 }
273                                                 int ableToOpen;
274                                                 
275                                                 ifstream in;
276                                                 ableToOpen = m->openInputFile(namefileNames[i], in, "noerror");
277                                         
278                                                 //if you can't open it, try default location
279                                                 if (ableToOpen == 1) {
280                                                         if (m->getDefaultPath() != "") { //default path is set
281                                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(namefileNames[i]);
282                                                                 m->mothurOut("Unable to open " + namefileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
283                                                                 ifstream in2;
284                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
285                                                                 in2.close();
286                                                                 namefileNames[i] = tryPath;
287                                                         }
288                                                 }
289                                                 
290                                                 if (ableToOpen == 1) {
291                                                         if (m->getOutputDir() != "") { //default path is set
292                                                                 string tryPath = m->getOutputDir() + m->getSimpleName(namefileNames[i]);
293                                                                 m->mothurOut("Unable to open " + namefileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
294                                                                 ifstream in2;
295                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
296                                                                 in2.close();
297                                                                 namefileNames[i] = tryPath;
298                                                         }
299                                                 }
300                                                 in.close();
301                                                 
302                                                 if (ableToOpen == 1) { 
303                                                         m->mothurOut("Unable to open " + namefileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();  abort = true;
304                                                         //erase from file list
305                                                         namefileNames.erase(namefileNames.begin()+i);
306                                                         i--;
307                                                 }else {
308                                                         m->setNameFile(namefileNames[i]);
309                                                 }
310                                         }
311                                 }
312                         }
313
314                         if (namefile != "") {
315                                 if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); }
316                         }
317                         
318                         groupfile = validParameter.validFile(parameters, "group", false);
319                         if (groupfile == "not found") { groupfile = "";  }
320                         else { 
321                                 m->splitAtDash(groupfile, groupfileNames);
322                                 
323                                 //go through files and make sure they are good, if not, then disregard them
324                                 for (int i = 0; i < groupfileNames.size(); i++) {
325                                         if (inputDir != "") {
326                                                 string path = m->hasPath(groupfileNames[i]);
327                                                 //if the user has not given a path then, add inputdir. else leave path alone.
328                                                 if (path == "") {       groupfileNames[i] = inputDir + groupfileNames[i];               }
329                                         }
330                                         int ableToOpen;
331                                         
332                                         ifstream in;
333                                         ableToOpen = m->openInputFile(groupfileNames[i], in, "noerror");
334                                 
335                                         //if you can't open it, try default location
336                                         if (ableToOpen == 1) {
337                                                 if (m->getDefaultPath() != "") { //default path is set
338                                                         string tryPath = m->getDefaultPath() + m->getSimpleName(groupfileNames[i]);
339                                                         m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
340                                                         ifstream in2;
341                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
342                                                         in2.close();
343                                                         groupfileNames[i] = tryPath;
344                                                 }
345                                         }
346                                         
347                                         if (ableToOpen == 1) {
348                                                 if (m->getOutputDir() != "") { //default path is set
349                                                         string tryPath = m->getOutputDir() + m->getSimpleName(groupfileNames[i]);
350                                                         m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
351                                                         ifstream in2;
352                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
353                                                         in2.close();
354                                                         groupfileNames[i] = tryPath;
355                                                 }
356                                         }
357                                         
358                                         in.close();
359                                         
360                                         if (ableToOpen == 1) { 
361                                                 m->mothurOut("Unable to open " + groupfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); groupfileNames[i] = "";
362                                                 //erase from file list
363                                                 groupfileNames.erase(groupfileNames.begin()+i);
364                                                 i--;
365                                         }else {
366                                                 m->setGroupFile(groupfileNames[i]);
367                                         }
368                                 }
369                         }
370
371                         if (groupfile != "") {
372                                 if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); }
373                         }else {
374                                 for (int i = 0; i < fastaFileNames.size(); i++) {  groupfileNames.push_back("");  }
375                         }
376                         
377                         //check for optional parameter and set defaults
378                         // ...at some point should added some additional type checking...
379                         string temp;
380                         temp = validParameter.validFile(parameters, "ksize", false);            if (temp == "not found"){       temp = "8";                             }
381                         m->mothurConvert(temp, kmerSize); 
382                         
383                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
384                         m->setProcessors(temp);
385                         m->mothurConvert(temp, processors); 
386                         
387                         temp = validParameter.validFile(parameters, "save", false);                     if (temp == "not found"){       temp = "f";                             }
388                         save = m->isTrue(temp); 
389                         rdb->save = save; 
390                         if (save) { //clear out old references
391                                 rdb->clearMemory();     
392                         }
393                         
394                         //this has to go after save so that if the user sets save=t and provides no reference we abort
395                         templateFileName = validParameter.validFile(parameters, "reference", true);
396                         if (templateFileName == "not found") { 
397                                 //check for saved reference sequences
398                                 if (rdb->referenceSeqs.size() != 0) {
399                                         templateFileName = "saved";
400                                 }else {
401                                         m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required for the classify.seqs command."); 
402                                         m->mothurOutEndLine();
403                                         abort = true; 
404                                 }
405                         }else if (templateFileName == "not open") { abort = true; }     
406                         else {  if (save) {     rdb->setSavedReference(templateFileName);       }       }
407                         
408                         //this has to go after save so that if the user sets save=t and provides no reference we abort
409                         taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
410                         if (taxonomyFileName == "not found") { 
411                                 //check for saved reference sequences
412                                 if (rdb->wordGenusProb.size() != 0) {
413                                         taxonomyFileName = "saved";
414                                 }else {
415                                         m->mothurOut("[ERROR]: You don't have any saved taxonomy information and the taxonomy parameter is a required for the classify.seqs command."); 
416                                         m->mothurOutEndLine();
417                                         abort = true; 
418                                 }
419                         }else if (taxonomyFileName == "not open") { abort = true; }     
420                         else {  if (save) {     rdb->setSavedTaxonomy(taxonomyFileName);        }       }
421                         
422                         search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
423                         
424                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "bayesian";    }
425                         
426                         temp = validParameter.validFile(parameters, "match", false);            if (temp == "not found"){       temp = "1.0";                   }
427                         m->mothurConvert(temp, match);  
428                         
429                         temp = validParameter.validFile(parameters, "mismatch", false);         if (temp == "not found"){       temp = "-1.0";                  }
430                         m->mothurConvert(temp, misMatch);  
431                         
432                         temp = validParameter.validFile(parameters, "gapopen", false);          if (temp == "not found"){       temp = "-2.0";                  }
433                         m->mothurConvert(temp, gapOpen);  
434                         
435                         temp = validParameter.validFile(parameters, "gapextend", false);        if (temp == "not found"){       temp = "-1.0";                  }
436                         m->mothurConvert(temp, gapExtend); 
437                         
438                         temp = validParameter.validFile(parameters, "numwanted", false);        if (temp == "not found"){       temp = "10";                    }
439                         m->mothurConvert(temp, numWanted);
440                         
441                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found"){       temp = "0";                             }
442                         m->mothurConvert(temp, cutoff);
443                         
444                         temp = validParameter.validFile(parameters, "probs", false);            if (temp == "not found"){       temp = "true";                  }
445                         probs = m->isTrue(temp);
446                         
447                         //temp = validParameter.validFile(parameters, "flip", false);                   if (temp == "not found"){       temp = "T";                             }
448                         //flip = m->isTrue(temp); 
449                         flip = true;
450                         
451                         temp = validParameter.validFile(parameters, "iters", false);            if (temp == "not found") { temp = "100";                        }
452                         m->mothurConvert(temp, iters); 
453
454                         
455                         if ((method == "bayesian") && (search != "kmer"))  { 
456                                 m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine();
457                                 search = "kmer";
458                         }
459                         
460             if (!abort) {
461                 if (namefileNames.size() == 0){
462                     if (fastaFileNames.size() != 0) {
463                         vector<string> files; files.push_back(fastaFileNames[fastaFileNames.size()-1]); 
464                         parser.getNameFile(files);
465                     }
466                 }
467             }
468         }
469         }
470         catch(exception& e) {
471                 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
472                 exit(1);
473         }
474 }
475
476 //**********************************************************************************************************************
477 ClassifySeqsCommand::~ClassifySeqsCommand(){    
478         if (abort == false) {
479                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
480         }
481 }
482 //**********************************************************************************************************************
483
484 int ClassifySeqsCommand::execute(){
485         try {
486                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
487         
488                 if(method == "bayesian"){       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip);             }
489                 else if(method == "knn"){       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand());                               }
490                 else {
491                         m->mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
492                         m->mothurOutEndLine();
493                         classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip);     
494                 }
495                 
496                 if (m->control_pressed) { delete classify; return 0; }
497                                 
498                 for (int s = 0; s < fastaFileNames.size(); s++) {
499                 
500                         m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
501                         
502                         string baseTName = taxonomyFileName;
503                         if (taxonomyFileName == "saved") {baseTName = rdb->getSavedTaxonomy();  }
504                         
505             //set rippedTaxName to 
506                         string RippedTaxName = "";
507             bool foundDot = false;
508             for (int i = baseTName.length()-1; i >= 0; i--) {
509                 cout << baseTName[i] << endl;
510                 if (foundDot && (baseTName[i] != '.')) {  RippedTaxName = baseTName[i] + RippedTaxName; }
511                 else if (foundDot && (baseTName[i] == '.')) {  break; }
512                 else if (!foundDot && (baseTName[i] == '.')) {  foundDot = true; }
513             }
514             if (RippedTaxName != "") { RippedTaxName +=  "."; }   
515           
516                         if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); }
517                         string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "taxonomy";
518                         string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "flip.accnos";
519                         string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp";
520                         string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "tax.summary";
521                         
522                         if ((method == "knn") && (search == "distance")) { 
523                                 string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "match.dist";
524                                 classify->setDistName(DistName);  outputNames.push_back(DistName); outputTypes["matchdist"].push_back(DistName);
525                         }
526                         
527                         outputNames.push_back(newTaxonomyFile); outputTypes["taxonomy"].push_back(newTaxonomyFile);
528                         outputNames.push_back(newaccnosFile); outputTypes["accnos"].push_back(newaccnosFile);
529                         outputNames.push_back(taxSummary);      outputTypes["taxsummary"].push_back(taxSummary);
530                         
531                         int start = time(NULL);
532                         int numFastaSeqs = 0;
533                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
534                         
535 #ifdef USE_MPI  
536                                 int pid, numSeqsPerProcessor; 
537                                 int tag = 2001;
538                                 vector<unsigned long long> MPIPos;
539                                 
540                                 MPI_Status status; 
541                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
542                                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
543
544                                 MPI_File inMPI;
545                                 MPI_File outMPINewTax;
546                                 MPI_File outMPITempTax;
547                                 MPI_File outMPIAcc;
548                                                         
549                                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
550                                 int inMode=MPI_MODE_RDONLY; 
551                                 
552                                 char outNewTax[1024];
553                                 strcpy(outNewTax, newTaxonomyFile.c_str());
554                                 
555                                 char outTempTax[1024];
556                                 strcpy(outTempTax, tempTaxonomyFile.c_str());
557                         
558                                 char outAcc[1024];
559                                 strcpy(outAcc, newaccnosFile.c_str());
560                                 
561                                 char inFileName[1024];
562                                 strcpy(inFileName, fastaFileNames[s].c_str());
563
564                                 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
565                                 MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax);
566                                 MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax);
567                                 MPI_File_open(MPI_COMM_WORLD, outAcc, outMode, MPI_INFO_NULL, &outMPIAcc);
568                                 
569                                 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);  MPI_File_close(&outMPIAcc);   MPI_File_close(&outMPITempTax);  delete classify;  return 0;  }
570                                 
571                                 if (pid == 0) { //you are the root process 
572                                         
573                                         MPIPos = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs); //fills MPIPos, returns numSeqs
574                                         
575                                         //send file positions to all processes
576                                         for(int i = 1; i < processors; i++) { 
577                                                 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
578                                                 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
579                                         }
580                                         
581                                         //figure out how many sequences you have to align
582                                         numSeqsPerProcessor = numFastaSeqs / processors;
583                                         int startIndex =  pid * numSeqsPerProcessor;
584                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
585                                         
586                                 
587                                         //align your part
588                                         driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, outMPIAcc, MPIPos);
589                                         
590                                         if (m->control_pressed) {  outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax); MPI_File_close(&outMPIAcc);   MPI_File_close(&outMPITempTax);  for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);        } delete classify; return 0;  }
591                                         
592                                         for (int i = 1; i < processors; i++) {
593                                                 int done;
594                                                 MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
595                                         }
596                                 }else{ //you are a child process
597                                         MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
598                                         MPIPos.resize(numFastaSeqs+1);
599                                         MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
600                                         
601                                         //figure out how many sequences you have to align
602                                         numSeqsPerProcessor = numFastaSeqs / processors;
603                                         int startIndex =  pid * numSeqsPerProcessor;
604                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
605                                         
606                                         
607                                         //align your part
608                                         driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, outMPIAcc, MPIPos);
609                                         
610                                         if (m->control_pressed) {  outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);  MPI_File_close(&outMPIAcc);  MPI_File_close(&outMPITempTax);  delete classify; return 0;  }
611
612                                         int done = 0;
613                                         MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); 
614                                 }
615                                 
616                                 //close files 
617                                 MPI_File_close(&inMPI);
618                                 MPI_File_close(&outMPINewTax);
619                                 MPI_File_close(&outMPITempTax);
620                                 MPI_File_close(&outMPIAcc); 
621                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
622                                 
623 #else
624                 
625                         vector<unsigned long long> positions; 
626 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
627                         positions = m->divideFile(fastaFileNames[s], processors);
628                         for (int i = 0; i < (positions.size()-1); i++) {        lines.push_back(new linePair(positions[i], positions[(i+1)]));  }
629 #else
630                         if (processors == 1) {
631                                 lines.push_back(new linePair(0, 1000));
632                         }else {
633                                 positions = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs); 
634                 if (positions.size() < processors) { processors = positions.size(); }
635                                 
636                                 //figure out how many sequences you have to process
637                                 int numSeqsPerProcessor = numFastaSeqs / processors;
638                                 for (int i = 0; i < processors; i++) {
639                                         int startIndex =  i * numSeqsPerProcessor;
640                                         if(i == (processors - 1)){      numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
641                                         lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor));
642                                 }
643                         }
644 #endif
645                         if(processors == 1){
646                                 numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, newaccnosFile, fastaFileNames[s]);
647                         }else{
648                                 numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, newaccnosFile, fastaFileNames[s]); 
649                         }
650 #endif
651                         
652                         if (!m->isBlank(newaccnosFile)) { m->mothurOutEndLine(); m->mothurOut("[WARNING]: mothur suspects some of your sequences may be reversed, please check " + newaccnosFile + " for the list of the sequences."); m->mothurOutEndLine(); }
653
654                 m->mothurOutEndLine();
655                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
656                 start = time(NULL);
657                 
658                 
659
660                 #ifdef USE_MPI  
661                         if (pid == 0) {  //this part does not need to be paralellized
662                         
663                                 if(namefile != "") { m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush();  MPIReadNamesFile(namefileNames[s]);  m->mothurOut("  Done."); m->mothurOutEndLine(); }
664                 #else
665                         //read namefile
666                         if(namefile != "") {
667                         
668                             m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush();
669                                 
670                                 nameMap.clear(); //remove old names
671                                 
672                                 ifstream inNames;
673                                 m->openInputFile(namefileNames[s], inNames);
674                                 
675                                 string firstCol, secondCol;
676                                 while(!inNames.eof()) {
677                                         inNames >> firstCol >> secondCol; m->gobble(inNames);
678                                         
679                                         vector<string> temp;
680                                         m->splitAtComma(secondCol, temp);
681                         
682                                         nameMap[firstCol] = temp;  
683                                 }
684                                 inNames.close();
685                                 
686                                 m->mothurOut("  Done."); m->mothurOutEndLine();
687                         }
688                 #endif
689
690                         string group = "";
691                         if (groupfile != "") {  group = groupfileNames[s]; }
692                         
693                         PhyloSummary taxaSum(baseTName, group);
694                         
695                         if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);        } delete classify; return 0; }
696                 
697                         if (namefile == "") {  taxaSum.summarize(tempTaxonomyFile);  }
698                         else {
699                                 ifstream in;
700                                 m->openInputFile(tempTaxonomyFile, in);
701                                 
702                                 //read in users taxonomy file and add sequences to tree
703                                 string name, taxon;
704                                 
705                                 while(!in.eof()){
706                                         in >> name >> taxon; m->gobble(in);
707                                         
708                                         itNames = nameMap.find(name);
709                 
710                                         if (itNames == nameMap.end()) { 
711                                                 m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
712                                         }else{
713                                                 for (int i = 0; i < itNames->second.size(); i++) { 
714                                                         taxaSum.addSeqToTree(itNames->second[i], taxon);  //add it as many times as there are identical seqs
715                                                 }
716                                                 itNames->second.clear();
717                                                 nameMap.erase(itNames->first);
718                                         }
719                                 }
720                                 in.close();
721                         }
722                         m->mothurRemove(tempTaxonomyFile);
723                         
724                         if (m->control_pressed) {  outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);        } delete classify; return 0; }
725                         
726                         //print summary file
727                         ofstream outTaxTree;
728                         m->openOutputFile(taxSummary, outTaxTree);
729                         taxaSum.print(outTaxTree);
730                         outTaxTree.close();
731                         
732                         //output taxonomy with the unclassified bins added
733                         ifstream inTax;
734                         m->openInputFile(newTaxonomyFile, inTax);
735                         
736                         ofstream outTax;
737                         string unclass = newTaxonomyFile + ".unclass.temp";
738                         m->openOutputFile(unclass, outTax);
739                         
740                         //get maxLevel from phylotree so you know how many 'unclassified's to add
741                         int maxLevel = taxaSum.getMaxLevel();
742                                                         
743                         //read taxfile - this reading and rewriting is done to preserve the confidence scores.
744                         string name, taxon;
745                         while (!inTax.eof()) {
746                                 if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);        } m->mothurRemove(unclass); delete classify; return 0; }
747
748                                 inTax >> name >> taxon; m->gobble(inTax);
749                                 
750                                 string newTax = addUnclassifieds(taxon, maxLevel);
751                                 
752                                 outTax << name << '\t' << newTax << endl;
753                         }
754                         inTax.close();  
755                         outTax.close();
756                         
757                         m->mothurRemove(newTaxonomyFile);
758                         rename(unclass.c_str(), newTaxonomyFile.c_str());
759                         
760                         m->mothurOutEndLine();
761                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
762                         
763                         #ifdef USE_MPI  
764                                 }
765                         #endif
766
767                         m->mothurOutEndLine();
768                         m->mothurOut("Output File Names: "); m->mothurOutEndLine();
769                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
770                         m->mothurOutEndLine();
771                 }
772                 
773                 //set taxonomy file as new current taxonomyfile
774                 string current = "";
775                 itTypes = outputTypes.find("taxonomy");
776                 if (itTypes != outputTypes.end()) {
777                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
778                 }
779                 
780                 current = "";
781                 itTypes = outputTypes.find("accnos");
782                 if (itTypes != outputTypes.end()) {
783                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
784                 }
785                 
786                 delete classify;
787                 
788                 return 0;
789         }
790         catch(exception& e) {
791                 m->errorOut(e, "ClassifySeqsCommand", "execute");
792                 exit(1);
793         }
794 }
795
796 /**************************************************************************************************/
797 string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) {
798         try{
799                 string newTax, taxon;
800                 int level = 0;
801                 
802                 //keep what you have counting the levels
803                 while (tax.find_first_of(';') != -1) {
804                         //get taxon
805                         taxon = tax.substr(0,tax.find_first_of(';'))+';';
806                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
807                         newTax += taxon;
808                         level++;
809                 }
810                 
811                 //add "unclassified" until you reach maxLevel
812                 while (level < maxlevel) {
813                         newTax += "unclassified;";
814                         level++;
815                 }
816                 
817                 return newTax;
818         }
819         catch(exception& e) {
820                 m->errorOut(e, "ClassifySeqsCommand", "addUnclassifieds");
821                 exit(1);
822         }
823 }
824
825 /**************************************************************************************************/
826
827 int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string accnos, string filename) {
828         try {
829                 
830                 int num = 0;
831                 processIDS.clear();
832                 
833 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
834                 int process = 1;
835                 
836                 //loop through and create all the processes you want
837                 while (process != processors) {
838                         int pid = fork();
839                         
840                         if (pid > 0) {
841                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
842                                 process++;
843                         }else if (pid == 0){
844                                 num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", filename);
845
846                                 //pass numSeqs to parent
847                                 ofstream out;
848                                 string tempFile = filename + toString(getpid()) + ".num.temp";
849                                 m->openOutputFile(tempFile, out);
850                                 out << num << endl;
851                                 out.close();
852
853                                 exit(0);
854                         }else { 
855                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
856                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
857                                 exit(0);
858                         }
859                 }
860                 
861                 //parent does its part
862                 num = driver(lines[0], taxFileName, tempTaxFile, accnos, filename);
863                 
864                 //force parent to wait until all the processes are done
865                 for (int i=0;i<processIDS.size();i++) { 
866                         int temp = processIDS[i];
867                         wait(&temp);
868                 }
869                 
870                 for (int i = 0; i < processIDS.size(); i++) {
871                         ifstream in;
872                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
873                         m->openInputFile(tempFile, in);
874                         if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
875                         in.close(); m->mothurRemove(m->getFullPathName(tempFile));
876                 }
877 #else
878                 //////////////////////////////////////////////////////////////////////////////////////////////////////
879                 //Windows version shared memory, so be careful when passing variables through the alignData struct. 
880                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
881                 //////////////////////////////////////////////////////////////////////////////////////////////////////
882                 
883                 vector<classifyData*> pDataArray; 
884                 DWORD   dwThreadIdArray[processors-1];
885                 HANDLE  hThreadArray[processors-1]; 
886                 
887                 //Create processor worker threads.
888                 for( int i=0; i<processors-1; i++ ){
889                         // Allocate memory for thread data.
890                         string extension = "";
891                         if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
892                         
893                         classifyData* tempclass = new classifyData((accnos + extension), probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flip);
894                         pDataArray.push_back(tempclass);
895                         
896                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
897                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
898                         hThreadArray[i] = CreateThread(NULL, 0, MyClassThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);  
899                         
900                 }
901                 
902                 //parent does its part
903                 num = driver(lines[processors-1], taxFileName + toString(processors-1) + ".temp", tempTaxFile + toString(processors-1) + ".temp", accnos + toString(processors-1) + ".temp", filename);
904                 processIDS.push_back((processors-1));
905                 
906                 //Wait until all threads have terminated.
907                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
908                 
909                 //Close all thread handles and free memory allocations.
910                 for(int i=0; i < pDataArray.size(); i++){
911                         num += pDataArray[i]->count;
912                         CloseHandle(hThreadArray[i]);
913                         delete pDataArray[i];
914                 }
915                 
916         #endif  
917         vector<string> nonBlankAccnosFiles;
918                 if (!(m->isBlank(accnos))) { nonBlankAccnosFiles.push_back(accnos); }
919                 else { m->mothurRemove(accnos); } //remove so other files can be renamed to it
920         
921                 for(int i=0;i<processIDS.size();i++){
922                         appendTaxFiles((taxFileName + toString(processIDS[i]) + ".temp"), taxFileName);
923                         appendTaxFiles((tempTaxFile + toString(processIDS[i]) + ".temp"), tempTaxFile);
924             if (!(m->isBlank(accnos + toString(processIDS[i]) + ".temp"))) {
925                                 nonBlankAccnosFiles.push_back(accnos + toString(processIDS[i]) + ".temp");
926                         }else { m->mothurRemove((accnos + toString(processIDS[i]) + ".temp"));  }
927
928                         m->mothurRemove((m->getFullPathName(taxFileName) + toString(processIDS[i]) + ".temp"));
929                         m->mothurRemove((m->getFullPathName(tempTaxFile) + toString(processIDS[i]) + ".temp"));
930                 }
931                 
932         //append accnos files
933                 if (nonBlankAccnosFiles.size() != 0) { 
934                         rename(nonBlankAccnosFiles[0].c_str(), accnos.c_str());
935                         
936                         for (int h=1; h < nonBlankAccnosFiles.size(); h++) {
937                                 appendTaxFiles(nonBlankAccnosFiles[h], accnos);
938                                 m->mothurRemove(nonBlankAccnosFiles[h]);
939                         }
940                 }else { //recreate the accnosfile if needed
941                         ofstream out;
942                         m->openOutputFile(accnos, out);
943                         out.close();
944                 }
945
946                 return num;
947                 
948         }
949         catch(exception& e) {
950                 m->errorOut(e, "ClassifySeqsCommand", "createProcesses");
951                 exit(1);
952         }
953 }
954 /**************************************************************************************************/
955
956 void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
957         try{
958                 
959                 ofstream output;
960                 ifstream input;
961                 m->openOutputFileAppend(filename, output);
962                 m->openInputFile(temp, input);
963                 
964                 while(char c = input.get()){
965                         if(input.eof())         {       break;                  }
966                         else                            {       output << c;    }
967                 }
968                 
969                 input.close();
970                 output.close();
971         }
972         catch(exception& e) {
973                 m->errorOut(e, "ClassifySeqsCommand", "appendTaxFiles");
974                 exit(1);
975         }
976 }
977
978 //**********************************************************************************************************************
979
980 int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string accnos, string filename){
981         try {
982                 ofstream outTax;
983                 m->openOutputFile(taxFName, outTax);
984                 
985                 ofstream outTaxSimple;
986                 m->openOutputFile(tempTFName, outTaxSimple);
987                 
988                 ofstream outAcc;
989                 m->openOutputFile(accnos, outAcc);
990         
991                 ifstream inFASTA;
992                 m->openInputFile(filename, inFASTA);
993                 
994                 string taxonomy;
995
996                 inFASTA.seekg(filePos->start);
997
998                 bool done = false;
999                 int count = 0;
1000                 
1001                 while (!done) {
1002                         if (m->control_pressed) { 
1003                                 inFASTA.close();
1004                                 outTax.close();
1005                                 outTaxSimple.close();
1006                                 outAcc.close(); return 0; }
1007                 
1008                         Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA);
1009                         
1010                         if (candidateSeq->getName() != "") {
1011                         
1012                                 taxonomy = classify->getTaxonomy(candidateSeq);
1013                                 
1014                                 if (m->control_pressed) { delete candidateSeq; return 0; }
1015                                 
1016                                 if (taxonomy == "unknown;") { m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); m->mothurOutEndLine(); }
1017                                 
1018                                 //output confidence scores or not
1019                                 if (probs) {
1020                                         outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
1021                                 }else{
1022                                         outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
1023                                 }
1024                                 
1025                                 if (classify->getFlipped()) { outAcc << candidateSeq->getName() << endl; }
1026                                 
1027                                 outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
1028                                 
1029                                 count++;
1030                         }
1031                         delete candidateSeq;
1032                         
1033                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1034                                 unsigned long long pos = inFASTA.tellg();
1035                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
1036                         #else
1037                                 if (inFASTA.eof()) { break; }
1038                         #endif
1039                         
1040                         //report progress
1041                         if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
1042                         
1043                 }
1044                 //report progress
1045                 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
1046                         
1047                 inFASTA.close();
1048                 outTax.close();
1049                 outTaxSimple.close();
1050                 outAcc.close();
1051                 
1052                 return count;
1053         }
1054         catch(exception& e) {
1055                 m->errorOut(e, "ClassifySeqsCommand", "driver");
1056                 exit(1);
1057         }
1058 }
1059 //**********************************************************************************************************************
1060 #ifdef USE_MPI
1061 int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, MPI_File& accFile, vector<unsigned long long>& MPIPos){
1062         try {
1063                 MPI_Status statusNew; 
1064                 MPI_Status statusTemp; 
1065                 MPI_Status statusAcc; 
1066                 MPI_Status status; 
1067                 
1068                 int pid;
1069                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1070         
1071                 string taxonomy;
1072                 string outputString;
1073
1074                 for(int i=0;i<num;i++){
1075                 
1076                         if (m->control_pressed) { return 0; }
1077                 
1078                         //read next sequence
1079                         int length = MPIPos[start+i+1] - MPIPos[start+i];
1080                         char* buf4 = new char[length];
1081                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
1082                         
1083                         string tempBuf = buf4;
1084                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
1085                         istringstream iss (tempBuf,istringstream::in);
1086                         delete buf4;
1087
1088                         Sequence* candidateSeq = new Sequence(iss);
1089                         
1090                         if (candidateSeq->getName() != "") {
1091                                 taxonomy = classify->getTaxonomy(candidateSeq);
1092                                 
1093                                 if (taxonomy == "unknown;") { m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); m->mothurOutEndLine(); }
1094                                 
1095                                 //output confidence scores or not
1096                                 if (probs) {
1097                                         outputString =  candidateSeq->getName() + "\t" + taxonomy + "\n";
1098                                 }else{
1099                                         outputString =  candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
1100                                 }
1101                                 
1102                                 int length = outputString.length();
1103                                 char* buf2 = new char[length];
1104                                 memcpy(buf2, outputString.c_str(), length);
1105                                 
1106                                 MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew);
1107                                 delete buf2;
1108                                 
1109                                 outputString =  candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
1110                                 length = outputString.length();
1111                                 char* buf = new char[length];
1112                                 memcpy(buf, outputString.c_str(), length);
1113                                 
1114                                 MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp);
1115                                 delete buf;
1116                                 
1117                                 if (classify->getFlipped()) { 
1118                                         outputString =  candidateSeq->getName() + "\n";
1119                                         length = outputString.length();
1120                                         char* buf3 = new char[length];
1121                                         memcpy(buf3, outputString.c_str(), length);
1122                                         
1123                                         MPI_File_write_shared(accFile, buf3, length, MPI_CHAR, &statusAcc);
1124                                         delete buf3;
1125                                 }
1126                                 
1127                         }                               
1128                         delete candidateSeq;
1129                         
1130                         if((i+1) % 100 == 0){   cout << "Classifying sequence " << (i+1) << endl;       }
1131                 }
1132                 
1133                 if(num % 100 != 0){     cout << "Classifying sequence " << (num) << endl;       }
1134                 
1135                 
1136                 return 1;
1137         }
1138         catch(exception& e) {
1139                 m->errorOut(e, "ClassifySeqsCommand", "driverMPI");
1140                 exit(1);
1141         }
1142 }
1143
1144 //**********************************************************************************************************************
1145 int ClassifySeqsCommand::MPIReadNamesFile(string nameFilename){
1146         try {
1147         
1148                 nameMap.clear(); //remove old names
1149                 
1150                 MPI_File inMPI;
1151                 MPI_Offset size;
1152                 MPI_Status status;
1153
1154                 //char* inFileName = new char[nameFilename.length()];
1155                 //memcpy(inFileName, nameFilename.c_str(), nameFilename.length());
1156                 
1157                 char inFileName[1024];
1158                 strcpy(inFileName, nameFilename.c_str());
1159
1160                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
1161                 MPI_File_get_size(inMPI, &size);
1162                 //delete inFileName;
1163
1164                 char* buffer = new char[size];
1165                 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
1166
1167                 string tempBuf = buffer;
1168                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
1169                 istringstream iss (tempBuf,istringstream::in);
1170                 delete buffer;
1171                 
1172                 string firstCol, secondCol;
1173                 while(!iss.eof()) {
1174                         iss >> firstCol >> secondCol; m->gobble(iss);
1175                         
1176                         vector<string> temp;
1177                         m->splitAtComma(secondCol, temp);
1178                         
1179                         nameMap[firstCol] = temp;  
1180                 }
1181         
1182                 MPI_File_close(&inMPI);
1183                 
1184                 return 1;
1185         }
1186         catch(exception& e) {
1187                 m->errorOut(e, "ClassifySeqsCommand", "MPIReadNamesFile");
1188                 exit(1);
1189         }
1190 }
1191 #endif
1192 /**************************************************************************************************/