]> git.donarmstrong.com Git - mothur.git/blob - classifyseqscommand.cpp
a few modifications for 1.9
[mothur.git] / classifyseqscommand.cpp
1 /*
2  *  classifyseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/2/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyseqscommand.h"
11 #include "sequence.hpp"
12 #include "bayesian.h"
13 #include "phylotree.h"
14 #include "knn.h"
15
16 //**********************************************************************************************************************
17
18 ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
19         try {
20                 abort = false;
21                 
22                 //allow user to run help
23                 if(option == "help") { help(); abort = true; }
24                 
25                 else {
26                         
27                         //valid paramters for this command
28                         string AlignArray[] =  {"template","fasta","name","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"};
29                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
30                         
31                         OptionParser parser(option);
32                         map<string, string> parameters = parser.getParameters(); 
33                         
34                         ValidParameters validParameter;
35                         map<string, string>::iterator it;
36                         
37                         //check to make sure all parameters are valid for command
38                         for (it = parameters.begin(); it != parameters.end(); it++) { 
39                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
40                         }
41                         
42                         //if the user changes the output directory command factory will send this info to us in the output parameter 
43                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
44                         
45                         //if the user changes the input directory command factory will send this info to us in the output parameter 
46                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
47                         if (inputDir == "not found"){   inputDir = "";          }
48                         else {
49                                 string path;
50                                 it = parameters.find("template");
51                                 //user has given a template file
52                                 if(it != parameters.end()){ 
53                                         path = hasPath(it->second);
54                                         //if the user has not given a path then, add inputdir. else leave path alone.
55                                         if (path == "") {       parameters["template"] = inputDir + it->second;         }
56                                 }
57                                 
58                                 it = parameters.find("taxonomy");
59                                 //user has given a template file
60                                 if(it != parameters.end()){ 
61                                         path = hasPath(it->second);
62                                         //if the user has not given a path then, add inputdir. else leave path alone.
63                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
64                                 }
65                         }
66
67                         //check for required parameters
68                         templateFileName = validParameter.validFile(parameters, "template", true);
69                         if (templateFileName == "not found") { 
70                                 m->mothurOut("template is a required parameter for the classify.seqs command."); 
71                                 m->mothurOutEndLine();
72                                 abort = true; 
73                         }
74                         else if (templateFileName == "not open") { abort = true; }      
75                         
76                         fastaFileName = validParameter.validFile(parameters, "fasta", false);
77                         if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the classify.seqs command."); m->mothurOutEndLine(); abort = true;  }
78                         else { 
79                                 splitAtDash(fastaFileName, fastaFileNames);
80                                 
81                                 //go through files and make sure they are good, if not, then disregard them
82                                 for (int i = 0; i < fastaFileNames.size(); i++) {
83                                         if (inputDir != "") {
84                                                 string path = hasPath(fastaFileNames[i]);
85                                                 //if the user has not given a path then, add inputdir. else leave path alone.
86                                                 if (path == "") {       fastaFileNames[i] = inputDir + fastaFileNames[i];               }
87                                         }
88                                         
89                                         int ableToOpen;
90                                         
91                                         #ifdef USE_MPI  
92                                                 int pid;
93                                                 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
94                                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
95                                 
96                                                 if (pid == 0) {
97                                         #endif
98                                         
99                                         ifstream in;
100                                         ableToOpen = openInputFile(fastaFileNames[i], in);
101                                         in.close();
102                                         
103                                         #ifdef USE_MPI  
104                                                         for (int j = 1; j < processors; j++) {
105                                                                 MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); 
106                                                         }
107                                                 }else{
108                                                         MPI_Status status;
109                                                         MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
110                                                 }
111                                                 
112                                         #endif
113                                         
114                                         if (ableToOpen == 1) { 
115                                                 m->mothurOut(fastaFileNames[i] + " will be disregarded."); m->mothurOutEndLine(); 
116                                                 //erase from file list
117                                                 fastaFileNames.erase(fastaFileNames.begin()+i);
118                                                 i--;
119                                         }
120                                         
121                                 }
122                                 
123                                 //make sure there is at least one valid file left
124                                 if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
125                         }
126
127                         
128                         taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
129                         if (taxonomyFileName == "not found") { 
130                                 m->mothurOut("taxonomy is a required parameter for the classify.seqs command."); 
131                                 m->mothurOutEndLine();
132                                 abort = true; 
133                         }
134                         else if (taxonomyFileName == "not open") { abort = true; }      
135                         
136                         
137                         namefile = validParameter.validFile(parameters, "name", false);
138                         if (namefile == "not found") { namefile = "";  }
139
140                         else { 
141                                 splitAtDash(namefile, namefileNames);
142                                 
143                                 //go through files and make sure they are good, if not, then disregard them
144                                 for (int i = 0; i < namefileNames.size(); i++) {
145                                         if (inputDir != "") {
146                                                 string path = hasPath(namefileNames[i]);
147                                                 //if the user has not given a path then, add inputdir. else leave path alone.
148                                                 if (path == "") {       namefileNames[i] = inputDir + namefileNames[i];         }
149                                         }
150                                         int ableToOpen;
151                                         
152                                         #ifdef USE_MPI  
153                                                 int pid;
154                                                 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
155                                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
156                                 
157                                                 if (pid == 0) {
158                                         #endif
159
160                                         ifstream in;
161                                         ableToOpen = openInputFile(namefileNames[i], in);
162                                         in.close();
163                                         
164                                         #ifdef USE_MPI  
165                                                         for (int j = 1; j < processors; j++) {
166                                                                 MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); 
167                                                         }
168                                                 }else{
169                                                         MPI_Status status;
170                                                         MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
171                                                 }
172                                                 
173                                         #endif
174                                         if (ableToOpen == 1) {  m->mothurOut("Unable to match name file with fasta file."); m->mothurOutEndLine(); abort = true;        }
175                                         
176                                 }
177                         }
178
179                         if (namefile != "") {
180                                 if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); }
181                         }
182                         
183                         //check for optional parameter and set defaults
184                         // ...at some point should added some additional type checking...
185                         string temp;
186                         temp = validParameter.validFile(parameters, "ksize", false);            if (temp == "not found"){       temp = "8";                             }
187                         convert(temp, kmerSize); 
188                         
189                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
190                         convert(temp, processors); 
191                         
192                         search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
193                         
194                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "bayesian";    }
195                         
196                         temp = validParameter.validFile(parameters, "match", false);            if (temp == "not found"){       temp = "1.0";                   }
197                         convert(temp, match);  
198                         
199                         temp = validParameter.validFile(parameters, "mismatch", false);         if (temp == "not found"){       temp = "-1.0";                  }
200                         convert(temp, misMatch);  
201                         
202                         temp = validParameter.validFile(parameters, "gapopen", false);          if (temp == "not found"){       temp = "-2.0";                  }
203                         convert(temp, gapOpen);  
204                         
205                         temp = validParameter.validFile(parameters, "gapextend", false);        if (temp == "not found"){       temp = "-1.0";                  }
206                         convert(temp, gapExtend); 
207                         
208                         temp = validParameter.validFile(parameters, "numwanted", false);        if (temp == "not found"){       temp = "10";                    }
209                         convert(temp, numWanted);
210                         
211                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found"){       temp = "0";                             }
212                         convert(temp, cutoff);
213                         
214                         temp = validParameter.validFile(parameters, "probs", false);            if (temp == "not found"){       temp = "true";                  }
215                         probs = isTrue(temp);
216                         
217                         temp = validParameter.validFile(parameters, "iters", false);            if (temp == "not found") { temp = "100";                        }
218                         convert(temp, iters); 
219
220
221                         
222                         if ((method == "bayesian") && (search != "kmer"))  { 
223                                 m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine();
224                                 search = "kmer";
225                         }
226                 }
227                 
228         }
229         catch(exception& e) {
230                 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
231                 exit(1);
232         }
233 }
234
235 //**********************************************************************************************************************
236
237 ClassifySeqsCommand::~ClassifySeqsCommand(){    
238
239         if (abort == false) {
240                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
241         }
242 }
243
244 //**********************************************************************************************************************
245
246 void ClassifySeqsCommand::help(){
247         try {
248                 m->mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n");
249                 m->mothurOut("The classify.seqs command parameters are template, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n");
250                 m->mothurOut("The template, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
251                 m->mothurOut("The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer, blast and distance. The default is kmer.\n");
252                 m->mothurOut("The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n");
253                 m->mothurOut("The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n");
254                 m->mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
255                 m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
256                 #ifdef USE_MPI
257                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
258                 #endif
259                 m->mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
260                 m->mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n");
261                 m->mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n");
262                 m->mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n");
263                 m->mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n");
264                 m->mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n");
265                 m->mothurOut("The probs parameter shut off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be run.\n");
266                 m->mothurOut("The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method.  The default is 100.\n");
267                 m->mothurOut("The classify.seqs command should be in the following format: \n");
268                 m->mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
269                 m->mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n");
270                 m->mothurOut("The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n");
271                 m->mothurOut("The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n");
272                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
273         }
274         catch(exception& e) {
275                 m->errorOut(e, "ClassifySeqsCommand", "help");
276                 exit(1);
277         }
278 }
279
280
281 //**********************************************************************************************************************
282
283 int ClassifySeqsCommand::execute(){
284         try {
285                 if (abort == true) {    return 0;       }
286                 
287                 if(method == "bayesian"){       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);           }
288                 else if(method == "knn"){       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted);                               }
289                 else {
290                         m->mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
291                         m->mothurOutEndLine();
292                         classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);   
293                 }
294                 
295                 if (m->control_pressed) { delete classify; return 0; }
296                 
297                 vector<string> outputNames;
298                                 
299                 for (int s = 0; s < fastaFileNames.size(); s++) {
300                 
301                         m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
302                         
303                         if (outputDir == "") { outputDir += hasPath(fastaFileNames[s]); }
304                         string newTaxonomyFile = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + getRootName(getSimpleName(taxonomyFileName)) + "taxonomy";
305                         string tempTaxonomyFile = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + "taxonomy.temp";
306                         string taxSummary = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + getRootName(getSimpleName(taxonomyFileName)) + "tax.summary";
307                         
308                         outputNames.push_back(newTaxonomyFile);
309                         outputNames.push_back(taxSummary);
310                         
311                         int start = time(NULL);
312                         int numFastaSeqs = 0;
313                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
314                         
315 #ifdef USE_MPI  
316                                 int pid, end, numSeqsPerProcessor; 
317                                 int tag = 2001;
318                                 vector<long> MPIPos;
319                                 
320                                 MPI_Status status; 
321                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
322                                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
323
324                                 MPI_File inMPI;
325                                 MPI_File outMPINewTax;
326                                 MPI_File outMPITempTax;
327                                                         
328                                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
329                                 int inMode=MPI_MODE_RDONLY; 
330                                 
331                                 //char* outNewTax = new char[newTaxonomyFile.length()];
332                                 //memcpy(outNewTax, newTaxonomyFile.c_str(), newTaxonomyFile.length());
333                                 
334                                 char outNewTax[1024];
335                                 strcpy(outNewTax, newTaxonomyFile.c_str());
336
337                                 //char* outTempTax = new char[tempTaxonomyFile.length()];
338                                 //memcpy(outTempTax, tempTaxonomyFile.c_str(), tempTaxonomyFile.length());
339                                 
340                                 char outTempTax[1024];
341                                 strcpy(outTempTax, tempTaxonomyFile.c_str());
342
343                                 //char* inFileName = new char[fastaFileNames[s].length()];
344                                 //memcpy(inFileName, fastaFileNames[s].c_str(), fastaFileNames[s].length());
345                                 
346                                 char inFileName[1024];
347                                 strcpy(inFileName, fastaFileNames[s].c_str());
348
349                                 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
350                                 MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax);
351                                 MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax);
352                                 
353                                 //delete outNewTax;
354                                 //delete outTempTax;
355                                 //delete inFileName;
356
357                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  delete classify; return 0;  }
358
359                                 if(namefile != "") {  MPIReadNamesFile(namefileNames[s]);  }
360                                 
361                                 if (pid == 0) { //you are the root process 
362                                         
363                                         MPIPos = setFilePosFasta(fastaFileNames[s], numFastaSeqs); //fills MPIPos, returns numSeqs
364                                         
365                                         //send file positions to all processes
366                                         MPI_Bcast(&numFastaSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
367                                         MPI_Bcast(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos   
368                                         
369                                         //figure out how many sequences you have to align
370                                         numSeqsPerProcessor = numFastaSeqs / processors;
371                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
372                                         int startIndex =  pid * numSeqsPerProcessor;
373                                 
374                                         //align your part
375                                         driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos);
376                                         
377                                         if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  for (int i = 0; i < outputNames.size(); i++) {    remove(outputNames[i].c_str()); } delete classify; return 0;  }
378                                         
379                                         for (int i = 1; i < processors; i++) {
380                                                 int done;
381                                                 MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
382                                         }
383                                 }else{ //you are a child process
384                                         MPI_Bcast(&numFastaSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
385                                         MPIPos.resize(numFastaSeqs+1);
386                                         MPI_Bcast(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
387                                         
388                                         //figure out how many sequences you have to align
389                                         numSeqsPerProcessor = numFastaSeqs / processors;
390                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
391                                         int startIndex =  pid * numSeqsPerProcessor;
392                                         
393                                         //align your part
394                                         driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos);
395                                         
396                                         if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  delete classify; return 0;  }
397
398                                         int done = 0;
399                                         MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); 
400                                 }
401                                 
402                                 //close files 
403                                 MPI_File_close(&inMPI);
404                                 MPI_File_close(&outMPINewTax);
405                                 MPI_File_close(&outMPITempTax);
406                                 
407 #else
408                         //read namefile
409                         if(namefile != "") {
410                                 nameMap.clear(); //remove old names
411                                 
412                                 ifstream inNames;
413                                 openInputFile(namefileNames[s], inNames);
414                                 
415                                 string firstCol, secondCol;
416                                 while(!inNames.eof()) {
417                                         inNames >> firstCol >> secondCol; gobble(inNames);
418                                         nameMap[firstCol] = getNumNames(secondCol);  //ex. seq1 seq1,seq3,seq5 -> seq1 = 3.
419                                 }
420                                 inNames.close();
421                         }
422
423         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
424                         if(processors == 1){
425                                 ifstream inFASTA;
426                                 openInputFile(fastaFileNames[s], inFASTA);
427                                 numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
428                                 inFASTA.close();
429                                 
430                                 lines.push_back(new linePair(0, numFastaSeqs));
431                                 
432                                 driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
433                         }
434                         else{
435                                 vector<int> positions;
436                                 processIDS.resize(0);
437                                 
438                                 ifstream inFASTA;
439                                 openInputFile(fastaFileNames[s], inFASTA);
440                                 
441                                 string input;
442                                 while(!inFASTA.eof()){
443                                         input = getline(inFASTA);
444                                         if (input.length() != 0) {
445                                                 if(input[0] == '>'){    int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);       }
446                                         }
447                                 }
448                                 inFASTA.close();
449                                 
450                                 numFastaSeqs = positions.size();
451                                 
452                                 int numSeqsPerProcessor = numFastaSeqs / processors;
453                                 
454                                 for (int i = 0; i < processors; i++) {
455                                         int startPos = positions[ i * numSeqsPerProcessor ];
456                                         if(i == processors - 1){
457                                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
458                                         }
459                                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
460                                 }
461                                 createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); 
462                                 
463                                 rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str());
464                                 rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str());
465                                 
466                                 for(int i=1;i<processors;i++){
467                                         appendTaxFiles((newTaxonomyFile + toString(processIDS[i]) + ".temp"), newTaxonomyFile);
468                                         appendTaxFiles((tempTaxonomyFile + toString(processIDS[i]) + ".temp"), tempTaxonomyFile);
469                                         remove((newTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
470                                         remove((tempTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
471                                 }
472                                 
473                         }
474         #else
475                         ifstream inFASTA;
476                         openInputFile(fastaFileNames[s], inFASTA);
477                         numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
478                         inFASTA.close();
479                         
480                         lines.push_back(new linePair(0, numFastaSeqs));
481                         
482                         driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
483         #endif  
484 #endif
485
486                 #ifdef USE_MPI  
487                         if (pid == 0) {  //this part does not need to be paralellized
488                 #endif
489
490                         //make taxonomy tree from new taxonomy file 
491                         PhyloTree taxaBrowser;
492                         
493                         if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } delete classify; return 0; }
494                         
495                         ifstream in;
496                         openInputFile(tempTaxonomyFile, in);
497                 
498                         //read in users taxonomy file and add sequences to tree
499                         string name, taxon;
500                         while(!in.eof()){
501                                 in >> name >> taxon; gobble(in);
502                                 
503                                 if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } remove(tempTaxonomyFile.c_str()); delete classify; return 0; }
504                                 
505                                 if (namefile != "") {
506                                         itNames = nameMap.find(name);
507                 
508                                         if (itNames == nameMap.end()) { 
509                                                 m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
510                                         }else{
511                                                 for (int i = 0; i < itNames->second; i++) { 
512                                                         taxaBrowser.addSeqToTree(name+toString(i), taxon);  //add it as many times as there are identical seqs
513                                                 }
514                                         }
515                                 }else {  taxaBrowser.addSeqToTree(name, taxon);  } //add it once
516                         }
517                         in.close();
518         
519                         taxaBrowser.assignHeirarchyIDs(0);
520                         
521                         if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } remove(tempTaxonomyFile.c_str()); delete classify; return 0; }
522
523                         taxaBrowser.binUnclassified();
524                         
525                         remove(tempTaxonomyFile.c_str());
526                         
527                         if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } delete classify; return 0; }
528
529                         
530                         //print summary file
531                         ofstream outTaxTree;
532                         openOutputFile(taxSummary, outTaxTree);
533                         taxaBrowser.print(outTaxTree);
534                         outTaxTree.close();
535                         
536                         //output taxonomy with the unclassified bins added
537                         ifstream inTax;
538                         openInputFile(newTaxonomyFile, inTax);
539                         
540                         ofstream outTax;
541                         string unclass = newTaxonomyFile + ".unclass.temp";
542                         openOutputFile(unclass, outTax);
543                         
544                         //get maxLevel from phylotree so you know how many 'unclassified's to add
545                         int maxLevel = taxaBrowser.getMaxLevel();
546                         
547                         //read taxfile - this reading and rewriting is done to preserve the confidence scores.
548                         while (!inTax.eof()) {
549                                 if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; }
550
551                                 inTax >> name >> taxon; gobble(inTax);
552                                 
553                                 string newTax = addUnclassifieds(taxon, maxLevel);
554                                 
555                                 outTax << name << '\t' << newTax << endl;
556                         }
557                         inTax.close();  
558                         outTax.close();
559                         
560                         remove(newTaxonomyFile.c_str());
561                         rename(unclass.c_str(), newTaxonomyFile.c_str());
562                         
563                         #ifdef USE_MPI  
564                                 }
565                         #endif
566
567                         m->mothurOutEndLine();
568                         m->mothurOut("Output File Names: "); m->mothurOutEndLine();
569                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
570                         m->mothurOutEndLine();
571
572                         
573                         m->mothurOutEndLine();
574                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
575                 }
576                 
577                 delete classify;
578                 return 0;
579         }
580         catch(exception& e) {
581                 m->errorOut(e, "ClassifySeqsCommand", "execute");
582                 exit(1);
583         }
584 }
585
586 /**************************************************************************************************/
587 string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) {
588         try{
589                 string newTax, taxon;
590                 int level = 0;
591                 
592                 //keep what you have counting the levels
593                 while (tax.find_first_of(';') != -1) {
594                         //get taxon
595                         taxon = tax.substr(0,tax.find_first_of(';'))+';';
596                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
597                         newTax += taxon;
598                         level++;
599                 }
600                 
601                 //add "unclassified" until you reach maxLevel
602                 while (level < maxlevel) {
603                         newTax += "unclassified;";
604                         level++;
605                 }
606                 
607                 return newTax;
608         }
609         catch(exception& e) {
610                 m->errorOut(e, "ClassifySeqsCommand", "addUnclassifieds");
611                 exit(1);
612         }
613 }
614
615 /**************************************************************************************************/
616
617 void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) {
618         try {
619 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
620                 int process = 0;
621                 //              processIDS.resize(0);
622                 
623                 //loop through and create all the processes you want
624                 while (process != processors) {
625                         int pid = fork();
626                         
627                         if (pid > 0) {
628                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
629                                 process++;
630                         }else if (pid == 0){
631                                 driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename);
632                                 exit(0);
633                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
634                 }
635                 
636                 //force parent to wait until all the processes are done
637                 for (int i=0;i<processors;i++) { 
638                         int temp = processIDS[i];
639                         wait(&temp);
640                 }
641 #endif          
642         }
643         catch(exception& e) {
644                 m->errorOut(e, "ClassifySeqsCommand", "createProcesses");
645                 exit(1);
646         }
647 }
648 /**************************************************************************************************/
649
650 void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
651         try{
652                 
653                 ofstream output;
654                 ifstream input;
655                 openOutputFileAppend(filename, output);
656                 openInputFile(temp, input);
657                 
658                 while(char c = input.get()){
659                         if(input.eof())         {       break;                  }
660                         else                            {       output << c;    }
661                 }
662                 
663                 input.close();
664                 output.close();
665         }
666         catch(exception& e) {
667                 m->errorOut(e, "ClassifySeqsCommand", "appendTaxFiles");
668                 exit(1);
669         }
670 }
671
672 //**********************************************************************************************************************
673
674 int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName, string filename){
675         try {
676                 ofstream outTax;
677                 openOutputFile(taxFName, outTax);
678                 
679                 ofstream outTaxSimple;
680                 openOutputFile(tempTFName, outTaxSimple);
681         
682                 ifstream inFASTA;
683                 openInputFile(filename, inFASTA);
684
685                 inFASTA.seekg(line->start);
686                 
687                 string taxonomy;
688
689                 for(int i=0;i<line->numSeqs;i++){
690                         if (m->control_pressed) { return 0; }
691                         
692                         Sequence* candidateSeq = new Sequence(inFASTA);
693                         
694                         if (candidateSeq->getName() != "") {
695                                 taxonomy = classify->getTaxonomy(candidateSeq);
696                                 
697                                 if (m->control_pressed) { delete candidateSeq; return 0; }
698
699                                 if (taxonomy != "bad seq") {
700                                         //output confidence scores or not
701                                         if (probs) {
702                                                 outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
703                                         }else{
704                                                 outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
705                                         }
706                                         
707                                         outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
708                                 }
709                         }                               
710                         delete candidateSeq;
711                         
712                         if((i+1) % 100 == 0){
713                                 m->mothurOut("Classifying sequence " + toString(i+1)); m->mothurOutEndLine();
714                         }
715                 }
716                 
717                 inFASTA.close();
718                 outTax.close();
719                 outTaxSimple.close();
720                 
721                 return 1;
722         }
723         catch(exception& e) {
724                 m->errorOut(e, "ClassifySeqsCommand", "driver");
725                 exit(1);
726         }
727 }
728 //**********************************************************************************************************************
729 #ifdef USE_MPI
730 int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector<long>& MPIPos){
731         try {
732                 MPI_Status statusNew; 
733                 MPI_Status statusTemp; 
734                 MPI_Status status; 
735                 
736                 int pid;
737                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
738         
739                 string taxonomy;
740                 string outputString;
741
742                 for(int i=0;i<num;i++){
743                 
744                         if (m->control_pressed) { return 0; }
745                 
746                         //read next sequence
747                         int length = MPIPos[start+i+1] - MPIPos[start+i];
748                         char* buf4 = new char[length];
749                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
750                         
751                         string tempBuf = buf4;
752                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
753                         istringstream iss (tempBuf,istringstream::in);
754                         delete buf4;
755
756                         Sequence* candidateSeq = new Sequence(iss);
757                         
758                         if (candidateSeq->getName() != "") {
759                                 taxonomy = classify->getTaxonomy(candidateSeq);
760                                 
761                                 if (taxonomy != "bad seq") {
762                                         //output confidence scores or not
763                                         if (probs) {
764                                                 outputString =  candidateSeq->getName() + "\t" + taxonomy + "\n";
765                                         }else{
766                                                 outputString =  candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
767                                         }
768                                         
769                                         int length = outputString.length();
770                                         char* buf2 = new char[length];
771                                         memcpy(buf2, outputString.c_str(), length);
772                                 
773                                         MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew);
774                                         delete buf2;
775
776                                         outputString =  candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
777                                         length = outputString.length();
778                                         char* buf = new char[length];
779                                         memcpy(buf, outputString.c_str(), length);
780                                 
781                                         MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp);
782                                         delete buf;
783                                 }
784                         }                               
785                         delete candidateSeq;
786                         
787                         if((i+1) % 100 == 0){   cout << "Classifying sequence " << (i+1) << endl;       }
788                 }
789                 
790                 if(num % 100 != 0){     cout << "Classifying sequence " << (num) << endl;       }
791                 
792                 
793                 return 1;
794         }
795         catch(exception& e) {
796                 m->errorOut(e, "ClassifySeqsCommand", "driverMPI");
797                 exit(1);
798         }
799 }
800
801 //**********************************************************************************************************************
802 int ClassifySeqsCommand::MPIReadNamesFile(string nameFilename){
803         try {
804         
805                 nameMap.clear(); //remove old names
806                 
807                 MPI_File inMPI;
808                 MPI_Offset size;
809                 MPI_Status status;
810
811                 //char* inFileName = new char[nameFilename.length()];
812                 //memcpy(inFileName, nameFilename.c_str(), nameFilename.length());
813                 
814                 char inFileName[1024];
815                 strcpy(inFileName, nameFilename.c_str());
816
817                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
818                 MPI_File_get_size(inMPI, &size);
819                 //delete inFileName;
820
821                 char* buffer = new char[size];
822                 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
823
824                 string tempBuf = buffer;
825                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
826                 istringstream iss (tempBuf,istringstream::in);
827                 delete buffer;
828                 
829                 string firstCol, secondCol;
830                 while(!iss.eof()) {
831                         iss >> firstCol >> secondCol; gobble(iss);
832                         nameMap[firstCol] = getNumNames(secondCol);  //ex. seq1 seq1,seq3,seq5 -> seq1 = 3.
833                 }
834         
835                 MPI_File_close(&inMPI);
836                 
837                 return 1;
838         }
839         catch(exception& e) {
840                 m->errorOut(e, "ClassifySeqsCommand", "MPIReadNamesFile");
841                 exit(1);
842         }
843 }
844 #endif
845 /**************************************************************************************************/