]> git.donarmstrong.com Git - mothur.git/blob - classifyseqscommand.cpp
working on current change
[mothur.git] / classifyseqscommand.cpp
1 /*
2  *  classifyseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/2/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyseqscommand.h"
11 #include "sequence.hpp"
12 #include "bayesian.h"
13 #include "phylotree.h"
14 #include "phylosummary.h"
15 #include "knn.h"
16
17
18 //**********************************************************************************************************************
19 vector<string> ClassifySeqsCommand::setParameters(){    
20         try {
21                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy);
22                 CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate);
23                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
24                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
25                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
26                 CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance", "kmer", "", "", "",false,false); parameters.push_back(psearch);
27                 CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize);
28                 CommandParameter pmethod("method", "Multiple", "bayesian-knn", "bayesian", "", "", "",false,false); parameters.push_back(pmethod);
29                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
30                 CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch);
31                 CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch);
32                 CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen);
33                 CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend);
34                 CommandParameter pcutoff("cutoff", "Number", "", "0", "", "", "",false,true); parameters.push_back(pcutoff);
35                 CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs);
36                 CommandParameter piters("iters", "Number", "", "100", "", "", "",false,true); parameters.push_back(piters);
37                 CommandParameter pnumwanted("numwanted", "Number", "", "10", "", "", "",false,true); parameters.push_back(pnumwanted);
38                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
39                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
40                 
41                 vector<string> myArray;
42                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "ClassifySeqsCommand", "setParameters");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 string ClassifySeqsCommand::getHelpString(){    
52         try {
53                 string helpString = "";
54                 helpString += "The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n";
55                 helpString += "The classify.seqs command parameters are reference, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n";
56                 helpString += "The reference, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n";
57                 helpString += "The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer, blast and distance. The default is kmer.\n";
58                 helpString += "The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n";
59                 helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n";
60                 helpString += "The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n";
61                 helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n";
62                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
63 #ifdef USE_MPI
64                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
65 #endif
66                 helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n";
67                 helpString += "The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n";
68                 helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
69                 helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n";
70                 helpString += "The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n";
71                 helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n";
72                 helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n";
73                 helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method.  The default is 100.\n";
74                 helpString += "The classify.seqs command should be in the following format: \n";
75                 helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n";
76                 helpString += "Example classify.seqs(fasta=amazon.fasta, reference=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n";
77                 helpString += "The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n";
78                 helpString += "The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n";
79                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
80                 return helpString;
81         }
82         catch(exception& e) {
83                 m->errorOut(e, "ClassifySeqsCommand", "getHelpString");
84                 exit(1);
85         }
86 }
87 //**********************************************************************************************************************
88 ClassifySeqsCommand::ClassifySeqsCommand(){     
89         try {
90                 abort = true; calledHelp = true; 
91                 setParameters();
92                 vector<string> tempOutNames;
93                 outputTypes["taxonomy"] = tempOutNames;
94                 outputTypes["taxsummary"] = tempOutNames;
95                 outputTypes["matchdist"] = tempOutNames;
96         }
97         catch(exception& e) {
98                 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
99                 exit(1);
100         }
101 }
102 //**********************************************************************************************************************
103 ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
104         try {
105                 abort = false; calledHelp = false;   
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; calledHelp = true; }
109                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
110                 
111                 else {
112                         vector<string> myArray = setParameters();
113                         
114                         OptionParser parser(option);
115                         map<string, string> parameters = parser.getParameters(); 
116                         
117                         ValidParameters validParameter("classify.seqs");
118                         map<string, string>::iterator it;
119                         
120                         //check to make sure all parameters are valid for command
121                         for (it = parameters.begin(); it != parameters.end(); it++) { 
122                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
123                         }
124                         
125                         //initialize outputTypes
126                         vector<string> tempOutNames;
127                         outputTypes["taxonomy"] = tempOutNames;
128                         outputTypes["taxsummary"] = tempOutNames;
129                         outputTypes["matchdist"] = tempOutNames;
130                         
131                         //if the user changes the output directory command factory will send this info to us in the output parameter 
132                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
133                         
134                         //if the user changes the input directory command factory will send this info to us in the output parameter 
135                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
136                         if (inputDir == "not found"){   inputDir = "";          }
137                         else {
138                                 string path;
139                                 it = parameters.find("reference");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
145                                 }
146                                 
147                                 it = parameters.find("taxonomy");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
153                                 }
154                                 
155                                 it = parameters.find("group");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
161                                 }
162                         }
163
164                         //check for required parameters
165                         templateFileName = validParameter.validFile(parameters, "reference", true);
166                         if (templateFileName == "not found") { 
167                                 m->mothurOut("reference is a required parameter for the classify.seqs command."); 
168                                 m->mothurOutEndLine();
169                                 abort = true; 
170                         }
171                         else if (templateFileName == "not open") { abort = true; }      
172                         
173                                                 
174                         fastaFileName = validParameter.validFile(parameters, "fasta", false);
175                         if (fastaFileName == "not found") {                             
176                                 //if there is a current fasta file, use it
177                                 string filename = m->getFastaFile(); 
178                                 if (filename != "") { fastaFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
179                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
180                         }
181                         else { 
182                                 m->splitAtDash(fastaFileName, fastaFileNames);
183                                 
184                                 //go through files and make sure they are good, if not, then disregard them
185                                 for (int i = 0; i < fastaFileNames.size(); i++) {
186                                         
187                                         bool ignore = false;
188                                         if (fastaFileNames[i] == "current") { 
189                                                 fastaFileNames[i] = m->getFastaFile(); 
190                                                 if (fastaFileNames[i] != "") {  m->mothurOut("Using " + fastaFileNames[i] + " as input file for the fasta parameter where you had given current."); m->mothurOutEndLine(); }
191                                                 else {  
192                                                         m->mothurOut("You have no current fastafile, ignoring current."); m->mothurOutEndLine(); ignore=true; 
193                                                         //erase from file list
194                                                         fastaFileNames.erase(fastaFileNames.begin()+i);
195                                                         i--;
196                                                 }
197                                         }
198                                         
199                                         if (!ignore) {
200                                                 
201                                                 if (inputDir != "") {
202                                                         string path = m->hasPath(fastaFileNames[i]);
203                                                         //if the user has not given a path then, add inputdir. else leave path alone.
204                                                         if (path == "") {       fastaFileNames[i] = inputDir + fastaFileNames[i];               }
205                                                 }
206                                                 
207                                                 int ableToOpen;
208                                                 
209                                                 ifstream in;
210                                                 ableToOpen = m->openInputFile(fastaFileNames[i], in, "noerror");
211                                         
212                                                 //if you can't open it, try default location
213                                                 if (ableToOpen == 1) {
214                                                         if (m->getDefaultPath() != "") { //default path is set
215                                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]);
216                                                                 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
217                                                                 ifstream in2;
218                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
219                                                                 in2.close();
220                                                                 fastaFileNames[i] = tryPath;
221                                                         }
222                                                 }
223                                                 
224                                                 if (ableToOpen == 1) {
225                                                         if (m->getOutputDir() != "") { //default path is set
226                                                                 string tryPath = m->getOutputDir() + m->getSimpleName(fastaFileNames[i]);
227                                                                 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
228                                                                 ifstream in2;
229                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
230                                                                 in2.close();
231                                                                 fastaFileNames[i] = tryPath;
232                                                         }
233                                                 }
234                                                 
235                                                 in.close();
236                                                 
237                                                 if (ableToOpen == 1) { 
238                                                         m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); 
239                                                         //erase from file list
240                                                         fastaFileNames.erase(fastaFileNames.begin()+i);
241                                                         i--;
242                                                 }else {
243                                                         m->setFastaFile(fastaFileNames[i]);
244                                                 }
245                                         }
246                                         
247                                 }
248                                 
249                                 //make sure there is at least one valid file left
250                                 if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
251                         }
252
253                         
254                         taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
255                         if (taxonomyFileName == "not found") { 
256                                 m->mothurOut("taxonomy is a required parameter for the classify.seqs command."); 
257                                 m->mothurOutEndLine();
258                                 abort = true; 
259                         }
260                         else if (taxonomyFileName == "not open") { abort = true; }      
261                         
262                         
263                         namefile = validParameter.validFile(parameters, "name", false);
264                         if (namefile == "not found") { namefile = "";  }
265
266                         else { 
267                                 m->splitAtDash(namefile, namefileNames);
268                                 
269                                 //go through files and make sure they are good, if not, then disregard them
270                                 for (int i = 0; i < namefileNames.size(); i++) {
271                                         bool ignore = false;
272                                         if (namefileNames[i] == "current") { 
273                                                 namefileNames[i] = m->getNameFile(); 
274                                                 if (namefileNames[i] != "") {  m->mothurOut("Using " + namefileNames[i] + " as input file for the name parameter where you had given current."); m->mothurOutEndLine(); }
275                                                 else {  
276                                                         m->mothurOut("You have no current namefile, ignoring current."); m->mothurOutEndLine(); ignore=true; 
277                                                         //erase from file list
278                                                         namefileNames.erase(namefileNames.begin()+i);
279                                                         i--;
280                                                 }
281                                         }
282                                         
283                                         if (!ignore) {
284                                                 
285                                                 if (inputDir != "") {
286                                                         string path = m->hasPath(namefileNames[i]);
287                                                         //if the user has not given a path then, add inputdir. else leave path alone.
288                                                         if (path == "") {       namefileNames[i] = inputDir + namefileNames[i];         }
289                                                 }
290                                                 int ableToOpen;
291                                                 
292                                                 ifstream in;
293                                                 ableToOpen = m->openInputFile(namefileNames[i], in, "noerror");
294                                         
295                                                 //if you can't open it, try default location
296                                                 if (ableToOpen == 1) {
297                                                         if (m->getDefaultPath() != "") { //default path is set
298                                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(namefileNames[i]);
299                                                                 m->mothurOut("Unable to open " + namefileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
300                                                                 ifstream in2;
301                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
302                                                                 in2.close();
303                                                                 namefileNames[i] = tryPath;
304                                                         }
305                                                 }
306                                                 
307                                                 if (ableToOpen == 1) {
308                                                         if (m->getOutputDir() != "") { //default path is set
309                                                                 string tryPath = m->getOutputDir() + m->getSimpleName(namefileNames[i]);
310                                                                 m->mothurOut("Unable to open " + namefileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
311                                                                 ifstream in2;
312                                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
313                                                                 in2.close();
314                                                                 namefileNames[i] = tryPath;
315                                                         }
316                                                 }
317                                                 in.close();
318                                                 
319                                                 if (ableToOpen == 1) { 
320                                                         m->mothurOut("Unable to open " + namefileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();  abort = true;
321                                                         //erase from file list
322                                                         namefileNames.erase(namefileNames.begin()+i);
323                                                         i--;
324                                                 }else {
325                                                         m->setNameFile(namefileNames[i]);
326                                                 }
327                                         }
328                                 }
329                         }
330
331                         if (namefile != "") {
332                                 if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); }
333                         }
334                         
335                         groupfile = validParameter.validFile(parameters, "group", false);
336                         if (groupfile == "not found") { groupfile = "";  }
337                         else { 
338                                 m->splitAtDash(groupfile, groupfileNames);
339                                 
340                                 //go through files and make sure they are good, if not, then disregard them
341                                 for (int i = 0; i < groupfileNames.size(); i++) {
342                                         if (inputDir != "") {
343                                                 string path = m->hasPath(groupfileNames[i]);
344                                                 //if the user has not given a path then, add inputdir. else leave path alone.
345                                                 if (path == "") {       groupfileNames[i] = inputDir + groupfileNames[i];               }
346                                         }
347                                         int ableToOpen;
348                                         
349                                         ifstream in;
350                                         ableToOpen = m->openInputFile(groupfileNames[i], in, "noerror");
351                                 
352                                         //if you can't open it, try default location
353                                         if (ableToOpen == 1) {
354                                                 if (m->getDefaultPath() != "") { //default path is set
355                                                         string tryPath = m->getDefaultPath() + m->getSimpleName(groupfileNames[i]);
356                                                         m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
357                                                         ifstream in2;
358                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
359                                                         in2.close();
360                                                         groupfileNames[i] = tryPath;
361                                                 }
362                                         }
363                                         
364                                         if (ableToOpen == 1) {
365                                                 if (m->getOutputDir() != "") { //default path is set
366                                                         string tryPath = m->getOutputDir() + m->getSimpleName(groupfileNames[i]);
367                                                         m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
368                                                         ifstream in2;
369                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
370                                                         in2.close();
371                                                         groupfileNames[i] = tryPath;
372                                                 }
373                                         }
374                                         
375                                         in.close();
376                                         
377                                         if (ableToOpen == 1) { 
378                                                 m->mothurOut("Unable to open " + groupfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); groupfileNames[i] = "";
379                                                 //erase from file list
380                                                 groupfileNames.erase(groupfileNames.begin()+i);
381                                                 i--;
382                                         }else {
383                                                 m->setGroupFile(groupfileNames[i]);
384                                         }
385                                 }
386                         }
387
388                         if (groupfile != "") {
389                                 if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); }
390                         }else {
391                                 for (int i = 0; i < fastaFileNames.size(); i++) {  groupfileNames.push_back("");  }
392                         }
393                         
394                         //check for optional parameter and set defaults
395                         // ...at some point should added some additional type checking...
396                         string temp;
397                         temp = validParameter.validFile(parameters, "ksize", false);            if (temp == "not found"){       temp = "8";                             }
398                         convert(temp, kmerSize); 
399                         
400                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
401                         m->setProcessors(temp);
402                         convert(temp, processors); 
403                         
404                         search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
405                         
406                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "bayesian";    }
407                         
408                         temp = validParameter.validFile(parameters, "match", false);            if (temp == "not found"){       temp = "1.0";                   }
409                         convert(temp, match);  
410                         
411                         temp = validParameter.validFile(parameters, "mismatch", false);         if (temp == "not found"){       temp = "-1.0";                  }
412                         convert(temp, misMatch);  
413                         
414                         temp = validParameter.validFile(parameters, "gapopen", false);          if (temp == "not found"){       temp = "-2.0";                  }
415                         convert(temp, gapOpen);  
416                         
417                         temp = validParameter.validFile(parameters, "gapextend", false);        if (temp == "not found"){       temp = "-1.0";                  }
418                         convert(temp, gapExtend); 
419                         
420                         temp = validParameter.validFile(parameters, "numwanted", false);        if (temp == "not found"){       temp = "10";                    }
421                         convert(temp, numWanted);
422                         
423                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found"){       temp = "0";                             }
424                         convert(temp, cutoff);
425                         
426                         temp = validParameter.validFile(parameters, "probs", false);            if (temp == "not found"){       temp = "true";                  }
427                         probs = m->isTrue(temp);
428                         
429                         temp = validParameter.validFile(parameters, "iters", false);            if (temp == "not found") { temp = "100";                        }
430                         convert(temp, iters); 
431
432
433                         
434                         if ((method == "bayesian") && (search != "kmer"))  { 
435                                 m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine();
436                                 search = "kmer";
437                         }
438                 }
439                 
440         }
441         catch(exception& e) {
442                 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
443                 exit(1);
444         }
445 }
446
447 //**********************************************************************************************************************
448 ClassifySeqsCommand::~ClassifySeqsCommand(){    
449         if (abort == false) {
450                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
451         }
452 }
453 //**********************************************************************************************************************
454
455 int ClassifySeqsCommand::execute(){
456         try {
457                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
458                 
459                 if(method == "bayesian"){       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);           }
460                 else if(method == "knn"){       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted);                               }
461                 else {
462                         m->mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
463                         m->mothurOutEndLine();
464                         classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);   
465                 }
466                 
467                 if (m->control_pressed) { delete classify; return 0; }
468                 
469                                 
470                 for (int s = 0; s < fastaFileNames.size(); s++) {
471                 
472                         m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
473                         
474                         string RippedTaxName = m->getRootName(m->getSimpleName(taxonomyFileName));
475                         RippedTaxName = m->getExtension(RippedTaxName.substr(0, RippedTaxName.length()-1));
476                         if (RippedTaxName[0] == '.') { RippedTaxName = RippedTaxName.substr(1, RippedTaxName.length()); }
477                         RippedTaxName +=  "."; 
478                 
479                         if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); }
480                         string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "taxonomy";
481                         string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp";
482                         string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "tax.summary";
483                         
484                         if ((method == "knn") && (search == "distance")) { 
485                                 string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "match.dist";
486                                 classify->setDistName(DistName);  outputNames.push_back(DistName); outputTypes["matchdist"].push_back(DistName);
487                         }
488                         
489                         outputNames.push_back(newTaxonomyFile); outputTypes["taxonomy"].push_back(newTaxonomyFile);
490                         outputNames.push_back(taxSummary);      outputTypes["taxsummary"].push_back(taxSummary);
491                         
492                         int start = time(NULL);
493                         int numFastaSeqs = 0;
494                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
495                         
496 #ifdef USE_MPI  
497                                 int pid, numSeqsPerProcessor; 
498                                 int tag = 2001;
499                                 vector<unsigned long int> MPIPos;
500                                 
501                                 MPI_Status status; 
502                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
503                                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
504
505                                 MPI_File inMPI;
506                                 MPI_File outMPINewTax;
507                                 MPI_File outMPITempTax;
508                                                         
509                                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
510                                 int inMode=MPI_MODE_RDONLY; 
511                                 
512                                 char outNewTax[1024];
513                                 strcpy(outNewTax, newTaxonomyFile.c_str());
514                                 
515                                 char outTempTax[1024];
516                                 strcpy(outTempTax, tempTaxonomyFile.c_str());
517                                 
518                                 char inFileName[1024];
519                                 strcpy(inFileName, fastaFileNames[s].c_str());
520
521                                 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
522                                 MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax);
523                                 MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax);
524                                 
525                                 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  delete classify; return 0;  }
526                                 
527                                 if (pid == 0) { //you are the root process 
528                                         
529                                         MPIPos = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs); //fills MPIPos, returns numSeqs
530                                         
531                                         //send file positions to all processes
532                                         for(int i = 1; i < processors; i++) { 
533                                                 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
534                                                 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
535                                         }
536                                         
537                                         //figure out how many sequences you have to align
538                                         numSeqsPerProcessor = numFastaSeqs / processors;
539                                         int startIndex =  pid * numSeqsPerProcessor;
540                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
541                                         
542                                 
543                                         //align your part
544                                         driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos);
545                                         
546                                         if (m->control_pressed) {  outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } delete classify; return 0;  }
547                                         
548                                         for (int i = 1; i < processors; i++) {
549                                                 int done;
550                                                 MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
551                                         }
552                                 }else{ //you are a child process
553                                         MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
554                                         MPIPos.resize(numFastaSeqs+1);
555                                         MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
556                                         
557                                         //figure out how many sequences you have to align
558                                         numSeqsPerProcessor = numFastaSeqs / processors;
559                                         int startIndex =  pid * numSeqsPerProcessor;
560                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
561                                         
562                                         
563                                         //align your part
564                                         driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos);
565                                         
566                                         if (m->control_pressed) {  outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  delete classify; return 0;  }
567
568                                         int done = 0;
569                                         MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); 
570                                 }
571                                 
572                                 //close files 
573                                 MPI_File_close(&inMPI);
574                                 MPI_File_close(&outMPINewTax);
575                                 MPI_File_close(&outMPITempTax);
576                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
577                                 
578 #else
579                 
580                         vector<unsigned long int> positions = m->divideFile(fastaFileNames[s], processors);
581                                 
582                         for (int i = 0; i < (positions.size()-1); i++) {
583                                 lines.push_back(new linePair(positions[i], positions[(i+1)]));
584                         }       
585                         
586                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
587                         if(processors == 1){
588                                 numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
589                         }
590                         else{
591                                 processIDS.resize(0);
592                                 
593                                 numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); 
594                                 
595                         }
596         #else
597                         numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
598         #endif  
599 #endif
600
601                 m->mothurOutEndLine();
602                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
603                 start = time(NULL);
604
605
606                 #ifdef USE_MPI  
607                         if (pid == 0) {  //this part does not need to be paralellized
608                         
609                                 if(namefile != "") { m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush();  MPIReadNamesFile(namefileNames[s]);  m->mothurOut("  Done."); m->mothurOutEndLine(); }
610                 #else
611                         //read namefile
612                         if(namefile != "") {
613                         
614                             m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush();
615                                 
616                                 nameMap.clear(); //remove old names
617                                 
618                                 ifstream inNames;
619                                 m->openInputFile(namefileNames[s], inNames);
620                                 
621                                 string firstCol, secondCol;
622                                 while(!inNames.eof()) {
623                                         inNames >> firstCol >> secondCol; m->gobble(inNames);
624                                         
625                                         vector<string> temp;
626                                         m->splitAtComma(secondCol, temp);
627                         
628                                         nameMap[firstCol] = temp;  
629                                 }
630                                 inNames.close();
631                                 
632                                 m->mothurOut("  Done."); m->mothurOutEndLine();
633                         }
634                 #endif
635
636                         string group = "";
637                         if (groupfile != "") {  group = groupfileNames[s]; }
638                         
639                         PhyloSummary taxaSum(taxonomyFileName, group);
640                         
641                         if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } delete classify; return 0; }
642                 
643                         if (namefile == "") {  taxaSum.summarize(tempTaxonomyFile);  }
644                         else {
645                                 ifstream in;
646                                 m->openInputFile(tempTaxonomyFile, in);
647                                 
648                                 //read in users taxonomy file and add sequences to tree
649                                 string name, taxon;
650                                 
651                                 while(!in.eof()){
652                                         in >> name >> taxon; m->gobble(in);
653                                         
654                                         itNames = nameMap.find(name);
655                 
656                                         if (itNames == nameMap.end()) { 
657                                                 m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
658                                         }else{
659                                                 for (int i = 0; i < itNames->second.size(); i++) { 
660                                                         taxaSum.addSeqToTree(itNames->second[i], taxon);  //add it as many times as there are identical seqs
661                                                 }
662                                                 itNames->second.clear();
663                                                 nameMap.erase(itNames->first);
664                                         }
665                                 }
666                                 in.close();
667                         }
668                         remove(tempTaxonomyFile.c_str());
669                         
670                         if (m->control_pressed) {  outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } delete classify; return 0; }
671                         
672                         //print summary file
673                         ofstream outTaxTree;
674                         m->openOutputFile(taxSummary, outTaxTree);
675                         taxaSum.print(outTaxTree);
676                         outTaxTree.close();
677                         
678                         //output taxonomy with the unclassified bins added
679                         ifstream inTax;
680                         m->openInputFile(newTaxonomyFile, inTax);
681                         
682                         ofstream outTax;
683                         string unclass = newTaxonomyFile + ".unclass.temp";
684                         m->openOutputFile(unclass, outTax);
685                         
686                         //get maxLevel from phylotree so you know how many 'unclassified's to add
687                         int maxLevel = taxaSum.getMaxLevel();
688                                                         
689                         //read taxfile - this reading and rewriting is done to preserve the confidence scores.
690                         string name, taxon;
691                         while (!inTax.eof()) {
692                                 if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; }
693
694                                 inTax >> name >> taxon; m->gobble(inTax);
695                                 
696                                 string newTax = addUnclassifieds(taxon, maxLevel);
697                                 
698                                 outTax << name << '\t' << newTax << endl;
699                         }
700                         inTax.close();  
701                         outTax.close();
702                         
703                         remove(newTaxonomyFile.c_str());
704                         rename(unclass.c_str(), newTaxonomyFile.c_str());
705                         
706                         m->mothurOutEndLine();
707                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
708                         
709                         #ifdef USE_MPI  
710                                 }
711                         #endif
712
713                         m->mothurOutEndLine();
714                         m->mothurOut("Output File Names: "); m->mothurOutEndLine();
715                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
716                         m->mothurOutEndLine();
717                 }
718                 
719                 //set taxonomy file as new current taxonomyfile
720                 string current = "";
721                 itTypes = outputTypes.find("taxonomy");
722                 if (itTypes != outputTypes.end()) {
723                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
724                 }
725                 
726                 delete classify;
727                 return 0;
728         }
729         catch(exception& e) {
730                 m->errorOut(e, "ClassifySeqsCommand", "execute");
731                 exit(1);
732         }
733 }
734
735 /**************************************************************************************************/
736 string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) {
737         try{
738                 string newTax, taxon;
739                 int level = 0;
740                 
741                 //keep what you have counting the levels
742                 while (tax.find_first_of(';') != -1) {
743                         //get taxon
744                         taxon = tax.substr(0,tax.find_first_of(';'))+';';
745                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
746                         newTax += taxon;
747                         level++;
748                 }
749                 
750                 //add "unclassified" until you reach maxLevel
751                 while (level < maxlevel) {
752                         newTax += "unclassified;";
753                         level++;
754                 }
755                 
756                 return newTax;
757         }
758         catch(exception& e) {
759                 m->errorOut(e, "ClassifySeqsCommand", "addUnclassifieds");
760                 exit(1);
761         }
762 }
763
764 /**************************************************************************************************/
765
766 int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) {
767         try {
768 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
769                 int process = 1;
770                 int num = 0;
771                 
772                 //loop through and create all the processes you want
773                 while (process != processors) {
774                         int pid = fork();
775                         
776                         if (pid > 0) {
777                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
778                                 process++;
779                         }else if (pid == 0){
780                                 num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename);
781                                 
782                                 //pass numSeqs to parent
783                                 ofstream out;
784                                 string tempFile = filename + toString(getpid()) + ".num.temp";
785                                 m->openOutputFile(tempFile, out);
786                                 out << num << endl;
787                                 out.close();
788
789                                 exit(0);
790                         }else { 
791                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
792                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
793                                 exit(0);
794                         }
795                 }
796                 
797                 //parent does its part
798                 num = driver(lines[0], taxFileName, tempTaxFile, filename);
799                 
800                 //force parent to wait until all the processes are done
801                 for (int i=0;i<processIDS.size();i++) { 
802                         int temp = processIDS[i];
803                         wait(&temp);
804                 }
805                 
806                 for (int i = 0; i < processIDS.size(); i++) {
807                         ifstream in;
808                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
809                         m->openInputFile(tempFile, in);
810                         if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
811                         in.close(); remove(tempFile.c_str());
812                 }
813                 
814                 for(int i=0;i<processIDS.size();i++){
815                         appendTaxFiles((taxFileName + toString(processIDS[i]) + ".temp"), taxFileName);
816                         appendTaxFiles((tempTaxFile + toString(processIDS[i]) + ".temp"), tempTaxFile);
817                         remove((taxFileName + toString(processIDS[i]) + ".temp").c_str());
818                         remove((tempTaxFile + toString(processIDS[i]) + ".temp").c_str());
819                 }
820                 
821                 return num;
822 #endif          
823         }
824         catch(exception& e) {
825                 m->errorOut(e, "ClassifySeqsCommand", "createProcesses");
826                 exit(1);
827         }
828 }
829 /**************************************************************************************************/
830
831 void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
832         try{
833                 
834                 ofstream output;
835                 ifstream input;
836                 m->openOutputFileAppend(filename, output);
837                 m->openInputFile(temp, input);
838                 
839                 while(char c = input.get()){
840                         if(input.eof())         {       break;                  }
841                         else                            {       output << c;    }
842                 }
843                 
844                 input.close();
845                 output.close();
846         }
847         catch(exception& e) {
848                 m->errorOut(e, "ClassifySeqsCommand", "appendTaxFiles");
849                 exit(1);
850         }
851 }
852
853 //**********************************************************************************************************************
854
855 int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string filename){
856         try {
857                 ofstream outTax;
858                 m->openOutputFile(taxFName, outTax);
859                 
860                 ofstream outTaxSimple;
861                 m->openOutputFile(tempTFName, outTaxSimple);
862         
863                 ifstream inFASTA;
864                 m->openInputFile(filename, inFASTA);
865                 
866                 string taxonomy;
867
868                 inFASTA.seekg(filePos->start);
869
870                 bool done = false;
871                 int count = 0;
872                 
873                 while (!done) {
874                         if (m->control_pressed) { return 0; }
875                 
876                         Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA);
877                         
878                         if (candidateSeq->getName() != "") {
879                         
880                                 taxonomy = classify->getTaxonomy(candidateSeq);
881                                 
882                                 if (m->control_pressed) { delete candidateSeq; return 0; }
883
884                                 if (taxonomy != "bad seq") {
885                                         //output confidence scores or not
886                                         if (probs) {
887                                                 outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
888                                         }else{
889                                                 outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
890                                         }
891                                         
892                                         outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
893                                 }
894                                 count++;
895                         }
896                         delete candidateSeq;
897                         
898                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
899                                 unsigned long int pos = inFASTA.tellg();
900                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
901                         #else
902                                 if (inFASTA.eof()) { break; }
903                         #endif
904                         
905                         //report progress
906                         if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
907                         
908                 }
909                 //report progress
910                 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
911                         
912                 inFASTA.close();
913                 outTax.close();
914                 outTaxSimple.close();
915                 
916                 return count;
917         }
918         catch(exception& e) {
919                 m->errorOut(e, "ClassifySeqsCommand", "driver");
920                 exit(1);
921         }
922 }
923 //**********************************************************************************************************************
924 #ifdef USE_MPI
925 int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector<unsigned long int>& MPIPos){
926         try {
927                 MPI_Status statusNew; 
928                 MPI_Status statusTemp; 
929                 MPI_Status status; 
930                 
931                 int pid;
932                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
933         
934                 string taxonomy;
935                 string outputString;
936
937                 for(int i=0;i<num;i++){
938                 
939                         if (m->control_pressed) { return 0; }
940                 
941                         //read next sequence
942                         int length = MPIPos[start+i+1] - MPIPos[start+i];
943                         char* buf4 = new char[length];
944                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
945                         
946                         string tempBuf = buf4;
947                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
948                         istringstream iss (tempBuf,istringstream::in);
949                         delete buf4;
950
951                         Sequence* candidateSeq = new Sequence(iss);
952                         
953                         if (candidateSeq->getName() != "") {
954                                 taxonomy = classify->getTaxonomy(candidateSeq);
955                                 
956                                 if (taxonomy != "bad seq") {
957                                         //output confidence scores or not
958                                         if (probs) {
959                                                 outputString =  candidateSeq->getName() + "\t" + taxonomy + "\n";
960                                         }else{
961                                                 outputString =  candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
962                                         }
963                                         
964                                         int length = outputString.length();
965                                         char* buf2 = new char[length];
966                                         memcpy(buf2, outputString.c_str(), length);
967                                 
968                                         MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew);
969                                         delete buf2;
970
971                                         outputString =  candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
972                                         length = outputString.length();
973                                         char* buf = new char[length];
974                                         memcpy(buf, outputString.c_str(), length);
975                                 
976                                         MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp);
977                                         delete buf;
978                                 }
979                         }                               
980                         delete candidateSeq;
981                         
982                         if((i+1) % 100 == 0){   cout << "Classifying sequence " << (i+1) << endl;       }
983                 }
984                 
985                 if(num % 100 != 0){     cout << "Classifying sequence " << (num) << endl;       }
986                 
987                 
988                 return 1;
989         }
990         catch(exception& e) {
991                 m->errorOut(e, "ClassifySeqsCommand", "driverMPI");
992                 exit(1);
993         }
994 }
995
996 //**********************************************************************************************************************
997 int ClassifySeqsCommand::MPIReadNamesFile(string nameFilename){
998         try {
999         
1000                 nameMap.clear(); //remove old names
1001                 
1002                 MPI_File inMPI;
1003                 MPI_Offset size;
1004                 MPI_Status status;
1005
1006                 //char* inFileName = new char[nameFilename.length()];
1007                 //memcpy(inFileName, nameFilename.c_str(), nameFilename.length());
1008                 
1009                 char inFileName[1024];
1010                 strcpy(inFileName, nameFilename.c_str());
1011
1012                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
1013                 MPI_File_get_size(inMPI, &size);
1014                 //delete inFileName;
1015
1016                 char* buffer = new char[size];
1017                 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
1018
1019                 string tempBuf = buffer;
1020                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
1021                 istringstream iss (tempBuf,istringstream::in);
1022                 delete buffer;
1023                 
1024                 string firstCol, secondCol;
1025                 while(!iss.eof()) {
1026                         iss >> firstCol >> secondCol; m->gobble(iss);
1027                         
1028                         vector<string> temp;
1029                         m->splitAtComma(secondCol, temp);
1030                         
1031                         nameMap[firstCol] = temp;  
1032                 }
1033         
1034                 MPI_File_close(&inMPI);
1035                 
1036                 return 1;
1037         }
1038         catch(exception& e) {
1039                 m->errorOut(e, "ClassifySeqsCommand", "MPIReadNamesFile");
1040                 exit(1);
1041         }
1042 }
1043 #endif
1044 /**************************************************************************************************/