]> git.donarmstrong.com Git - mothur.git/blob - classifyotucommand.cpp
changing command name classify.shared to classifyrf.shared
[mothur.git] / classifyotucommand.cpp
1 /*
2  *  classifyotucommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 6/1/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyotucommand.h"
11 #include "phylotree.h"
12 #include "phylosummary.h"
13 #include "sharedutilities.h"
14
15 //**********************************************************************************************************************
16 vector<string> ClassifyOtuCommand::setParameters(){     
17         try {
18                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(plist);
19                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none","constaxonomy",false,true,true); parameters.push_back(ptaxonomy);
20                 CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(preftaxonomy);
21         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname);
22         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount);
23                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
24         CommandParameter ppersample("persample", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ppersample);
25         CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
26                 CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "","",false,false); parameters.push_back(pbasis);
27                 CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "","",false,true); parameters.push_back(pcutoff);
28                 CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pprobs);
29                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
30                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
31                 
32                 vector<string> myArray;
33                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
34                 return myArray;
35         }
36         catch(exception& e) {
37                 m->errorOut(e, "ClassifyOtuCommand", "setParameters");
38                 exit(1);
39         }
40 }
41 //**********************************************************************************************************************
42 string ClassifyOtuCommand::getHelpString(){     
43         try {
44                 string helpString = "";
45                 helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, count, persample, cutoff, label, basis and probs.  The taxonomy and list parameters are required unless you have a valid current file.\n";
46                 helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. Providing it will keep the rankIDs in the summary file static.\n";
47                 helpString += "The name parameter allows you add a names file with your taxonomy file.\n";
48                 helpString += "The group parameter allows you provide a group file to use in creating the summary file breakdown.\n";
49                 helpString += "The count parameter allows you add a count file associated with your list file. When using the count parameter mothur assumes your list file contains only uniques.\n";
50         helpString += "The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n";
51                 helpString += "For example consider the following basis=sequence could give Clostridiales       3       105     16      43      46, where 105 is the total number of sequences whose otu classified to Clostridiales.\n";
52                 helpString += "16 is the number of sequences in the otus from groupA, 43 is the number of sequences in the otus from groupB, and 46 is the number of sequences in the otus from groupC.\n";
53                 helpString += "Now for basis=otu could give Clostridiales       3       7       6       1       2, where 7 is the number of otus that classified to Clostridiales.\n";
54                 helpString += "6 is the number of otus containing sequences from groupA, 1 is the number of otus containing sequences from groupB, and 2 is the number of otus containing sequences from groupC.\n";
55                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
56         helpString += "The persample parameter allows you to find a consensus taxonomy for each group. Default=f\n";
57                 helpString += "The default value for label is all labels in your inputfile.\n";
58                 helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy.  The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
59                 helpString += "The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n";
60                 helpString += "The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n";
61                 helpString += "Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n";
62                 helpString += "Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n";
63                 return helpString;
64         }
65         catch(exception& e) {
66                 m->errorOut(e, "ClassifyOtuCommand", "getHelpString");
67                 exit(1);
68         }
69 }
70 //**********************************************************************************************************************
71 string ClassifyOtuCommand::getOutputPattern(string type) {
72     try {
73         string pattern = "";
74         
75         if (type == "constaxonomy") {  pattern = "[filename],[distance],cons.taxonomy"; } 
76         else if (type == "taxsummary") {  pattern = "[filename],[distance],cons.tax.summary"; } 
77         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
78         
79         return pattern;
80     }
81     catch(exception& e) {
82         m->errorOut(e, "ClassifyOtuCommand", "getOutputPattern");
83         exit(1);
84     }
85 }
86 //**********************************************************************************************************************
87 ClassifyOtuCommand::ClassifyOtuCommand(){       
88         try {
89                 abort = true; calledHelp = true; 
90                 setParameters();
91                 vector<string> tempOutNames;
92                 outputTypes["constaxonomy"] = tempOutNames;
93                 outputTypes["taxsummary"] = tempOutNames;
94         }
95         catch(exception& e) {
96                 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
97                 exit(1);
98         }
99 }
100
101 //**********************************************************************************************************************
102 ClassifyOtuCommand::ClassifyOtuCommand(string option)  {
103         try{
104                 abort = false; calledHelp = false;   
105                 allLines = 1;
106                 labels.clear();
107                                 
108                 //allow user to run help
109                 if (option == "help") { 
110                         help(); abort = true; calledHelp = true;
111                 }else if(option == "citation") { citation(); abort = true; calledHelp = true;} 
112                 else {
113                         vector<string> myArray = setParameters();
114                         
115                         OptionParser parser(option);
116                         map<string, string> parameters = parser.getParameters();
117                         
118                         ValidParameters validParameter;
119                         map<string, string>::iterator it;
120                 
121                         //check to make sure all parameters are valid for command
122                         for (it = parameters.begin(); it != parameters.end(); it++) { 
123                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
124                         }
125                         
126                         //initialize outputTypes
127                         vector<string> tempOutNames;
128                         outputTypes["constaxonomy"] = tempOutNames;
129                         outputTypes["taxsummary"] = tempOutNames;
130                 
131                         //if the user changes the input directory command factory will send this info to us in the output parameter 
132                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
133                         if (inputDir == "not found"){   inputDir = "";          }
134                         else {
135                                 string path;
136                                 it = parameters.find("list");
137                                 //user has given a template file
138                                 if(it != parameters.end()){ 
139                                         path = m->hasPath(it->second);
140                                         //if the user has not given a path then, add inputdir. else leave path alone.
141                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
142                                 }
143                                 
144                                 it = parameters.find("name");
145                                 //user has given a template file
146                                 if(it != parameters.end()){ 
147                                         path = m->hasPath(it->second);
148                                         //if the user has not given a path then, add inputdir. else leave path alone.
149                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
150                                 }
151                                 
152                                 it = parameters.find("taxonomy");
153                                 //user has given a template file
154                                 if(it != parameters.end()){ 
155                                         path = m->hasPath(it->second);
156                                         //if the user has not given a path then, add inputdir. else leave path alone.
157                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
158                                 }
159                                 
160                                 it = parameters.find("reftaxonomy");
161                                 //user has given a template file
162                                 if(it != parameters.end()){ 
163                                         path = m->hasPath(it->second);
164                                         //if the user has not given a path then, add inputdir. else leave path alone.
165                                         if (path == "") {       parameters["reftaxonomy"] = inputDir + it->second;              }
166                                 }
167                                 
168                                 it = parameters.find("group");
169                                 //user has given a template file
170                                 if(it != parameters.end()){ 
171                                         path = m->hasPath(it->second);
172                                         //if the user has not given a path then, add inputdir. else leave path alone.
173                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
174                                 }
175                 
176                 it = parameters.find("count");
177                                 //user has given a template file
178                                 if(it != parameters.end()){ 
179                                         path = m->hasPath(it->second);
180                                         //if the user has not given a path then, add inputdir. else leave path alone.
181                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
182                                 }
183                         }
184
185                         
186                         //if the user changes the output directory command factory will send this info to us in the output parameter 
187                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
188                         
189                         //check for required parameters
190                         listfile = validParameter.validFile(parameters, "list", true);
191                         if (listfile == "not found") {                          
192                                 //if there is a current list file, use it
193                                 listfile = m->getListFile(); 
194                                 if (listfile != "") {  m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
195                                 else {  m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
196                         }
197                         else if (listfile == "not open") { abort = true; }      
198                         else { m->setListFile(listfile); }
199                         
200                         taxfile = validParameter.validFile(parameters, "taxonomy", true);
201                         if (taxfile == "not found") {  //if there is a current list file, use it
202                                 taxfile = m->getTaxonomyFile(); 
203                                 if (taxfile != "") {  m->mothurOut("Using " + taxfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
204                                 else {  m->mothurOut("You have no current taxonomy file and the taxonomy parameter is required."); m->mothurOutEndLine(); abort = true; }
205                         }
206                         else if (taxfile == "not open") { abort = true; }
207                         else { m->setTaxonomyFile(taxfile); }
208                         
209                         refTaxonomy = validParameter.validFile(parameters, "reftaxonomy", true);
210                         if (refTaxonomy == "not found") { refTaxonomy = ""; m->mothurOut("reftaxonomy is not required, but if given will keep the rankIDs in the summary file static."); m->mothurOutEndLine(); }
211                         else if (refTaxonomy == "not open") { abort = true; }
212         
213                         namefile = validParameter.validFile(parameters, "name", true);
214                         if (namefile == "not open") { namefile = ""; abort = true; }    
215                         else if (namefile == "not found") { namefile = ""; }
216                         else { m->setNameFile(namefile); }
217                         
218                         groupfile = validParameter.validFile(parameters, "group", true);
219                         if (groupfile == "not open") { abort = true; }  
220                         else if (groupfile == "not found") { groupfile = ""; }
221                         else { m->setGroupFile(groupfile); }
222             
223             countfile = validParameter.validFile(parameters, "count", true);
224                         if (countfile == "not open") { countfile = ""; abort = true; }
225                         else if (countfile == "not found") { countfile = "";  } 
226                         else { m->setCountTableFile(countfile); }
227             
228             if ((namefile != "") && (countfile != "")) {
229                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
230             }
231                         
232             if ((groupfile != "") && (countfile != "")) {
233                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
234             }
235
236                         
237                         //check for optional parameter and set defaults
238                         // ...at some point should added some additional type checking...
239                         label = validParameter.validFile(parameters, "label", false);                   
240                         if (label == "not found") { label = ""; allLines = 1;  }
241                         else { 
242                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
243                                 else { allLines = 1;  }
244                         }
245             
246                         basis = validParameter.validFile(parameters, "basis", false);
247                         if (basis == "not found") { basis = "otu"; }    
248                         
249                         if ((basis != "otu") && (basis != "sequence")) { m->mothurOut("Invalid option for basis. basis options are otu and sequence, using otu."); m->mothurOutEndLine(); }
250                         
251                         string temp = validParameter.validFile(parameters, "cutoff", false);                    if (temp == "not found") { temp = "51"; }
252                         m->mothurConvert(temp, cutoff); 
253                         
254                         temp = validParameter.validFile(parameters, "probs", false);                                    if (temp == "not found"){       temp = "true";                  }
255                         probs = m->isTrue(temp);
256             
257             temp = validParameter.validFile(parameters, "persample", false);            if (temp == "not found"){       temp = "f";             }
258                         persample = m->isTrue(temp);
259                         
260             if ((groupfile == "") && (countfile == "")) { if (persample) { m->mothurOut("persample is only valid with a group file, or count file with group information. Setting persample=f.\n"); persample = false; } 
261             }
262             if (countfile != "") {
263                 CountTable cts;
264                 if (!cts.testGroups(countfile)) { 
265                     if (persample) { m->mothurOut("persample is only valid with a group file, or count file with group information. Setting persample=f.\n"); persample = false; }
266                 }
267             }
268                         
269                         if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true;  }
270                         
271             if (countfile == "") {
272                 if (namefile == ""){
273                     vector<string> files; files.push_back(taxfile);
274                     parser.getNameFile(files);
275                 }
276             }
277                         
278                 }
279         }
280         catch(exception& e) {
281                 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
282                 exit(1);
283         }
284 }
285 //**********************************************************************************************************************
286
287 int ClassifyOtuCommand::execute(){
288         try {
289         
290                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
291                 
292                 //if user gave a namesfile then use it
293                 if (namefile != "")     {       m->readNames(namefile, nameMap, true);  }
294         if (groupfile != "")    {   groupMap = new GroupMap(groupfile);  groupMap->readMap();  groups = groupMap->getNamesOfGroups(); }
295         else { groupMap = NULL;  }
296         if (countfile != "") {  ct = new CountTable(); ct->readTable(countfile, true);  if (ct->hasGroupInfo()) { groups = ct->getNamesOfGroups(); } }
297         else {  ct = NULL;    }
298         
299                 //read taxonomy file and save in map for easy access in building bin trees
300                 m->readTax(taxfile, taxMap);
301                 
302                 if (m->control_pressed) { return 0; }
303                 
304                 input = new InputData(listfile, "list");
305                 list = input->getListVector();
306                 string lastLabel = list->getLabel();
307
308                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
309                 set<string> processedLabels;
310                 set<string> userLabels = labels;
311                 
312                 if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  }  return 0; }
313         
314                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
315                         
316                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
317                         
318                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
319                                         process(list);
320                                         if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; return 0; }
321                                                                                 
322                                         processedLabels.insert(list->getLabel());
323                                         userLabels.erase(list->getLabel());
324                         }
325                         
326                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
327                                         string saveLabel = list->getLabel();
328                                         
329                                         delete list;
330                                         list = input->getListVector(lastLabel);
331                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
332                                         process(list);
333                                 
334                                         
335                                         if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; }  if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } delete input; delete list; return 0; }
336                                                                                 
337                                         processedLabels.insert(list->getLabel());
338                                         userLabels.erase(list->getLabel());
339                                         
340                                         //restore real lastlabel to save below
341                                         list->setLabel(saveLabel);
342                         }
343                         
344                         lastLabel = list->getLabel();
345         
346                         delete list;
347                         list = input->getListVector();
348                 }
349                 
350                 //output error messages about any remaining user labels
351                 bool needToRun = false;
352                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
353                         m->mothurOut("Your file does not include the label " + (*it)); 
354                         if (processedLabels.count(lastLabel) != 1) {
355                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
356                                 needToRun = true;
357                         }else {
358                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
359                         }
360                 }
361                 
362                 //run last label if you need to
363                 if (needToRun == true)  {
364                         if (list != NULL) {     delete list;    }
365                         list = input->getListVector(lastLabel);
366                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
367                         
368                         process(list);
369                         delete list;
370                         
371                         if (m->control_pressed) { outputTypes.clear();  if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } delete input; delete list; return 0; }
372                 }
373                 
374                 delete input;  
375         if (groupMap != NULL) { delete groupMap; }
376         if (ct != NULL) { delete ct; }
377                                 
378                 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } return 0; }
379                 
380                 m->mothurOutEndLine();
381                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
382                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
383                 m->mothurOutEndLine();
384                 
385                 return 0;
386         }
387         catch(exception& e) {
388                 m->errorOut(e, "ClassifyOtuCommand", "execute");
389                 exit(1);
390         }
391 }
392 //**********************************************************************************************************************
393 vector<string> ClassifyOtuCommand::findConsensusTaxonomy(vector<string> names, int& size, string& conTax) {
394         try{
395                 conTax = "";
396                 vector<string> allNames;
397                 map<string, string>::iterator it;
398                 map<string, string>::iterator it2;
399
400                 //create a tree containing sequences from this bin
401                 PhyloTree* phylo = new PhyloTree();
402                 
403                 size = 0;
404                 for (int i = 0; i < names.size(); i++) {
405         
406                         //if namesfile include the names
407                         if (namefile != "") {
408         
409                                 //is this sequence in the name file - namemap maps seqName -> repSeqName
410                                 it2 = nameMap.find(names[i]);
411                                 
412                                 if (it2 == nameMap.end()) { //this name is not in name file, skip it
413                                         m->mothurOut(names[i] + " is not in your name file.  I will not include it in the consensus."); m->mothurOutEndLine();
414                                 }else{
415                                         
416                                         //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
417                                         it = taxMap.find(it2->second);
418                         
419                                         if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
420                                         
421                                                 if (names[i] != it2->second) { m->mothurOut(names[i] + " is represented by " +  it2->second + " and is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
422                                                 else {  m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
423                                         }else{
424                                 
425                                                 //add seq to tree
426                                                 phylo->addSeqToTree(names[i], it->second);
427                                                 size++;
428                                                 allNames.push_back(names[i]);
429                                         }
430                                 }
431                                 
432                         }else{
433                                 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
434                                 it = taxMap.find(names[i]);
435                 
436                                 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
437                                         m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
438                                 }else{
439                     if (countfile != "") {
440                         int numDups = ct->getNumSeqs(names[i]); 
441                         for (int j = 0; j < numDups; j++) {  phylo->addSeqToTree(names[i], it->second);  }
442                         size += numDups;
443                     }else{
444                                         //add seq to tree
445                         phylo->addSeqToTree(names[i], it->second);
446                         size++;  
447                     }
448                     allNames.push_back(names[i]);
449                                 }
450                         }
451
452                         
453                         if (m->control_pressed) { delete phylo; return allNames; }
454                         
455                 }
456                 
457                 //build tree
458                 phylo->assignHeirarchyIDs(0);
459                 
460                 TaxNode currentNode = phylo->get(0);
461                 int myLevel = 0;        
462                 //at each level
463                 while (currentNode.children.size() != 0) { //you still have more to explore
464                 
465                         TaxNode bestChild;
466                         int bestChildSize = 0;
467                         
468                         //go through children
469                         for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
470                                 
471                                 TaxNode temp = phylo->get(itChild->second);
472                                 
473                                 //select child with largest accesions - most seqs assigned to it
474                                 if (temp.accessions.size() > bestChildSize) {
475                                         bestChild = phylo->get(itChild->second);
476                                         bestChildSize = temp.accessions.size();
477                                 }
478                                 
479                         }
480             
481             //phylotree adds an extra unknown so we want to remove that
482             if (bestChild.name == "unknown") { bestChildSize--; }
483                                 
484                         //is this taxonomy above cutoff
485                         int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
486                         
487                         if (consensusConfidence >= cutoff) { //if yes, add it
488                                 if (probs) {
489                                         conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
490                                 }else{
491                                         conTax += bestChild.name + ";";
492                                 }
493                                 myLevel++;
494                         }else{ //if no, quit
495                                 break;
496                         }
497                         
498                         //move down a level
499                         currentNode = bestChild;
500                 }
501                 
502                 if (myLevel != phylo->getMaxLevel()) {
503                         while (myLevel != phylo->getMaxLevel()) {
504                                 conTax += "unclassified;";
505                                 myLevel++;
506                         }
507                 }               
508                 if (conTax == "") {  conTax = "no_consensus;";  }
509                 
510                 delete phylo;   
511                 
512                 return allNames;
513                         
514         }
515         catch(exception& e) {
516                 m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy");
517                 exit(1);
518         }
519 }
520
521 //**********************************************************************************************************************
522 int ClassifyOtuCommand::process(ListVector* processList) {
523         try{
524                 string conTax;
525                 int size;
526                 
527                 //create output file
528                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
529                                 
530                 ofstream out;
531         map<string, string> variables; 
532         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile));
533         variables["[distance]"] = processList->getLabel();
534                 string outputFile = getOutputFileName("constaxonomy", variables);
535                 m->openOutputFile(outputFile, out);
536                 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
537                 
538                 ofstream outSum;
539                 string outputSumFile = getOutputFileName("taxsummary", variables);
540                 m->openOutputFile(outputSumFile, outSum);
541                 outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
542                 
543                 out << "OTU\tSize\tTaxonomy" << endl;
544                 
545                 PhyloSummary* taxaSum;
546         if (countfile != "") {
547             if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, ct);  }
548             else {  taxaSum = new PhyloSummary(ct); }
549                 }else {
550             if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, groupMap);  }
551             else {  taxaSum = new PhyloSummary(groupMap); }
552         }
553         
554         vector<ofstream*> outSums;
555         vector<ofstream*> outs;
556         vector<PhyloSummary*> taxaSums;
557         map<string, int> groupIndex;
558         if (persample) {
559             for (int i = 0; i < groups.size(); i++) {
560                 groupIndex[groups[i]] = i;
561                 ofstream* temp = new ofstream();
562                 variables["[distance]"] = processList->getLabel() + "." + groups[i];
563                 string outputFile = getOutputFileName("constaxonomy", variables);
564                 m->openOutputFile(outputFile, *temp);
565                 (*temp) << "OTU\tSize\tTaxonomy" << endl;
566                 outs.push_back(temp);
567                 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
568                 
569                 ofstream* tempSum = new ofstream();
570                 string outputSumFile = getOutputFileName("taxsummary", variables);
571                 m->openOutputFile(outputSumFile, *tempSum);
572                 outSums.push_back(tempSum);
573                 outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
574                 
575                 PhyloSummary* taxaSumt;
576                 if (countfile != "") {
577                     if (refTaxonomy != "") { taxaSumt = new PhyloSummary(refTaxonomy, ct);  }
578                     else {  taxaSumt = new PhyloSummary(ct); }
579                 }else {
580                     if (refTaxonomy != "") { taxaSumt = new PhyloSummary(refTaxonomy, groupMap);  }
581                     else {  taxaSumt = new PhyloSummary(groupMap); }
582                 }
583                 taxaSums.push_back(taxaSumt);
584             }
585         }
586         
587                 //for each bin in the list vector
588         string snumBins = toString(processList->getNumBins());
589                 for (int i = 0; i < processList->getNumBins(); i++) {
590                         
591                         if (m->control_pressed) { break; }
592                         
593                         vector<string> names;
594             string binnames = processList->get(i);
595             vector<string> thisNames;
596             m->splitAtComma(binnames, thisNames);
597             
598                         names = findConsensusTaxonomy(thisNames, size, conTax);
599                 
600                         if (m->control_pressed) { break; }
601                         
602                         //output to new names file
603             string binLabel = "Otu";
604             string sbinNumber = toString(i+1);
605             if (sbinNumber.length() < snumBins.length()) { 
606                 int diff = snumBins.length() - sbinNumber.length();
607                 for (int h = 0; h < diff; h++) { binLabel += "0"; }
608             }
609             binLabel += sbinNumber;
610
611                         out << binLabel << '\t' << size << '\t' << conTax << endl;
612                         
613                         string noConfidenceConTax = conTax;
614                         m->removeConfidences(noConfidenceConTax);
615                         
616                         //add this bins taxonomy to summary
617                         if (basis == "sequence") {
618                                 for(int j = 0; j < names.size(); j++) {  
619                     //int numReps = 1;
620                     //if (countfile != "") {  numReps = ct->getNumSeqs(names[j]); }
621                     //for(int k = 0; k < numReps; k++) {  taxaSum->addSeqToTree(names[j], noConfidenceConTax);  }
622                     taxaSum->addSeqToTree(names[j], noConfidenceConTax);
623                 }
624                         }else { //otu
625                 map<string, bool> containsGroup; 
626                 if (countfile != "") {
627                     if (ct->hasGroupInfo()) {
628                         vector<string> mGroups = ct->getNamesOfGroups();
629                         for (int k = 0; k < names.size(); k++) {
630                             vector<int> counts = ct->getGroupCounts(names[k]);
631                             for (int h = 0; h < counts.size(); h++) {  
632                                 if (counts[h] != 0) {  containsGroup[mGroups[h]] = true; }
633                             }
634                         }
635                     }
636                 }else {
637                     if (groupfile != "") {
638                         vector<string> mGroups = groupMap->getNamesOfGroups();
639                         for (int j = 0; j < mGroups.size(); j++) { containsGroup[mGroups[j]] = false; }
640                         
641                         for (int k = 0; k < names.size(); k++) {
642                             //find out the sequences group
643                             string group = groupMap->getGroup(names[k]);
644                             
645                             if (group == "not found") {  m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
646                             else {
647                                 containsGroup[group] = true;
648                             }
649                         }
650                     }
651                 }
652                                 taxaSum->addSeqToTree(noConfidenceConTax, containsGroup);
653                         }
654             
655             
656             if (persample) {
657                 //divide names by group
658                 map<string, vector<string> > parsedNames;
659                 map<string, vector<string> >::iterator itParsed;
660                 
661                 //parse names by group
662                 for (int j = 0; j < names.size(); j++) {
663                     if (groupfile != "") { 
664                         string group = groupMap->getGroup(names[j]); 
665                         itParsed = parsedNames.find(group);
666                         
667                         if (itParsed != parsedNames.end()) { itParsed->second.push_back(names[j]); }
668                         else { vector<string> tempNames; tempNames.push_back(names[j]); parsedNames[group] = tempNames; }
669                     }else { //count file was used
670                         vector<string> thisSeqsGroups = ct->getGroups(names[j]);
671                         for (int k = 0; k < thisSeqsGroups.size(); k++) {
672                             string group = thisSeqsGroups[k]; 
673                             itParsed = parsedNames.find(group);
674                             
675                             if (itParsed != parsedNames.end()) { itParsed->second.push_back(names[j]); }
676                             else { vector<string> tempNames; tempNames.push_back(names[j]); parsedNames[group] = tempNames; }
677                         }
678                     }
679                 }
680                 
681                 for (itParsed = parsedNames.begin(); itParsed != parsedNames.end(); itParsed++) {
682                     vector<string> theseNames = findConsensusTaxonomy(itParsed->second, size, conTax);
683                     
684                     if (m->control_pressed) { break; }
685                     
686                     //output to new names file
687                     string binLabel = "Otu";
688                     string sbinNumber = toString(i+1);
689                     if (sbinNumber.length() < snumBins.length()) { 
690                         int diff = snumBins.length() - sbinNumber.length();
691                         for (int h = 0; h < diff; h++) { binLabel += "0"; }
692                     }
693                     binLabel += sbinNumber;
694                     
695                     (*outs[groupIndex[itParsed->first]]) << binLabel << '\t' << size << '\t' << conTax << endl;
696                     
697                     string noConfidenceConTax = conTax;
698                     m->removeConfidences(noConfidenceConTax);
699                     
700                     //add this bins taxonomy to summary
701                     if (basis == "sequence") {
702                         for(int j = 0; j < theseNames.size(); j++) {  
703                             int numReps = 1;
704                             if (countfile != "") {  numReps = ct->getGroupCount(theseNames[j], itParsed->first); } //get num seqs for this seq from this group
705                             for(int k = 0; k < numReps; k++) {  (taxaSums[groupIndex[itParsed->first]])->addSeqToTree(theseNames[j], noConfidenceConTax);  }
706                         }
707                     }else { //otu
708                         map<string, bool> containsGroup; 
709                         containsGroup[itParsed->first] = true;
710                         (taxaSums[groupIndex[itParsed->first]])->addSeqToTree(noConfidenceConTax, containsGroup);
711                     }
712                 }
713             }
714                 }
715
716                 out.close();
717                 
718                 //print summary file
719                 taxaSum->print(outSum);
720                 outSum.close();
721         
722         if (persample) {
723             for (int i = 0; i < groups.size(); i++) {
724                 (*outs[i]).close();
725                 taxaSums[i]->print(*outSums[i]);
726                 (*outSums[i]).close();
727                 delete outs[i];
728                 delete outSums[i];
729                 delete taxaSums[i];
730             }
731         }
732                 
733                 delete taxaSum;
734                 
735                 return 0;
736
737         }
738         catch(exception& e) {
739                 m->errorOut(e, "ClassifyOtuCommand", "process");
740                 exit(1);
741         }
742 }
743 /**************************************************************************************************/
744 string ClassifyOtuCommand::addUnclassifieds(string tax, int maxlevel) {
745         try{
746                 string newTax, taxon;
747                 int level = 0;
748                 
749                 //keep what you have counting the levels
750                 while (tax.find_first_of(';') != -1) {
751                         //get taxon
752                         taxon = tax.substr(0,tax.find_first_of(';'))+';';
753                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
754                         newTax += taxon;
755                         level++;
756                 }
757                 
758                 //add "unclassified" until you reach maxLevel
759                 while (level < maxlevel) {
760                         newTax += "unclassified;";
761                         level++;
762                 }
763                 
764                 return newTax;
765         }
766         catch(exception& e) {
767                 m->errorOut(e, "ClassifyOtuCommand", "addUnclassifieds");
768                 exit(1);
769         }
770 }
771 //**********************************************************************************************************************
772
773