]> git.donarmstrong.com Git - mothur.git/blob - classifyotucommand.cpp
Merge remote-tracking branch 'origin/master'
[mothur.git] / classifyotucommand.cpp
1 /*
2  *  classifyotucommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 6/1/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyotucommand.h"
11 #include "phylotree.h"
12 #include "phylosummary.h"
13 #include "sharedutilities.h"
14
15 //**********************************************************************************************************************
16 vector<string> ClassifyOtuCommand::setParameters(){     
17         try {
18                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(plist);
19                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none","constaxonomy",false,true,true); parameters.push_back(ptaxonomy);
20                 CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(preftaxonomy);
21         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname);
22         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount);
23                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
24         CommandParameter ppersample("persample", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ppersample);
25         CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
26                 CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "","",false,false); parameters.push_back(pbasis);
27                 CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "","",false,true); parameters.push_back(pcutoff);
28                 CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pprobs);
29                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
30                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
31                 
32                 vector<string> myArray;
33                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
34                 return myArray;
35         }
36         catch(exception& e) {
37                 m->errorOut(e, "ClassifyOtuCommand", "setParameters");
38                 exit(1);
39         }
40 }
41 //**********************************************************************************************************************
42 string ClassifyOtuCommand::getHelpString(){     
43         try {
44                 string helpString = "";
45                 helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, count, cutoff, label, basis and probs.  The taxonomy and list parameters are required unless you have a valid current file.\n";
46                 helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. Providing it will keep the rankIDs in the summary file static.\n";
47                 helpString += "The name parameter allows you add a names file with your taxonomy file.\n";
48                 helpString += "The group parameter allows you provide a group file to use in creating the summary file breakdown.\n";
49                 helpString += "The count parameter allows you add a count file associated with your list file. When using the count parameter mothur assumes your list file contains only uniques.\n";
50         helpString += "The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n";
51                 helpString += "For example consider the following basis=sequence could give Clostridiales       3       105     16      43      46, where 105 is the total number of sequences whose otu classified to Clostridiales.\n";
52                 helpString += "16 is the number of sequences in the otus from groupA, 43 is the number of sequences in the otus from groupB, and 46 is the number of sequences in the otus from groupC.\n";
53                 helpString += "Now for basis=otu could give Clostridiales       3       7       6       1       2, where 7 is the number of otus that classified to Clostridiales.\n";
54                 helpString += "6 is the number of otus containing sequences from groupA, 1 is the number of otus containing sequences from groupB, and 2 is the number of otus containing sequences from groupC.\n";
55                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
56                 helpString += "The default value for label is all labels in your inputfile.\n";
57                 helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy.  The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
58                 helpString += "The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n";
59                 helpString += "The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n";
60                 helpString += "Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n";
61                 helpString += "Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n";
62                 return helpString;
63         }
64         catch(exception& e) {
65                 m->errorOut(e, "ClassifyOtuCommand", "getHelpString");
66                 exit(1);
67         }
68 }
69 //**********************************************************************************************************************
70 string ClassifyOtuCommand::getOutputPattern(string type) {
71     try {
72         string pattern = "";
73         
74         if (type == "constaxonomy") {  pattern = "[filename],[distance],cons.taxonomy"; } 
75         else if (type == "taxsummary") {  pattern = "[filename],[distance],cons.tax.summary"; } 
76         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
77         
78         return pattern;
79     }
80     catch(exception& e) {
81         m->errorOut(e, "ClassifyOtuCommand", "getOutputPattern");
82         exit(1);
83     }
84 }
85 //**********************************************************************************************************************
86 ClassifyOtuCommand::ClassifyOtuCommand(){       
87         try {
88                 abort = true; calledHelp = true; 
89                 setParameters();
90                 vector<string> tempOutNames;
91                 outputTypes["constaxonomy"] = tempOutNames;
92                 outputTypes["taxsummary"] = tempOutNames;
93         }
94         catch(exception& e) {
95                 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
96                 exit(1);
97         }
98 }
99
100 //**********************************************************************************************************************
101 ClassifyOtuCommand::ClassifyOtuCommand(string option)  {
102         try{
103                 abort = false; calledHelp = false;   
104                 allLines = 1;
105                 labels.clear();
106                                 
107                 //allow user to run help
108                 if (option == "help") { 
109                         help(); abort = true; calledHelp = true;
110                 }else if(option == "citation") { citation(); abort = true; calledHelp = true;} 
111                 else {
112                         vector<string> myArray = setParameters();
113                         
114                         OptionParser parser(option);
115                         map<string, string> parameters = parser.getParameters();
116                         
117                         ValidParameters validParameter;
118                         map<string, string>::iterator it;
119                 
120                         //check to make sure all parameters are valid for command
121                         for (it = parameters.begin(); it != parameters.end(); it++) { 
122                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
123                         }
124                         
125                         //initialize outputTypes
126                         vector<string> tempOutNames;
127                         outputTypes["constaxonomy"] = tempOutNames;
128                         outputTypes["taxsummary"] = tempOutNames;
129                 
130                         //if the user changes the input directory command factory will send this info to us in the output parameter 
131                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
132                         if (inputDir == "not found"){   inputDir = "";          }
133                         else {
134                                 string path;
135                                 it = parameters.find("list");
136                                 //user has given a template file
137                                 if(it != parameters.end()){ 
138                                         path = m->hasPath(it->second);
139                                         //if the user has not given a path then, add inputdir. else leave path alone.
140                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
141                                 }
142                                 
143                                 it = parameters.find("name");
144                                 //user has given a template file
145                                 if(it != parameters.end()){ 
146                                         path = m->hasPath(it->second);
147                                         //if the user has not given a path then, add inputdir. else leave path alone.
148                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
149                                 }
150                                 
151                                 it = parameters.find("taxonomy");
152                                 //user has given a template file
153                                 if(it != parameters.end()){ 
154                                         path = m->hasPath(it->second);
155                                         //if the user has not given a path then, add inputdir. else leave path alone.
156                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
157                                 }
158                                 
159                                 it = parameters.find("reftaxonomy");
160                                 //user has given a template file
161                                 if(it != parameters.end()){ 
162                                         path = m->hasPath(it->second);
163                                         //if the user has not given a path then, add inputdir. else leave path alone.
164                                         if (path == "") {       parameters["reftaxonomy"] = inputDir + it->second;              }
165                                 }
166                                 
167                                 it = parameters.find("group");
168                                 //user has given a template file
169                                 if(it != parameters.end()){ 
170                                         path = m->hasPath(it->second);
171                                         //if the user has not given a path then, add inputdir. else leave path alone.
172                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
173                                 }
174                 
175                 it = parameters.find("count");
176                                 //user has given a template file
177                                 if(it != parameters.end()){ 
178                                         path = m->hasPath(it->second);
179                                         //if the user has not given a path then, add inputdir. else leave path alone.
180                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
181                                 }
182                         }
183
184                         
185                         //if the user changes the output directory command factory will send this info to us in the output parameter 
186                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
187                         
188                         //check for required parameters
189                         listfile = validParameter.validFile(parameters, "list", true);
190                         if (listfile == "not found") {                          
191                                 //if there is a current list file, use it
192                                 listfile = m->getListFile(); 
193                                 if (listfile != "") {  m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
194                                 else {  m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
195                         }
196                         else if (listfile == "not open") { abort = true; }      
197                         else { m->setListFile(listfile); }
198                         
199                         taxfile = validParameter.validFile(parameters, "taxonomy", true);
200                         if (taxfile == "not found") {  //if there is a current list file, use it
201                                 taxfile = m->getTaxonomyFile(); 
202                                 if (taxfile != "") {  m->mothurOut("Using " + taxfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
203                                 else {  m->mothurOut("You have no current taxonomy file and the taxonomy parameter is required."); m->mothurOutEndLine(); abort = true; }
204                         }
205                         else if (taxfile == "not open") { abort = true; }
206                         else { m->setTaxonomyFile(taxfile); }
207                         
208                         refTaxonomy = validParameter.validFile(parameters, "reftaxonomy", true);
209                         if (refTaxonomy == "not found") { refTaxonomy = ""; m->mothurOut("reftaxonomy is not required, but if given will keep the rankIDs in the summary file static."); m->mothurOutEndLine(); }
210                         else if (refTaxonomy == "not open") { abort = true; }
211         
212                         namefile = validParameter.validFile(parameters, "name", true);
213                         if (namefile == "not open") { namefile = ""; abort = true; }    
214                         else if (namefile == "not found") { namefile = ""; }
215                         else { m->setNameFile(namefile); }
216                         
217                         groupfile = validParameter.validFile(parameters, "group", true);
218                         if (groupfile == "not open") { abort = true; }  
219                         else if (groupfile == "not found") { groupfile = ""; }
220                         else { m->setGroupFile(groupfile); }
221             
222             countfile = validParameter.validFile(parameters, "count", true);
223                         if (countfile == "not open") { countfile = ""; abort = true; }
224                         else if (countfile == "not found") { countfile = "";  } 
225                         else { m->setCountTableFile(countfile); }
226             
227             if ((namefile != "") && (countfile != "")) {
228                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
229             }
230                         
231             if ((groupfile != "") && (countfile != "")) {
232                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
233             }
234
235                         
236                         //check for optional parameter and set defaults
237                         // ...at some point should added some additional type checking...
238                         label = validParameter.validFile(parameters, "label", false);                   
239                         if (label == "not found") { label = ""; allLines = 1;  }
240                         else { 
241                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
242                                 else { allLines = 1;  }
243                         }
244             
245                         basis = validParameter.validFile(parameters, "basis", false);
246                         if (basis == "not found") { basis = "otu"; }    
247                         
248                         if ((basis != "otu") && (basis != "sequence")) { m->mothurOut("Invalid option for basis. basis options are otu and sequence, using otu."); m->mothurOutEndLine(); }
249                         
250                         string temp = validParameter.validFile(parameters, "cutoff", false);                    if (temp == "not found") { temp = "51"; }
251                         m->mothurConvert(temp, cutoff); 
252                         
253                         temp = validParameter.validFile(parameters, "probs", false);                                    if (temp == "not found"){       temp = "true";                  }
254                         probs = m->isTrue(temp);
255             
256             temp = validParameter.validFile(parameters, "persample", false);            if (temp == "not found"){       temp = "f";             }
257                         persample = m->isTrue(temp);
258                         
259             if ((groupfile == "") && (countfile == "")) { if (persample) { m->mothurOut("persample is only valid with a group file, or count file with group information. Setting persample=f.\n"); persample = false; } 
260             }
261             if (countfile != "") {
262                 CountTable cts;
263                 if (!cts.testGroups(countfile)) { 
264                     if (persample) { m->mothurOut("persample is only valid with a group file, or count file with group information. Setting persample=f.\n"); persample = false; }
265                 }
266             }
267                         
268                         if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true;  }
269                         
270             if (countfile == "") {
271                 if (namefile == ""){
272                     vector<string> files; files.push_back(taxfile);
273                     parser.getNameFile(files);
274                 }
275             }
276                         
277                 }
278         }
279         catch(exception& e) {
280                 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
281                 exit(1);
282         }
283 }
284 //**********************************************************************************************************************
285
286 int ClassifyOtuCommand::execute(){
287         try {
288         
289                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
290                 
291                 //if user gave a namesfile then use it
292                 if (namefile != "")     {       m->readNames(namefile, nameMap, true);  }
293         if (groupfile != "")    {   groupMap = new GroupMap(groupfile);  groupMap->readMap();  groups = groupMap->getNamesOfGroups(); }
294         else { groupMap = NULL;  }
295         if (countfile != "") {  ct = new CountTable(); ct->readTable(countfile);  if (ct->hasGroupInfo()) { groups = ct->getNamesOfGroups(); } }
296         else {  ct = NULL;    }
297         
298                 //read taxonomy file and save in map for easy access in building bin trees
299                 m->readTax(taxfile, taxMap);
300                 
301                 if (m->control_pressed) { return 0; }
302                 
303                 input = new InputData(listfile, "list");
304                 list = input->getListVector();
305                 string lastLabel = list->getLabel();
306
307                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
308                 set<string> processedLabels;
309                 set<string> userLabels = labels;
310                 
311                 if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  }  return 0; }
312         
313                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
314                         
315                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
316                         
317                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
318                                         process(list);
319                                         if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; return 0; }
320                                                                                 
321                                         processedLabels.insert(list->getLabel());
322                                         userLabels.erase(list->getLabel());
323                         }
324                         
325                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
326                                         string saveLabel = list->getLabel();
327                                         
328                                         delete list;
329                                         list = input->getListVector(lastLabel);
330                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
331                                         process(list);
332                                 
333                                         
334                                         if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; }  if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } delete input; delete list; return 0; }
335                                                                                 
336                                         processedLabels.insert(list->getLabel());
337                                         userLabels.erase(list->getLabel());
338                                         
339                                         //restore real lastlabel to save below
340                                         list->setLabel(saveLabel);
341                         }
342                         
343                         lastLabel = list->getLabel();
344         
345                         delete list;
346                         list = input->getListVector();
347                 }
348                 
349                 //output error messages about any remaining user labels
350                 bool needToRun = false;
351                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
352                         m->mothurOut("Your file does not include the label " + (*it)); 
353                         if (processedLabels.count(lastLabel) != 1) {
354                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
355                                 needToRun = true;
356                         }else {
357                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
358                         }
359                 }
360                 
361                 //run last label if you need to
362                 if (needToRun == true)  {
363                         if (list != NULL) {     delete list;    }
364                         list = input->getListVector(lastLabel);
365                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
366                         
367                         process(list);
368                         delete list;
369                         
370                         if (m->control_pressed) { outputTypes.clear();  if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } delete input; delete list; return 0; }
371                 }
372                 
373                 delete input;  
374         if (groupMap != NULL) { delete groupMap; }
375         if (ct != NULL) { delete ct; }
376                                 
377                 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } return 0; }
378                 
379                 m->mothurOutEndLine();
380                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
381                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
382                 m->mothurOutEndLine();
383                 
384                 return 0;
385         }
386         catch(exception& e) {
387                 m->errorOut(e, "ClassifyOtuCommand", "execute");
388                 exit(1);
389         }
390 }
391 //**********************************************************************************************************************
392 vector<string> ClassifyOtuCommand::findConsensusTaxonomy(vector<string> names, int& size, string& conTax) {
393         try{
394                 conTax = "";
395                 vector<string> allNames;
396                 map<string, string>::iterator it;
397                 map<string, string>::iterator it2;
398
399                 //create a tree containing sequences from this bin
400                 PhyloTree* phylo = new PhyloTree();
401                 
402                 size = 0;
403                 for (int i = 0; i < names.size(); i++) {
404         
405                         //if namesfile include the names
406                         if (namefile != "") {
407         
408                                 //is this sequence in the name file - namemap maps seqName -> repSeqName
409                                 it2 = nameMap.find(names[i]);
410                                 
411                                 if (it2 == nameMap.end()) { //this name is not in name file, skip it
412                                         m->mothurOut(names[i] + " is not in your name file.  I will not include it in the consensus."); m->mothurOutEndLine();
413                                 }else{
414                                         
415                                         //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
416                                         it = taxMap.find(it2->second);
417                         
418                                         if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
419                                         
420                                                 if (names[i] != it2->second) { m->mothurOut(names[i] + " is represented by " +  it2->second + " and is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
421                                                 else {  m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
422                                         }else{
423                                 
424                                                 //add seq to tree
425                                                 phylo->addSeqToTree(names[i], it->second);
426                                                 size++;
427                                                 allNames.push_back(names[i]);
428                                         }
429                                 }
430                                 
431                         }else{
432                                 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
433                                 it = taxMap.find(names[i]);
434                 
435                                 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
436                                         m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
437                                 }else{
438                     if (countfile != "") {
439                         int numDups = ct->getNumSeqs(names[i]); 
440                         for (int j = 0; j < numDups; j++) {  phylo->addSeqToTree(names[i], it->second);  }
441                         size += numDups;
442                     }else{
443                                         //add seq to tree
444                         phylo->addSeqToTree(names[i], it->second);
445                         size++;  
446                     }
447                     allNames.push_back(names[i]);
448                                 }
449                         }
450
451                         
452                         if (m->control_pressed) { delete phylo; return allNames; }
453                         
454                 }
455                 
456                 //build tree
457                 phylo->assignHeirarchyIDs(0);
458                 
459                 TaxNode currentNode = phylo->get(0);
460                 int myLevel = 0;        
461                 //at each level
462                 while (currentNode.children.size() != 0) { //you still have more to explore
463                 
464                         TaxNode bestChild;
465                         int bestChildSize = 0;
466                         
467                         //go through children
468                         for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
469                                 
470                                 TaxNode temp = phylo->get(itChild->second);
471                                 
472                                 //select child with largest accesions - most seqs assigned to it
473                                 if (temp.accessions.size() > bestChildSize) {
474                                         bestChild = phylo->get(itChild->second);
475                                         bestChildSize = temp.accessions.size();
476                                 }
477                                 
478                         }
479             
480             //phylotree adds an extra unknown so we want to remove that
481             if (bestChild.name == "unknown") { bestChildSize--; }
482                                 
483                         //is this taxonomy above cutoff
484                         int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
485                         
486                         if (consensusConfidence >= cutoff) { //if yes, add it
487                                 if (probs) {
488                                         conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
489                                 }else{
490                                         conTax += bestChild.name + ";";
491                                 }
492                                 myLevel++;
493                         }else{ //if no, quit
494                                 break;
495                         }
496                         
497                         //move down a level
498                         currentNode = bestChild;
499                 }
500                 
501                 if (myLevel != phylo->getMaxLevel()) {
502                         while (myLevel != phylo->getMaxLevel()) {
503                                 conTax += "unclassified;";
504                                 myLevel++;
505                         }
506                 }               
507                 if (conTax == "") {  conTax = "no_consensus;";  }
508                 
509                 delete phylo;   
510                 
511                 return allNames;
512                         
513         }
514         catch(exception& e) {
515                 m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy");
516                 exit(1);
517         }
518 }
519
520 //**********************************************************************************************************************
521 int ClassifyOtuCommand::process(ListVector* processList) {
522         try{
523                 string conTax;
524                 int size;
525                 
526                 //create output file
527                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
528                                 
529                 ofstream out;
530         map<string, string> variables; 
531         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile));
532         variables["[distance]"] = processList->getLabel();
533                 string outputFile = getOutputFileName("constaxonomy", variables);
534                 m->openOutputFile(outputFile, out);
535                 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
536                 
537                 ofstream outSum;
538                 string outputSumFile = getOutputFileName("taxsummary", variables);
539                 m->openOutputFile(outputSumFile, outSum);
540                 outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
541                 
542                 out << "OTU\tSize\tTaxonomy" << endl;
543                 
544                 PhyloSummary* taxaSum;
545         if (countfile != "") {
546             if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, ct);  }
547             else {  taxaSum = new PhyloSummary(ct); }
548                 }else {
549             if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, groupMap);  }
550             else {  taxaSum = new PhyloSummary(groupMap); }
551         }
552         
553         vector<ofstream*> outSums;
554         vector<ofstream*> outs;
555         vector<PhyloSummary*> taxaSums;
556         map<string, int> groupIndex;
557         if (persample) {
558             for (int i = 0; i < groups.size(); i++) {
559                 groupIndex[groups[i]] = i;
560                 ofstream* temp = new ofstream();
561                 variables["[distance]"] = processList->getLabel() + "." + groups[i];
562                 string outputFile = getOutputFileName("constaxonomy", variables);
563                 m->openOutputFile(outputFile, *temp);
564                 (*temp) << "OTU\tSize\tTaxonomy" << endl;
565                 outs.push_back(temp);
566                 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
567                 
568                 ofstream* tempSum = new ofstream();
569                 string outputSumFile = getOutputFileName("taxsummary", variables);
570                 m->openOutputFile(outputSumFile, *tempSum);
571                 outSums.push_back(tempSum);
572                 outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
573                 
574                 PhyloSummary* taxaSumt;
575                 if (countfile != "") {
576                     if (refTaxonomy != "") { taxaSumt = new PhyloSummary(refTaxonomy, ct);  }
577                     else {  taxaSumt = new PhyloSummary(ct); }
578                 }else {
579                     if (refTaxonomy != "") { taxaSumt = new PhyloSummary(refTaxonomy, groupMap);  }
580                     else {  taxaSumt = new PhyloSummary(groupMap); }
581                 }
582                 taxaSums.push_back(taxaSumt);
583             }
584         }
585         
586                 //for each bin in the list vector
587         string snumBins = toString(processList->getNumBins());
588                 for (int i = 0; i < processList->getNumBins(); i++) {
589                         
590                         if (m->control_pressed) { break; }
591                         
592                         vector<string> names;
593             string binnames = processList->get(i);
594             vector<string> thisNames;
595             m->splitAtComma(binnames, thisNames);
596             
597                         names = findConsensusTaxonomy(thisNames, size, conTax);
598                 
599                         if (m->control_pressed) { break; }
600                         
601                         //output to new names file
602             string binLabel = "Otu";
603             string sbinNumber = toString(i+1);
604             if (sbinNumber.length() < snumBins.length()) { 
605                 int diff = snumBins.length() - sbinNumber.length();
606                 for (int h = 0; h < diff; h++) { binLabel += "0"; }
607             }
608             binLabel += sbinNumber;
609
610                         out << binLabel << '\t' << size << '\t' << conTax << endl;
611                         
612                         string noConfidenceConTax = conTax;
613                         m->removeConfidences(noConfidenceConTax);
614                         
615                         //add this bins taxonomy to summary
616                         if (basis == "sequence") {
617                                 for(int j = 0; j < names.size(); j++) {  
618                     int numReps = 1;
619                     if (countfile != "") {  numReps = ct->getNumSeqs(names[j]); }
620                     for(int k = 0; k < numReps; k++) {  taxaSum->addSeqToTree(names[j], noConfidenceConTax);  }
621                 }
622                         }else { //otu
623                 map<string, bool> containsGroup; 
624                 if (countfile != "") {
625                     if (ct->hasGroupInfo()) {
626                         vector<string> mGroups = ct->getNamesOfGroups();
627                         for (int k = 0; k < names.size(); k++) {
628                             vector<int> counts = ct->getGroupCounts(names[k]);
629                             for (int h = 0; h < counts.size(); h++) {  
630                                 if (counts[h] != 0) {  containsGroup[mGroups[h]] = true; }
631                             }
632                         }
633                     }
634                 }else {
635                     if (groupfile != "") {
636                         vector<string> mGroups = groupMap->getNamesOfGroups();
637                         for (int j = 0; j < mGroups.size(); j++) { containsGroup[mGroups[j]] = false; }
638                         
639                         for (int k = 0; k < names.size(); k++) {
640                             //find out the sequences group
641                             string group = groupMap->getGroup(names[k]);
642                             
643                             if (group == "not found") {  m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
644                             else {
645                                 containsGroup[group] = true;
646                             }
647                         }
648                     }
649                 }
650                                 taxaSum->addSeqToTree(noConfidenceConTax, containsGroup);
651                         }
652             
653             
654             if (persample) {
655                 //divide names by group
656                 map<string, vector<string> > parsedNames;
657                 map<string, vector<string> >::iterator itParsed;
658                 
659                 //parse names by group
660                 for (int j = 0; j < names.size(); j++) {
661                     if (groupfile != "") { 
662                         string group = groupMap->getGroup(names[j]); 
663                         itParsed = parsedNames.find(group);
664                         
665                         if (itParsed != parsedNames.end()) { itParsed->second.push_back(names[j]); }
666                         else { vector<string> tempNames; tempNames.push_back(names[j]); parsedNames[group] = tempNames; }
667                     }else { //count file was used
668                         vector<string> thisSeqsGroups = ct->getGroups(names[j]);
669                         for (int k = 0; k < thisSeqsGroups.size(); k++) {
670                             string group = thisSeqsGroups[k]; 
671                             itParsed = parsedNames.find(group);
672                             
673                             if (itParsed != parsedNames.end()) { itParsed->second.push_back(names[j]); }
674                             else { vector<string> tempNames; tempNames.push_back(names[j]); parsedNames[group] = tempNames; }
675                         }
676                     }
677                 }
678                 
679                 for (itParsed = parsedNames.begin(); itParsed != parsedNames.end(); itParsed++) {
680                     vector<string> theseNames = findConsensusTaxonomy(itParsed->second, size, conTax);
681                     
682                     if (m->control_pressed) { break; }
683                     
684                     //output to new names file
685                     string binLabel = "Otu";
686                     string sbinNumber = toString(i+1);
687                     if (sbinNumber.length() < snumBins.length()) { 
688                         int diff = snumBins.length() - sbinNumber.length();
689                         for (int h = 0; h < diff; h++) { binLabel += "0"; }
690                     }
691                     binLabel += sbinNumber;
692                     
693                     (*outs[groupIndex[itParsed->first]]) << binLabel << '\t' << size << '\t' << conTax << endl;
694                     
695                     string noConfidenceConTax = conTax;
696                     m->removeConfidences(noConfidenceConTax);
697                     
698                     //add this bins taxonomy to summary
699                     if (basis == "sequence") {
700                         for(int j = 0; j < theseNames.size(); j++) {  
701                             int numReps = 1;
702                             if (countfile != "") {  numReps = ct->getGroupCount(theseNames[j], itParsed->first); } //get num seqs for this seq from this group
703                             for(int k = 0; k < numReps; k++) {  (taxaSums[groupIndex[itParsed->first]])->addSeqToTree(theseNames[j], noConfidenceConTax);  }
704                         }
705                     }else { //otu
706                         map<string, bool> containsGroup; 
707                         containsGroup[itParsed->first] = true;
708                         (taxaSums[groupIndex[itParsed->first]])->addSeqToTree(noConfidenceConTax, containsGroup);
709                     }
710                 }
711             }
712                 }
713
714                 out.close();
715                 
716                 //print summary file
717                 taxaSum->print(outSum);
718                 outSum.close();
719         
720         if (persample) {
721             for (int i = 0; i < groups.size(); i++) {
722                 (*outs[i]).close();
723                 taxaSums[i]->print(*outSums[i]);
724                 (*outSums[i]).close();
725                 delete outs[i];
726                 delete outSums[i];
727                 delete taxaSums[i];
728             }
729         }
730                 
731                 delete taxaSum;
732                 
733                 return 0;
734
735         }
736         catch(exception& e) {
737                 m->errorOut(e, "ClassifyOtuCommand", "process");
738                 exit(1);
739         }
740 }
741 /**************************************************************************************************/
742 string ClassifyOtuCommand::addUnclassifieds(string tax, int maxlevel) {
743         try{
744                 string newTax, taxon;
745                 int level = 0;
746                 
747                 //keep what you have counting the levels
748                 while (tax.find_first_of(';') != -1) {
749                         //get taxon
750                         taxon = tax.substr(0,tax.find_first_of(';'))+';';
751                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
752                         newTax += taxon;
753                         level++;
754                 }
755                 
756                 //add "unclassified" until you reach maxLevel
757                 while (level < maxlevel) {
758                         newTax += "unclassified;";
759                         level++;
760                 }
761                 
762                 return newTax;
763         }
764         catch(exception& e) {
765                 m->errorOut(e, "ClassifyOtuCommand", "addUnclassifieds");
766                 exit(1);
767         }
768 }
769 //**********************************************************************************************************************
770
771