]> git.donarmstrong.com Git - mothur.git/blob - classifyotucommand.cpp
pat's mods to seqerrorcommand
[mothur.git] / classifyotucommand.cpp
1 /*
2  *  classifyotucommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 6/1/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyotucommand.h"
11 #include "phylotree.h"
12 #include "phylosummary.h"
13 #include "sharedutilities.h"
14
15 //**********************************************************************************************************************
16 vector<string> ClassifyOtuCommand::setParameters(){     
17         try {
18                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(plist);
19                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none","constaxonomy",false,true,true); parameters.push_back(ptaxonomy);
20                 CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(preftaxonomy);
21         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname);
22         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount);
23                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
24         CommandParameter ppersample("persample", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ppersample);
25         CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
26                 CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "","",false,false); parameters.push_back(pbasis);
27                 CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "","",false,true); parameters.push_back(pcutoff);
28                 CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pprobs);
29                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
30                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
31                 
32                 vector<string> myArray;
33                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
34                 return myArray;
35         }
36         catch(exception& e) {
37                 m->errorOut(e, "ClassifyOtuCommand", "setParameters");
38                 exit(1);
39         }
40 }
41 //**********************************************************************************************************************
42 string ClassifyOtuCommand::getHelpString(){     
43         try {
44                 string helpString = "";
45                 helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, count, persample, cutoff, label, basis and probs.  The taxonomy and list parameters are required unless you have a valid current file.\n";
46                 helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. Providing it will keep the rankIDs in the summary file static.\n";
47                 helpString += "The name parameter allows you add a names file with your taxonomy file.\n";
48                 helpString += "The group parameter allows you provide a group file to use in creating the summary file breakdown.\n";
49                 helpString += "The count parameter allows you add a count file associated with your list file. When using the count parameter mothur assumes your list file contains only uniques.\n";
50         helpString += "The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n";
51                 helpString += "For example consider the following basis=sequence could give Clostridiales       3       105     16      43      46, where 105 is the total number of sequences whose otu classified to Clostridiales.\n";
52                 helpString += "16 is the number of sequences in the otus from groupA, 43 is the number of sequences in the otus from groupB, and 46 is the number of sequences in the otus from groupC.\n";
53                 helpString += "Now for basis=otu could give Clostridiales       3       7       6       1       2, where 7 is the number of otus that classified to Clostridiales.\n";
54                 helpString += "6 is the number of otus containing sequences from groupA, 1 is the number of otus containing sequences from groupB, and 2 is the number of otus containing sequences from groupC.\n";
55                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
56         helpString += "The persample parameter allows you to find a consensus taxonomy for each group. Default=f\n";
57                 helpString += "The default value for label is all labels in your inputfile.\n";
58                 helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy.  The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
59                 helpString += "The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n";
60                 helpString += "The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n";
61                 helpString += "Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n";
62                 helpString += "Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n";
63                 return helpString;
64         }
65         catch(exception& e) {
66                 m->errorOut(e, "ClassifyOtuCommand", "getHelpString");
67                 exit(1);
68         }
69 }
70 //**********************************************************************************************************************
71 string ClassifyOtuCommand::getOutputPattern(string type) {
72     try {
73         string pattern = "";
74         
75         if (type == "constaxonomy") {  pattern = "[filename],[distance],cons.taxonomy"; } 
76         else if (type == "taxsummary") {  pattern = "[filename],[distance],cons.tax.summary"; } 
77         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
78         
79         return pattern;
80     }
81     catch(exception& e) {
82         m->errorOut(e, "ClassifyOtuCommand", "getOutputPattern");
83         exit(1);
84     }
85 }
86 //**********************************************************************************************************************
87 ClassifyOtuCommand::ClassifyOtuCommand(){       
88         try {
89                 abort = true; calledHelp = true; 
90                 setParameters();
91                 vector<string> tempOutNames;
92                 outputTypes["constaxonomy"] = tempOutNames;
93                 outputTypes["taxsummary"] = tempOutNames;
94         }
95         catch(exception& e) {
96                 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
97                 exit(1);
98         }
99 }
100
101 //**********************************************************************************************************************
102 ClassifyOtuCommand::ClassifyOtuCommand(string option)  {
103         try{
104                 abort = false; calledHelp = false;   
105                 allLines = 1;
106                 labels.clear();
107                                 
108                 //allow user to run help
109                 if (option == "help") { 
110                         help(); abort = true; calledHelp = true;
111                 }else if(option == "citation") { citation(); abort = true; calledHelp = true;} 
112                 else {
113                         vector<string> myArray = setParameters();
114                         
115                         OptionParser parser(option);
116                         map<string, string> parameters = parser.getParameters();
117                         
118                         ValidParameters validParameter;
119                         map<string, string>::iterator it;
120                 
121                         //check to make sure all parameters are valid for command
122                         for (it = parameters.begin(); it != parameters.end(); it++) { 
123                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
124                         }
125                         
126                         //initialize outputTypes
127                         vector<string> tempOutNames;
128                         outputTypes["constaxonomy"] = tempOutNames;
129                         outputTypes["taxsummary"] = tempOutNames;
130                 
131                         //if the user changes the input directory command factory will send this info to us in the output parameter 
132                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
133                         if (inputDir == "not found"){   inputDir = "";          }
134                         else {
135                                 string path;
136                                 it = parameters.find("list");
137                                 //user has given a template file
138                                 if(it != parameters.end()){ 
139                                         path = m->hasPath(it->second);
140                                         //if the user has not given a path then, add inputdir. else leave path alone.
141                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
142                                 }
143                                 
144                                 it = parameters.find("name");
145                                 //user has given a template file
146                                 if(it != parameters.end()){ 
147                                         path = m->hasPath(it->second);
148                                         //if the user has not given a path then, add inputdir. else leave path alone.
149                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
150                                 }
151                                 
152                                 it = parameters.find("taxonomy");
153                                 //user has given a template file
154                                 if(it != parameters.end()){ 
155                                         path = m->hasPath(it->second);
156                                         //if the user has not given a path then, add inputdir. else leave path alone.
157                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
158                                 }
159                                 
160                                 it = parameters.find("reftaxonomy");
161                                 //user has given a template file
162                                 if(it != parameters.end()){ 
163                                         path = m->hasPath(it->second);
164                                         //if the user has not given a path then, add inputdir. else leave path alone.
165                                         if (path == "") {       parameters["reftaxonomy"] = inputDir + it->second;              }
166                                 }
167                                 
168                                 it = parameters.find("group");
169                                 //user has given a template file
170                                 if(it != parameters.end()){ 
171                                         path = m->hasPath(it->second);
172                                         //if the user has not given a path then, add inputdir. else leave path alone.
173                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
174                                 }
175                 
176                 it = parameters.find("count");
177                                 //user has given a template file
178                                 if(it != parameters.end()){ 
179                                         path = m->hasPath(it->second);
180                                         //if the user has not given a path then, add inputdir. else leave path alone.
181                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
182                                 }
183                         }
184
185                         
186                         //if the user changes the output directory command factory will send this info to us in the output parameter 
187                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
188                         
189                         //check for required parameters
190                         listfile = validParameter.validFile(parameters, "list", true);
191                         if (listfile == "not found") {                          
192                                 //if there is a current list file, use it
193                                 listfile = m->getListFile(); 
194                                 if (listfile != "") {  m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
195                                 else {  m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
196                         }
197                         else if (listfile == "not open") { abort = true; }      
198                         else { m->setListFile(listfile); }
199                         
200                         taxfile = validParameter.validFile(parameters, "taxonomy", true);
201                         if (taxfile == "not found") {  //if there is a current list file, use it
202                                 taxfile = m->getTaxonomyFile(); 
203                                 if (taxfile != "") {  m->mothurOut("Using " + taxfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
204                                 else {  m->mothurOut("You have no current taxonomy file and the taxonomy parameter is required."); m->mothurOutEndLine(); abort = true; }
205                         }
206                         else if (taxfile == "not open") { abort = true; }
207                         else { m->setTaxonomyFile(taxfile); }
208                         
209                         refTaxonomy = validParameter.validFile(parameters, "reftaxonomy", true);
210                         if (refTaxonomy == "not found") { refTaxonomy = ""; m->mothurOut("reftaxonomy is not required, but if given will keep the rankIDs in the summary file static."); m->mothurOutEndLine(); }
211                         else if (refTaxonomy == "not open") { abort = true; }
212         
213                         namefile = validParameter.validFile(parameters, "name", true);
214                         if (namefile == "not open") { namefile = ""; abort = true; }    
215                         else if (namefile == "not found") { namefile = ""; }
216                         else { m->setNameFile(namefile); }
217                         
218                         groupfile = validParameter.validFile(parameters, "group", true);
219                         if (groupfile == "not open") { abort = true; }  
220                         else if (groupfile == "not found") { groupfile = ""; }
221                         else { m->setGroupFile(groupfile); }
222             
223             countfile = validParameter.validFile(parameters, "count", true);
224                         if (countfile == "not open") { countfile = ""; abort = true; }
225                         else if (countfile == "not found") { countfile = "";  } 
226                         else { m->setCountTableFile(countfile); }
227             
228             if ((namefile != "") && (countfile != "")) {
229                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
230             }
231                         
232             if ((groupfile != "") && (countfile != "")) {
233                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
234             }
235
236                         
237                         //check for optional parameter and set defaults
238                         // ...at some point should added some additional type checking...
239                         label = validParameter.validFile(parameters, "label", false);                   
240                         if (label == "not found") { label = ""; allLines = 1;  }
241                         else { 
242                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
243                                 else { allLines = 1;  }
244                         }
245             
246                         basis = validParameter.validFile(parameters, "basis", false);
247                         if (basis == "not found") { basis = "otu"; }    
248                         
249                         if ((basis != "otu") && (basis != "sequence")) { m->mothurOut("Invalid option for basis. basis options are otu and sequence, using otu."); m->mothurOutEndLine(); }
250                         
251                         string temp = validParameter.validFile(parameters, "cutoff", false);                    if (temp == "not found") { temp = "51"; }
252                         m->mothurConvert(temp, cutoff); 
253                         
254                         temp = validParameter.validFile(parameters, "probs", false);                                    if (temp == "not found"){       temp = "true";                  }
255                         probs = m->isTrue(temp);
256             
257             temp = validParameter.validFile(parameters, "persample", false);            if (temp == "not found"){       temp = "f";             }
258                         persample = m->isTrue(temp);
259                         
260             if ((groupfile == "") && (countfile == "")) { if (persample) { m->mothurOut("persample is only valid with a group file, or count file with group information. Setting persample=f.\n"); persample = false; } 
261             }
262             if (countfile != "") {
263                 CountTable cts;
264                 if (!cts.testGroups(countfile)) { 
265                     if (persample) { m->mothurOut("persample is only valid with a group file, or count file with group information. Setting persample=f.\n"); persample = false; }
266                 }
267             }
268                         
269                         if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true;  }
270                         
271             if (countfile == "") {
272                 if (namefile == ""){
273                     vector<string> files; files.push_back(taxfile);
274                     parser.getNameFile(files);
275                 }
276             }
277                         
278                 }
279         }
280         catch(exception& e) {
281                 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
282                 exit(1);
283         }
284 }
285 //**********************************************************************************************************************
286
287 int ClassifyOtuCommand::execute(){
288         try {
289         
290                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
291                 
292                 //if user gave a namesfile then use it
293                 if (namefile != "")     {       m->readNames(namefile, nameMap, true);  }
294         if (groupfile != "")    {   groupMap = new GroupMap(groupfile);  groupMap->readMap();  groups = groupMap->getNamesOfGroups(); }
295         else { groupMap = NULL;  }
296         if (countfile != "") {  ct = new CountTable(); ct->readTable(countfile);  if (ct->hasGroupInfo()) { groups = ct->getNamesOfGroups(); } }
297         else {  ct = NULL;    }
298         
299                 //read taxonomy file and save in map for easy access in building bin trees
300                 m->readTax(taxfile, taxMap);
301                 
302                 if (m->control_pressed) { return 0; }
303                 
304                 input = new InputData(listfile, "list");
305                 list = input->getListVector();
306                 string lastLabel = list->getLabel();
307
308                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
309                 set<string> processedLabels;
310                 set<string> userLabels = labels;
311                 
312                 if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  }  return 0; }
313         
314                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
315                         
316                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
317                         
318                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
319                                         process(list);
320                                         if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; return 0; }
321                                                                                 
322                                         processedLabels.insert(list->getLabel());
323                                         userLabels.erase(list->getLabel());
324                         }
325                         
326                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
327                                         string saveLabel = list->getLabel();
328                                         
329                                         delete list;
330                                         list = input->getListVector(lastLabel);
331                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
332                                         process(list);
333                                 
334                                         
335                                         if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; }  if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } delete input; delete list; return 0; }
336                                                                                 
337                                         processedLabels.insert(list->getLabel());
338                                         userLabels.erase(list->getLabel());
339                                         
340                                         //restore real lastlabel to save below
341                                         list->setLabel(saveLabel);
342                         }
343                         
344                         lastLabel = list->getLabel();
345         
346                         delete list;
347                         list = input->getListVector();
348                 }
349                 
350                 //output error messages about any remaining user labels
351                 bool needToRun = false;
352                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
353                         m->mothurOut("Your file does not include the label " + (*it)); 
354                         if (processedLabels.count(lastLabel) != 1) {
355                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
356                                 needToRun = true;
357                         }else {
358                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
359                         }
360                 }
361                 
362                 //run last label if you need to
363                 if (needToRun == true)  {
364                         if (list != NULL) {     delete list;    }
365                         list = input->getListVector(lastLabel);
366                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
367                         
368                         process(list);
369                         delete list;
370                         
371                         if (m->control_pressed) { outputTypes.clear();  if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } delete input; delete list; return 0; }
372                 }
373                 
374                 delete input;  
375         if (groupMap != NULL) { delete groupMap; }
376         if (ct != NULL) { delete ct; }
377                                 
378                 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } return 0; }
379                 
380                 m->mothurOutEndLine();
381                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
382                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
383                 m->mothurOutEndLine();
384                 
385                 return 0;
386         }
387         catch(exception& e) {
388                 m->errorOut(e, "ClassifyOtuCommand", "execute");
389                 exit(1);
390         }
391 }
392 //**********************************************************************************************************************
393 vector<string> ClassifyOtuCommand::findConsensusTaxonomy(vector<string> names, int& size, string& conTax) {
394         try{
395                 conTax = "";
396                 vector<string> allNames;
397                 map<string, string>::iterator it;
398                 map<string, string>::iterator it2;
399
400                 //create a tree containing sequences from this bin
401                 PhyloTree* phylo = new PhyloTree();
402                 
403                 size = 0;
404                 for (int i = 0; i < names.size(); i++) {
405         
406                         //if namesfile include the names
407                         if (namefile != "") {
408         
409                                 //is this sequence in the name file - namemap maps seqName -> repSeqName
410                                 it2 = nameMap.find(names[i]);
411                                 
412                                 if (it2 == nameMap.end()) { //this name is not in name file, skip it
413                                         m->mothurOut(names[i] + " is not in your name file.  I will not include it in the consensus."); m->mothurOutEndLine();
414                                 }else{
415                                         
416                                         //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
417                                         it = taxMap.find(it2->second);
418                         
419                                         if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
420                                         
421                                                 if (names[i] != it2->second) { m->mothurOut(names[i] + " is represented by " +  it2->second + " and is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
422                                                 else {  m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
423                                         }else{
424                                 
425                                                 //add seq to tree
426                                                 phylo->addSeqToTree(names[i], it->second);
427                                                 size++;
428                                                 allNames.push_back(names[i]);
429                                         }
430                                 }
431                                 
432                         }else{
433                                 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
434                                 it = taxMap.find(names[i]);
435                 
436                                 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
437                                         m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
438                                 }else{
439                     if (countfile != "") {
440                         int numDups = ct->getNumSeqs(names[i]); 
441                         for (int j = 0; j < numDups; j++) {  phylo->addSeqToTree(names[i], it->second);  }
442                         size += numDups;
443                     }else{
444                                         //add seq to tree
445                         phylo->addSeqToTree(names[i], it->second);
446                         size++;  
447                     }
448                     allNames.push_back(names[i]);
449                                 }
450                         }
451
452                         
453                         if (m->control_pressed) { delete phylo; return allNames; }
454                         
455                 }
456                 
457                 //build tree
458                 phylo->assignHeirarchyIDs(0);
459                 
460                 TaxNode currentNode = phylo->get(0);
461                 int myLevel = 0;        
462                 //at each level
463                 while (currentNode.children.size() != 0) { //you still have more to explore
464                 
465                         TaxNode bestChild;
466                         int bestChildSize = 0;
467                         
468                         //go through children
469                         for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
470                                 
471                                 TaxNode temp = phylo->get(itChild->second);
472                                 
473                                 //select child with largest accesions - most seqs assigned to it
474                                 if (temp.accessions.size() > bestChildSize) {
475                                         bestChild = phylo->get(itChild->second);
476                                         bestChildSize = temp.accessions.size();
477                                 }
478                                 
479                         }
480             
481             //phylotree adds an extra unknown so we want to remove that
482             if (bestChild.name == "unknown") { bestChildSize--; }
483                                 
484                         //is this taxonomy above cutoff
485                         int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
486                         
487                         if (consensusConfidence >= cutoff) { //if yes, add it
488                                 if (probs) {
489                                         conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
490                                 }else{
491                                         conTax += bestChild.name + ";";
492                                 }
493                                 myLevel++;
494                         }else{ //if no, quit
495                                 break;
496                         }
497                         
498                         //move down a level
499                         currentNode = bestChild;
500                 }
501                 
502                 if (myLevel != phylo->getMaxLevel()) {
503                         while (myLevel != phylo->getMaxLevel()) {
504                                 conTax += "unclassified;";
505                                 myLevel++;
506                         }
507                 }               
508                 if (conTax == "") {  conTax = "no_consensus;";  }
509                 
510                 delete phylo;   
511                 
512                 return allNames;
513                         
514         }
515         catch(exception& e) {
516                 m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy");
517                 exit(1);
518         }
519 }
520
521 //**********************************************************************************************************************
522 int ClassifyOtuCommand::process(ListVector* processList) {
523         try{
524                 string conTax;
525                 int size;
526                 
527                 //create output file
528                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
529                                 
530                 ofstream out;
531         map<string, string> variables; 
532         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile));
533         variables["[distance]"] = processList->getLabel();
534                 string outputFile = getOutputFileName("constaxonomy", variables);
535                 m->openOutputFile(outputFile, out);
536                 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
537                 
538                 ofstream outSum;
539                 string outputSumFile = getOutputFileName("taxsummary", variables);
540                 m->openOutputFile(outputSumFile, outSum);
541                 outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
542                 
543                 out << "OTU\tSize\tTaxonomy" << endl;
544                 
545                 PhyloSummary* taxaSum;
546         if (countfile != "") {
547             if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, ct);  }
548             else {  taxaSum = new PhyloSummary(ct); }
549                 }else {
550             if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, groupMap);  }
551             else {  taxaSum = new PhyloSummary(groupMap); }
552         }
553         
554         vector<ofstream*> outSums;
555         vector<ofstream*> outs;
556         vector<PhyloSummary*> taxaSums;
557         map<string, int> groupIndex;
558         if (persample) {
559             for (int i = 0; i < groups.size(); i++) {
560                 groupIndex[groups[i]] = i;
561                 ofstream* temp = new ofstream();
562                 variables["[distance]"] = processList->getLabel() + "." + groups[i];
563                 string outputFile = getOutputFileName("constaxonomy", variables);
564                 m->openOutputFile(outputFile, *temp);
565                 (*temp) << "OTU\tSize\tTaxonomy" << endl;
566                 outs.push_back(temp);
567                 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
568                 
569                 ofstream* tempSum = new ofstream();
570                 string outputSumFile = getOutputFileName("taxsummary", variables);
571                 m->openOutputFile(outputSumFile, *tempSum);
572                 outSums.push_back(tempSum);
573                 outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
574                 
575                 PhyloSummary* taxaSumt;
576                 if (countfile != "") {
577                     if (refTaxonomy != "") { taxaSumt = new PhyloSummary(refTaxonomy, ct);  }
578                     else {  taxaSumt = new PhyloSummary(ct); }
579                 }else {
580                     if (refTaxonomy != "") { taxaSumt = new PhyloSummary(refTaxonomy, groupMap);  }
581                     else {  taxaSumt = new PhyloSummary(groupMap); }
582                 }
583                 taxaSums.push_back(taxaSumt);
584             }
585         }
586         
587                 //for each bin in the list vector
588         string snumBins = toString(processList->getNumBins());
589                 for (int i = 0; i < processList->getNumBins(); i++) {
590                         
591                         if (m->control_pressed) { break; }
592                         
593                         vector<string> names;
594             string binnames = processList->get(i);
595             vector<string> thisNames;
596             m->splitAtComma(binnames, thisNames);
597             
598                         names = findConsensusTaxonomy(thisNames, size, conTax);
599                 
600                         if (m->control_pressed) { break; }
601                         
602                         //output to new names file
603             string binLabel = "Otu";
604             string sbinNumber = toString(i+1);
605             if (sbinNumber.length() < snumBins.length()) { 
606                 int diff = snumBins.length() - sbinNumber.length();
607                 for (int h = 0; h < diff; h++) { binLabel += "0"; }
608             }
609             binLabel += sbinNumber;
610
611                         out << binLabel << '\t' << size << '\t' << conTax << endl;
612                         
613                         string noConfidenceConTax = conTax;
614                         m->removeConfidences(noConfidenceConTax);
615                         
616                         //add this bins taxonomy to summary
617                         if (basis == "sequence") {
618                                 for(int j = 0; j < names.size(); j++) {  
619                     int numReps = 1;
620                     if (countfile != "") {  numReps = ct->getNumSeqs(names[j]); }
621                     for(int k = 0; k < numReps; k++) {  taxaSum->addSeqToTree(names[j], noConfidenceConTax);  }
622                 }
623                         }else { //otu
624                 map<string, bool> containsGroup; 
625                 if (countfile != "") {
626                     if (ct->hasGroupInfo()) {
627                         vector<string> mGroups = ct->getNamesOfGroups();
628                         for (int k = 0; k < names.size(); k++) {
629                             vector<int> counts = ct->getGroupCounts(names[k]);
630                             for (int h = 0; h < counts.size(); h++) {  
631                                 if (counts[h] != 0) {  containsGroup[mGroups[h]] = true; }
632                             }
633                         }
634                     }
635                 }else {
636                     if (groupfile != "") {
637                         vector<string> mGroups = groupMap->getNamesOfGroups();
638                         for (int j = 0; j < mGroups.size(); j++) { containsGroup[mGroups[j]] = false; }
639                         
640                         for (int k = 0; k < names.size(); k++) {
641                             //find out the sequences group
642                             string group = groupMap->getGroup(names[k]);
643                             
644                             if (group == "not found") {  m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
645                             else {
646                                 containsGroup[group] = true;
647                             }
648                         }
649                     }
650                 }
651                                 taxaSum->addSeqToTree(noConfidenceConTax, containsGroup);
652                         }
653             
654             
655             if (persample) {
656                 //divide names by group
657                 map<string, vector<string> > parsedNames;
658                 map<string, vector<string> >::iterator itParsed;
659                 
660                 //parse names by group
661                 for (int j = 0; j < names.size(); j++) {
662                     if (groupfile != "") { 
663                         string group = groupMap->getGroup(names[j]); 
664                         itParsed = parsedNames.find(group);
665                         
666                         if (itParsed != parsedNames.end()) { itParsed->second.push_back(names[j]); }
667                         else { vector<string> tempNames; tempNames.push_back(names[j]); parsedNames[group] = tempNames; }
668                     }else { //count file was used
669                         vector<string> thisSeqsGroups = ct->getGroups(names[j]);
670                         for (int k = 0; k < thisSeqsGroups.size(); k++) {
671                             string group = thisSeqsGroups[k]; 
672                             itParsed = parsedNames.find(group);
673                             
674                             if (itParsed != parsedNames.end()) { itParsed->second.push_back(names[j]); }
675                             else { vector<string> tempNames; tempNames.push_back(names[j]); parsedNames[group] = tempNames; }
676                         }
677                     }
678                 }
679                 
680                 for (itParsed = parsedNames.begin(); itParsed != parsedNames.end(); itParsed++) {
681                     vector<string> theseNames = findConsensusTaxonomy(itParsed->second, size, conTax);
682                     
683                     if (m->control_pressed) { break; }
684                     
685                     //output to new names file
686                     string binLabel = "Otu";
687                     string sbinNumber = toString(i+1);
688                     if (sbinNumber.length() < snumBins.length()) { 
689                         int diff = snumBins.length() - sbinNumber.length();
690                         for (int h = 0; h < diff; h++) { binLabel += "0"; }
691                     }
692                     binLabel += sbinNumber;
693                     
694                     (*outs[groupIndex[itParsed->first]]) << binLabel << '\t' << size << '\t' << conTax << endl;
695                     
696                     string noConfidenceConTax = conTax;
697                     m->removeConfidences(noConfidenceConTax);
698                     
699                     //add this bins taxonomy to summary
700                     if (basis == "sequence") {
701                         for(int j = 0; j < theseNames.size(); j++) {  
702                             int numReps = 1;
703                             if (countfile != "") {  numReps = ct->getGroupCount(theseNames[j], itParsed->first); } //get num seqs for this seq from this group
704                             for(int k = 0; k < numReps; k++) {  (taxaSums[groupIndex[itParsed->first]])->addSeqToTree(theseNames[j], noConfidenceConTax);  }
705                         }
706                     }else { //otu
707                         map<string, bool> containsGroup; 
708                         containsGroup[itParsed->first] = true;
709                         (taxaSums[groupIndex[itParsed->first]])->addSeqToTree(noConfidenceConTax, containsGroup);
710                     }
711                 }
712             }
713                 }
714
715                 out.close();
716                 
717                 //print summary file
718                 taxaSum->print(outSum);
719                 outSum.close();
720         
721         if (persample) {
722             for (int i = 0; i < groups.size(); i++) {
723                 (*outs[i]).close();
724                 taxaSums[i]->print(*outSums[i]);
725                 (*outSums[i]).close();
726                 delete outs[i];
727                 delete outSums[i];
728                 delete taxaSums[i];
729             }
730         }
731                 
732                 delete taxaSum;
733                 
734                 return 0;
735
736         }
737         catch(exception& e) {
738                 m->errorOut(e, "ClassifyOtuCommand", "process");
739                 exit(1);
740         }
741 }
742 /**************************************************************************************************/
743 string ClassifyOtuCommand::addUnclassifieds(string tax, int maxlevel) {
744         try{
745                 string newTax, taxon;
746                 int level = 0;
747                 
748                 //keep what you have counting the levels
749                 while (tax.find_first_of(';') != -1) {
750                         //get taxon
751                         taxon = tax.substr(0,tax.find_first_of(';'))+';';
752                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
753                         newTax += taxon;
754                         level++;
755                 }
756                 
757                 //add "unclassified" until you reach maxLevel
758                 while (level < maxlevel) {
759                         newTax += "unclassified;";
760                         level++;
761                 }
762                 
763                 return newTax;
764         }
765         catch(exception& e) {
766                 m->errorOut(e, "ClassifyOtuCommand", "addUnclassifieds");
767                 exit(1);
768         }
769 }
770 //**********************************************************************************************************************
771
772