2 * classifyotucommand.cpp
5 * Created by westcott on 6/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "classifyotucommand.h"
11 #include "phylotree.h"
12 #include "phylosummary.h"
14 //**********************************************************************************************************************
15 vector<string> ClassifyOtuCommand::setParameters(){
17 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
18 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy);
19 CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(preftaxonomy);
20 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
21 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
22 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
23 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
24 CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "",false,false); parameters.push_back(pbasis);
25 CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff);
26 CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs);
27 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
28 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
30 vector<string> myArray;
31 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
35 m->errorOut(e, "ClassifyOtuCommand", "setParameters");
39 //**********************************************************************************************************************
40 string ClassifyOtuCommand::getHelpString(){
42 string helpString = "";
43 helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, count, cutoff, label, basis and probs. The taxonomy and list parameters are required unless you have a valid current file.\n";
44 helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. Providing it will keep the rankIDs in the summary file static.\n";
45 helpString += "The name parameter allows you add a names file with your taxonomy file.\n";
46 helpString += "The group parameter allows you provide a group file to use in creating the summary file breakdown.\n";
47 helpString += "The count parameter allows you add a count file associated with your list file. When using the count parameter mothur assumes your list file contains only uniques.\n";
48 helpString += "The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n";
49 helpString += "For example consider the following basis=sequence could give Clostridiales 3 105 16 43 46, where 105 is the total number of sequences whose otu classified to Clostridiales.\n";
50 helpString += "16 is the number of sequences in the otus from groupA, 43 is the number of sequences in the otus from groupB, and 46 is the number of sequences in the otus from groupC.\n";
51 helpString += "Now for basis=otu could give Clostridiales 3 7 6 1 2, where 7 is the number of otus that classified to Clostridiales.\n";
52 helpString += "6 is the number of otus containing sequences from groupA, 1 is the number of otus containing sequences from groupB, and 2 is the number of otus containing sequences from groupC.\n";
53 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
54 helpString += "The default value for label is all labels in your inputfile.\n";
55 helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy. The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
56 helpString += "The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n";
57 helpString += "The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n";
58 helpString += "Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n";
59 helpString += "Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n";
63 m->errorOut(e, "ClassifyOtuCommand", "getHelpString");
67 //**********************************************************************************************************************
68 string ClassifyOtuCommand::getOutputFileNameTag(string type, string inputName=""){
70 string outputFileName = "";
71 map<string, vector<string> >::iterator it;
73 //is this a type this command creates
74 it = outputTypes.find(type);
75 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
77 if (type == "constaxonomy") { outputFileName = "cons.taxonomy"; }
78 else if (type == "taxsummary") { outputFileName = "cons.tax.summary"; }
79 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
81 return outputFileName;
84 m->errorOut(e, "ClassifyOtuCommand", "getOutputFileNameTag");
88 //**********************************************************************************************************************
89 ClassifyOtuCommand::ClassifyOtuCommand(){
91 abort = true; calledHelp = true;
93 vector<string> tempOutNames;
94 outputTypes["constaxonomy"] = tempOutNames;
95 outputTypes["taxsummary"] = tempOutNames;
98 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
103 //**********************************************************************************************************************
104 ClassifyOtuCommand::ClassifyOtuCommand(string option) {
106 abort = false; calledHelp = false;
110 //allow user to run help
111 if (option == "help") {
112 help(); abort = true; calledHelp = true;
113 }else if(option == "citation") { citation(); abort = true; calledHelp = true;}
115 vector<string> myArray = setParameters();
117 OptionParser parser(option);
118 map<string, string> parameters = parser.getParameters();
120 ValidParameters validParameter;
121 map<string, string>::iterator it;
123 //check to make sure all parameters are valid for command
124 for (it = parameters.begin(); it != parameters.end(); it++) {
125 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
128 //initialize outputTypes
129 vector<string> tempOutNames;
130 outputTypes["constaxonomy"] = tempOutNames;
131 outputTypes["taxsummary"] = tempOutNames;
133 //if the user changes the input directory command factory will send this info to us in the output parameter
134 string inputDir = validParameter.validFile(parameters, "inputdir", false);
135 if (inputDir == "not found"){ inputDir = ""; }
138 it = parameters.find("list");
139 //user has given a template file
140 if(it != parameters.end()){
141 path = m->hasPath(it->second);
142 //if the user has not given a path then, add inputdir. else leave path alone.
143 if (path == "") { parameters["list"] = inputDir + it->second; }
146 it = parameters.find("name");
147 //user has given a template file
148 if(it != parameters.end()){
149 path = m->hasPath(it->second);
150 //if the user has not given a path then, add inputdir. else leave path alone.
151 if (path == "") { parameters["name"] = inputDir + it->second; }
154 it = parameters.find("taxonomy");
155 //user has given a template file
156 if(it != parameters.end()){
157 path = m->hasPath(it->second);
158 //if the user has not given a path then, add inputdir. else leave path alone.
159 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
162 it = parameters.find("reftaxonomy");
163 //user has given a template file
164 if(it != parameters.end()){
165 path = m->hasPath(it->second);
166 //if the user has not given a path then, add inputdir. else leave path alone.
167 if (path == "") { parameters["reftaxonomy"] = inputDir + it->second; }
170 it = parameters.find("group");
171 //user has given a template file
172 if(it != parameters.end()){
173 path = m->hasPath(it->second);
174 //if the user has not given a path then, add inputdir. else leave path alone.
175 if (path == "") { parameters["group"] = inputDir + it->second; }
178 it = parameters.find("count");
179 //user has given a template file
180 if(it != parameters.end()){
181 path = m->hasPath(it->second);
182 //if the user has not given a path then, add inputdir. else leave path alone.
183 if (path == "") { parameters["count"] = inputDir + it->second; }
188 //if the user changes the output directory command factory will send this info to us in the output parameter
189 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
191 //check for required parameters
192 listfile = validParameter.validFile(parameters, "list", true);
193 if (listfile == "not found") {
194 //if there is a current list file, use it
195 listfile = m->getListFile();
196 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
197 else { m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
199 else if (listfile == "not open") { abort = true; }
200 else { m->setListFile(listfile); }
202 taxfile = validParameter.validFile(parameters, "taxonomy", true);
203 if (taxfile == "not found") { //if there is a current list file, use it
204 taxfile = m->getTaxonomyFile();
205 if (taxfile != "") { m->mothurOut("Using " + taxfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
206 else { m->mothurOut("You have no current taxonomy file and the taxonomy parameter is required."); m->mothurOutEndLine(); abort = true; }
208 else if (taxfile == "not open") { abort = true; }
209 else { m->setTaxonomyFile(taxfile); }
211 refTaxonomy = validParameter.validFile(parameters, "reftaxonomy", true);
212 if (refTaxonomy == "not found") { refTaxonomy = ""; m->mothurOut("reftaxonomy is not required, but if given will keep the rankIDs in the summary file static."); m->mothurOutEndLine(); }
213 else if (refTaxonomy == "not open") { abort = true; }
215 namefile = validParameter.validFile(parameters, "name", true);
216 if (namefile == "not open") { namefile = ""; abort = true; }
217 else if (namefile == "not found") { namefile = ""; }
218 else { m->setNameFile(namefile); }
220 groupfile = validParameter.validFile(parameters, "group", true);
221 if (groupfile == "not open") { abort = true; }
222 else if (groupfile == "not found") { groupfile = ""; }
223 else { m->setGroupFile(groupfile); }
225 countfile = validParameter.validFile(parameters, "count", true);
226 if (countfile == "not open") { countfile = ""; abort = true; }
227 else if (countfile == "not found") { countfile = ""; }
228 else { m->setCountTableFile(countfile); }
230 if ((namefile != "") && (countfile != "")) {
231 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
234 if ((groupfile != "") && (countfile != "")) {
235 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
239 //check for optional parameter and set defaults
240 // ...at some point should added some additional type checking...
241 label = validParameter.validFile(parameters, "label", false);
242 if (label == "not found") { label = ""; allLines = 1; }
244 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
245 else { allLines = 1; }
248 basis = validParameter.validFile(parameters, "basis", false);
249 if (basis == "not found") { basis = "otu"; }
251 if ((basis != "otu") && (basis != "sequence")) { m->mothurOut("Invalid option for basis. basis options are otu and sequence, using otu."); m->mothurOutEndLine(); }
253 string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; }
254 m->mothurConvert(temp, cutoff);
256 temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; }
257 probs = m->isTrue(temp);
260 if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; }
262 if (countfile == "") {
264 vector<string> files; files.push_back(taxfile);
265 parser.getNameFile(files);
271 catch(exception& e) {
272 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
276 //**********************************************************************************************************************
278 int ClassifyOtuCommand::execute(){
281 if (abort == true) { if (calledHelp) { return 0; } return 2; }
283 //if user gave a namesfile then use it
284 if (namefile != "") { m->readNames(namefile, nameMap, true); }
285 if (groupfile != "") { groupMap = new GroupMap(groupfile); groupMap->readMap(); }
286 else { groupMap = NULL; }
287 if (countfile != "") { ct = new CountTable(); ct->readTable(countfile); }
290 //read taxonomy file and save in map for easy access in building bin trees
291 m->readTax(taxfile, taxMap);
293 if (m->control_pressed) { return 0; }
295 input = new InputData(listfile, "list");
296 list = input->getListVector();
297 string lastLabel = list->getLabel();
299 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
300 set<string> processedLabels;
301 set<string> userLabels = labels;
303 if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
305 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
307 if (allLines == 1 || labels.count(list->getLabel()) == 1){
309 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
311 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; return 0; }
313 processedLabels.insert(list->getLabel());
314 userLabels.erase(list->getLabel());
317 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
318 string saveLabel = list->getLabel();
321 list = input->getListVector(lastLabel);
322 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
326 if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; }
328 processedLabels.insert(list->getLabel());
329 userLabels.erase(list->getLabel());
331 //restore real lastlabel to save below
332 list->setLabel(saveLabel);
335 lastLabel = list->getLabel();
338 list = input->getListVector();
341 //output error messages about any remaining user labels
342 bool needToRun = false;
343 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
344 m->mothurOut("Your file does not include the label " + (*it));
345 if (processedLabels.count(lastLabel) != 1) {
346 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
349 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
353 //run last label if you need to
354 if (needToRun == true) {
355 if (list != NULL) { delete list; }
356 list = input->getListVector(lastLabel);
357 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
362 if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; }
366 if (groupMap != NULL) { delete groupMap; }
367 if (ct != NULL) { delete ct; }
369 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
371 m->mothurOutEndLine();
372 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
373 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
374 m->mothurOutEndLine();
378 catch(exception& e) {
379 m->errorOut(e, "ClassifyOtuCommand", "execute");
383 //**********************************************************************************************************************
384 vector<string> ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* thisList, int& size, string& conTax) {
387 vector<string> names;
388 vector<string> allNames;
389 map<string, string>::iterator it;
390 map<string, string>::iterator it2;
392 //parse names into vector
393 string binnames = thisList->get(bin);
394 m->splitAtComma(binnames, names);
396 //create a tree containing sequences from this bin
397 PhyloTree* phylo = new PhyloTree();
400 for (int i = 0; i < names.size(); i++) {
402 //if namesfile include the names
403 if (namefile != "") {
405 //is this sequence in the name file - namemap maps seqName -> repSeqName
406 it2 = nameMap.find(names[i]);
408 if (it2 == nameMap.end()) { //this name is not in name file, skip it
409 m->mothurOut(names[i] + " is not in your name file. I will not include it in the consensus."); m->mothurOutEndLine();
412 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
413 it = taxMap.find(it2->second);
415 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
417 if (names[i] != it2->second) { m->mothurOut(names[i] + " is represented by " + it2->second + " and is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
418 else { m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
422 phylo->addSeqToTree(names[i], it->second);
424 allNames.push_back(names[i]);
429 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
430 it = taxMap.find(names[i]);
432 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
433 m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine();
435 if (countfile != "") {
436 int numDups = ct->getNumSeqs(names[i]);
437 for (int j = 0; j < numDups; j++) { phylo->addSeqToTree(names[i], it->second); }
441 phylo->addSeqToTree(names[i], it->second);
444 allNames.push_back(names[i]);
449 if (m->control_pressed) { delete phylo; return allNames; }
454 phylo->assignHeirarchyIDs(0);
456 TaxNode currentNode = phylo->get(0);
459 while (currentNode.children.size() != 0) { //you still have more to explore
462 int bestChildSize = 0;
464 //go through children
465 for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
467 TaxNode temp = phylo->get(itChild->second);
469 //select child with largest accesions - most seqs assigned to it
470 if (temp.accessions.size() > bestChildSize) {
471 bestChild = phylo->get(itChild->second);
472 bestChildSize = temp.accessions.size();
477 //phylotree adds an extra unknown so we want to remove that
478 if (bestChild.name == "unknown") { bestChildSize--; }
480 //is this taxonomy above cutoff
481 int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
483 if (consensusConfidence >= cutoff) { //if yes, add it
485 conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
487 conTax += bestChild.name + ";";
495 currentNode = bestChild;
498 if (myLevel != phylo->getMaxLevel()) {
499 while (myLevel != phylo->getMaxLevel()) {
500 conTax += "unclassified;";
504 if (conTax == "") { conTax = "no_consensus;"; }
511 catch(exception& e) {
512 m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy");
517 //**********************************************************************************************************************
518 int ClassifyOtuCommand::process(ListVector* processList) {
524 if (outputDir == "") { outputDir += m->hasPath(listfile); }
527 string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." +getOutputFileNameTag("constaxonomy");
528 m->openOutputFile(outputFile, out);
529 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
532 string outputSumFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." +getOutputFileNameTag("taxsummary");
533 m->openOutputFile(outputSumFile, outSum);
534 outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
536 out << "OTU\tSize\tTaxonomy" << endl;
538 PhyloSummary* taxaSum;
539 if (countfile != "") {
540 if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, ct); }
541 else { taxaSum = new PhyloSummary(ct); }
543 if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, groupMap); }
544 else { taxaSum = new PhyloSummary(groupMap); }
547 //for each bin in the list vector
548 string snumBins = toString(processList->getNumBins());
549 for (int i = 0; i < processList->getNumBins(); i++) {
551 if (m->control_pressed) { break; }
553 vector<string> names;
554 names = findConsensusTaxonomy(i, processList, size, conTax);
556 if (m->control_pressed) { out.close(); return 0; }
558 //output to new names file
559 string binLabel = "Otu";
560 string sbinNumber = toString(i+1);
561 if (sbinNumber.length() < snumBins.length()) {
562 int diff = snumBins.length() - sbinNumber.length();
563 for (int h = 0; h < diff; h++) { binLabel += "0"; }
565 binLabel += sbinNumber;
567 out << binLabel << '\t' << size << '\t' << conTax << endl;
569 string noConfidenceConTax = conTax;
570 m->removeConfidences(noConfidenceConTax);
572 //add this bins taxonomy to summary
573 if (basis == "sequence") {
574 for(int j = 0; j < names.size(); j++) { taxaSum->addSeqToTree(names[j], noConfidenceConTax); }
576 map<string, bool> containsGroup;
577 if (countfile != "") {
578 if (ct->hasGroupInfo()) {
579 vector<string> mGroups = ct->getNamesOfGroups();
580 for (int k = 0; k < names.size(); k++) {
581 vector<int> counts = ct->getGroupCounts(names[k]);
582 for (int h = 0; h < counts.size(); h++) {
583 if (counts[h] != 0) { containsGroup[mGroups[h]] = true; }
588 if (groupfile != "") {
589 vector<string> mGroups = groupMap->getNamesOfGroups();
590 for (int j = 0; j < mGroups.size(); j++) { containsGroup[mGroups[j]] = false; }
592 for (int k = 0; k < names.size(); k++) {
593 //find out the sequences group
594 string group = groupMap->getGroup(names[k]);
596 if (group == "not found") { m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
598 containsGroup[group] = true;
603 taxaSum->addSeqToTree(noConfidenceConTax, containsGroup);
610 taxaSum->print(outSum);
618 catch(exception& e) {
619 m->errorOut(e, "ClassifyOtuCommand", "process");
623 /**************************************************************************************************/
624 string ClassifyOtuCommand::addUnclassifieds(string tax, int maxlevel) {
626 string newTax, taxon;
629 //keep what you have counting the levels
630 while (tax.find_first_of(';') != -1) {
632 taxon = tax.substr(0,tax.find_first_of(';'))+';';
633 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
638 //add "unclassified" until you reach maxLevel
639 while (level < maxlevel) {
640 newTax += "unclassified;";
646 catch(exception& e) {
647 m->errorOut(e, "ClassifyOtuCommand", "addUnclassifieds");
651 //**********************************************************************************************************************