2 * classifyotucommand.cpp
5 * Created by westcott on 6/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "classifyotucommand.h"
11 #include "phylotree.h"
12 #include "phylosummary.h"
14 //**********************************************************************************************************************
15 vector<string> ClassifyOtuCommand::setParameters(){
17 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
18 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy);
19 CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(preftaxonomy);
20 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
21 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
22 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
23 CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "",false,false); parameters.push_back(pbasis);
24 CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff);
25 CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs);
26 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29 vector<string> myArray;
30 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
34 m->errorOut(e, "ClassifyOtuCommand", "setParameters");
38 //**********************************************************************************************************************
39 string ClassifyOtuCommand::getHelpString(){
41 string helpString = "";
42 helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, cutoff, label, basis and probs. The taxonomy and list parameters are required unless you have a valid current file.\n";
43 helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. Providing it will keep the rankIDs in the summary file static.\n";
44 helpString += "The name parameter allows you add a names file with your taxonomy file.\n";
45 helpString += "The group parameter allows you provide a group file to use in creating the summary file breakdown.\n";
46 helpString += "The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n";
47 helpString += "For example consider the following basis=sequence could give Clostridiales 3 105 16 43 46, where 105 is the total number of sequences whose otu classified to Clostridiales.\n";
48 helpString += "16 is the number of sequences in the otus from groupA, 43 is the number of sequences in the otus from groupB, and 46 is the number of sequences in the otus from groupC.\n";
49 helpString += "Now for basis=otu could give Clostridiales 3 7 6 1 2, where 7 is the number of otus that classified to Clostridiales.\n";
50 helpString += "6 is the number of otus containing sequences from groupA, 1 is the number of otus containing sequences from groupB, and 2 is the number of otus containing sequences from groupC.\n";
51 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
52 helpString += "The default value for label is all labels in your inputfile.\n";
53 helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy. The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
54 helpString += "The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n";
55 helpString += "The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n";
56 helpString += "Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n";
57 helpString += "Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n";
61 m->errorOut(e, "ClassifyOtuCommand", "getHelpString");
65 //**********************************************************************************************************************
66 string ClassifyOtuCommand::getOutputFileNameTag(string type, string inputName=""){
68 string outputFileName = "";
69 map<string, vector<string> >::iterator it;
71 //is this a type this command creates
72 it = outputTypes.find(type);
73 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
75 if (type == "constaxonomy") { outputFileName = "cons.taxonomy"; }
76 else if (type == "taxsummary") { outputFileName = "cons.tax.summary"; }
77 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
79 return outputFileName;
82 m->errorOut(e, "ClassifyOtuCommand", "getOutputFileNameTag");
86 //**********************************************************************************************************************
87 ClassifyOtuCommand::ClassifyOtuCommand(){
89 abort = true; calledHelp = true;
91 vector<string> tempOutNames;
92 outputTypes["constaxonomy"] = tempOutNames;
93 outputTypes["taxsummary"] = tempOutNames;
96 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
101 //**********************************************************************************************************************
102 ClassifyOtuCommand::ClassifyOtuCommand(string option) {
104 abort = false; calledHelp = false;
108 //allow user to run help
109 if (option == "help") {
110 help(); abort = true; calledHelp = true;
111 }else if(option == "citation") { citation(); abort = true; calledHelp = true;}
113 vector<string> myArray = setParameters();
115 OptionParser parser(option);
116 map<string, string> parameters = parser.getParameters();
118 ValidParameters validParameter;
119 map<string, string>::iterator it;
121 //check to make sure all parameters are valid for command
122 for (it = parameters.begin(); it != parameters.end(); it++) {
123 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
126 //initialize outputTypes
127 vector<string> tempOutNames;
128 outputTypes["constaxonomy"] = tempOutNames;
129 outputTypes["taxsummary"] = tempOutNames;
131 //if the user changes the input directory command factory will send this info to us in the output parameter
132 string inputDir = validParameter.validFile(parameters, "inputdir", false);
133 if (inputDir == "not found"){ inputDir = ""; }
136 it = parameters.find("list");
137 //user has given a template file
138 if(it != parameters.end()){
139 path = m->hasPath(it->second);
140 //if the user has not given a path then, add inputdir. else leave path alone.
141 if (path == "") { parameters["list"] = inputDir + it->second; }
144 it = parameters.find("name");
145 //user has given a template file
146 if(it != parameters.end()){
147 path = m->hasPath(it->second);
148 //if the user has not given a path then, add inputdir. else leave path alone.
149 if (path == "") { parameters["name"] = inputDir + it->second; }
152 it = parameters.find("taxonomy");
153 //user has given a template file
154 if(it != parameters.end()){
155 path = m->hasPath(it->second);
156 //if the user has not given a path then, add inputdir. else leave path alone.
157 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
160 it = parameters.find("reftaxonomy");
161 //user has given a template file
162 if(it != parameters.end()){
163 path = m->hasPath(it->second);
164 //if the user has not given a path then, add inputdir. else leave path alone.
165 if (path == "") { parameters["reftaxonomy"] = inputDir + it->second; }
168 it = parameters.find("group");
169 //user has given a template file
170 if(it != parameters.end()){
171 path = m->hasPath(it->second);
172 //if the user has not given a path then, add inputdir. else leave path alone.
173 if (path == "") { parameters["group"] = inputDir + it->second; }
178 //if the user changes the output directory command factory will send this info to us in the output parameter
179 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
181 //check for required parameters
182 listfile = validParameter.validFile(parameters, "list", true);
183 if (listfile == "not found") {
184 //if there is a current list file, use it
185 listfile = m->getListFile();
186 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
187 else { m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
189 else if (listfile == "not open") { abort = true; }
190 else { m->setListFile(listfile); }
192 taxfile = validParameter.validFile(parameters, "taxonomy", true);
193 if (taxfile == "not found") { //if there is a current list file, use it
194 taxfile = m->getTaxonomyFile();
195 if (taxfile != "") { m->mothurOut("Using " + taxfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
196 else { m->mothurOut("You have no current taxonomy file and the taxonomy parameter is required."); m->mothurOutEndLine(); abort = true; }
198 else if (taxfile == "not open") { abort = true; }
199 else { m->setTaxonomyFile(taxfile); }
201 refTaxonomy = validParameter.validFile(parameters, "reftaxonomy", true);
202 if (refTaxonomy == "not found") { refTaxonomy = ""; m->mothurOut("reftaxonomy is not required, but if given will keep the rankIDs in the summary file static."); m->mothurOutEndLine(); }
203 else if (refTaxonomy == "not open") { abort = true; }
205 namefile = validParameter.validFile(parameters, "name", true);
206 if (namefile == "not open") { namefile = ""; abort = true; }
207 else if (namefile == "not found") { namefile = ""; }
208 else { m->setNameFile(namefile); }
210 groupfile = validParameter.validFile(parameters, "group", true);
211 if (groupfile == "not open") { abort = true; }
212 else if (groupfile == "not found") { groupfile = ""; }
213 else { m->setGroupFile(groupfile); }
215 //check for optional parameter and set defaults
216 // ...at some point should added some additional type checking...
217 label = validParameter.validFile(parameters, "label", false);
218 if (label == "not found") { label = ""; allLines = 1; }
220 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
221 else { allLines = 1; }
224 basis = validParameter.validFile(parameters, "basis", false);
225 if (basis == "not found") { basis = "otu"; }
227 if ((basis != "otu") && (basis != "sequence")) { m->mothurOut("Invalid option for basis. basis options are otu and sequence, using otu."); m->mothurOutEndLine(); }
229 string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; }
230 m->mothurConvert(temp, cutoff);
232 temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; }
233 probs = m->isTrue(temp);
236 if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; }
239 vector<string> files; files.push_back(taxfile);
240 parser.getNameFile(files);
245 catch(exception& e) {
246 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
250 //**********************************************************************************************************************
252 int ClassifyOtuCommand::execute(){
255 if (abort == true) { if (calledHelp) { return 0; } return 2; }
257 //if user gave a namesfile then use it
258 if (namefile != "") { m->readNames(namefile, nameMap, true); }
260 //read taxonomy file and save in map for easy access in building bin trees
261 m->readTax(taxfile, taxMap);
263 if (m->control_pressed) { return 0; }
265 input = new InputData(listfile, "list");
266 list = input->getListVector();
267 string lastLabel = list->getLabel();
269 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
270 set<string> processedLabels;
271 set<string> userLabels = labels;
273 if (m->control_pressed) { outputTypes.clear(); delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
275 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
277 if (allLines == 1 || labels.count(list->getLabel()) == 1){
279 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
281 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; }
283 processedLabels.insert(list->getLabel());
284 userLabels.erase(list->getLabel());
287 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
288 string saveLabel = list->getLabel();
291 list = input->getListVector(lastLabel);
292 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
296 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; }
298 processedLabels.insert(list->getLabel());
299 userLabels.erase(list->getLabel());
301 //restore real lastlabel to save below
302 list->setLabel(saveLabel);
305 lastLabel = list->getLabel();
308 list = input->getListVector();
311 //output error messages about any remaining user labels
312 bool needToRun = false;
313 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
314 m->mothurOut("Your file does not include the label " + (*it));
315 if (processedLabels.count(lastLabel) != 1) {
316 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
319 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
323 //run last label if you need to
324 if (needToRun == true) {
325 if (list != NULL) { delete list; }
326 list = input->getListVector(lastLabel);
327 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
332 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; }
337 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
339 m->mothurOutEndLine();
340 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
341 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
342 m->mothurOutEndLine();
346 catch(exception& e) {
347 m->errorOut(e, "ClassifyOtuCommand", "execute");
351 //**********************************************************************************************************************
352 vector<string> ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* thisList, int& size, string& conTax) {
355 vector<string> names;
356 vector<string> allNames;
357 map<string, string>::iterator it;
358 map<string, string>::iterator it2;
360 //parse names into vector
361 string binnames = thisList->get(bin);
362 m->splitAtComma(binnames, names);
364 //create a tree containing sequences from this bin
365 PhyloTree* phylo = new PhyloTree();
368 for (int i = 0; i < names.size(); i++) {
370 //if namesfile include the names
371 if (namefile != "") {
373 //is this sequence in the name file - namemap maps seqName -> repSeqName
374 it2 = nameMap.find(names[i]);
376 if (it2 == nameMap.end()) { //this name is not in name file, skip it
377 m->mothurOut(names[i] + " is not in your name file. I will not include it in the consensus."); m->mothurOutEndLine();
380 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
381 it = taxMap.find(it2->second);
383 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
385 if (names[i] != it2->second) { m->mothurOut(names[i] + " is represented by " + it2->second + " and is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
386 else { m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
390 phylo->addSeqToTree(names[i], it->second);
392 allNames.push_back(names[i]);
397 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
398 it = taxMap.find(names[i]);
400 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
401 m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine();
404 phylo->addSeqToTree(names[i], it->second);
406 allNames.push_back(names[i]);
411 if (m->control_pressed) { delete phylo; return allNames; }
416 phylo->assignHeirarchyIDs(0);
418 TaxNode currentNode = phylo->get(0);
421 while (currentNode.children.size() != 0) { //you still have more to explore
424 int bestChildSize = 0;
426 //go through children
427 for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
429 TaxNode temp = phylo->get(itChild->second);
431 //select child with largest accesions - most seqs assigned to it
432 if (temp.accessions.size() > bestChildSize) {
433 bestChild = phylo->get(itChild->second);
434 bestChildSize = temp.accessions.size();
439 //phylotree adds an extra unknown so we want to remove that
440 if (bestChild.name == "unknown") { bestChildSize--; }
442 //is this taxonomy above cutoff
443 int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
445 if (consensusConfidence >= cutoff) { //if yes, add it
447 conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
449 conTax += bestChild.name + ";";
457 currentNode = bestChild;
460 if (myLevel != phylo->getMaxLevel()) {
461 while (myLevel != phylo->getMaxLevel()) {
462 conTax += "unclassified;";
466 if (conTax == "") { conTax = "no_consensus;"; }
473 catch(exception& e) {
474 m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy");
479 //**********************************************************************************************************************
480 int ClassifyOtuCommand::process(ListVector* processList) {
486 if (outputDir == "") { outputDir += m->hasPath(listfile); }
489 string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + getOutputFileNameTag("constaxonomy");
490 m->openOutputFile(outputFile, out);
491 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
494 string outputSumFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + getOutputFileNameTag("taxsummary");
495 m->openOutputFile(outputSumFile, outSum);
496 outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
498 out << "OTU\tSize\tTaxonomy" << endl;
500 PhyloSummary* taxaSum;
501 if (refTaxonomy != "") {
502 taxaSum = new PhyloSummary(refTaxonomy, groupfile);
504 taxaSum = new PhyloSummary(groupfile);
508 //for each bin in the list vector
509 string snumBins = toString(processList->getNumBins());
510 for (int i = 0; i < processList->getNumBins(); i++) {
512 if (m->control_pressed) { break; }
514 vector<string> names;
515 names = findConsensusTaxonomy(i, processList, size, conTax);
517 if (m->control_pressed) { out.close(); return 0; }
519 //output to new names file
520 string binLabel = "Otu";
521 string sbinNumber = toString(i+1);
522 if (sbinNumber.length() < snumBins.length()) {
523 int diff = snumBins.length() - sbinNumber.length();
524 for (int h = 0; h < diff; h++) { binLabel += "0"; }
526 binLabel += sbinNumber;
528 out << binLabel << '\t' << size << '\t' << conTax << endl;
530 string noConfidenceConTax = conTax;
531 m->removeConfidences(noConfidenceConTax);
533 //add this bins taxonomy to summary
534 if (basis == "sequence") {
535 for(int j = 0; j < names.size(); j++) { taxaSum->addSeqToTree(names[j], noConfidenceConTax); }
537 taxaSum->addSeqToTree(noConfidenceConTax, names);
544 taxaSum->print(outSum);
552 catch(exception& e) {
553 m->errorOut(e, "ClassifyOtuCommand", "process");
557 /**************************************************************************************************/
558 string ClassifyOtuCommand::addUnclassifieds(string tax, int maxlevel) {
560 string newTax, taxon;
563 //keep what you have counting the levels
564 while (tax.find_first_of(';') != -1) {
566 taxon = tax.substr(0,tax.find_first_of(';'))+';';
567 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
572 //add "unclassified" until you reach maxLevel
573 while (level < maxlevel) {
574 newTax += "unclassified;";
580 catch(exception& e) {
581 m->errorOut(e, "ClassifyOtuCommand", "addUnclassifieds");
585 //**********************************************************************************************************************