2 * classifyotucommand.cpp
5 * Created by westcott on 6/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "classifyotucommand.h"
11 #include "phylotree.h"
12 #include "phylosummary.h"
14 //**********************************************************************************************************************
15 vector<string> ClassifyOtuCommand::setParameters(){
17 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
18 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy);
19 CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(preftaxonomy);
20 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
21 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
22 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
23 CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "",false,false); parameters.push_back(pbasis);
24 CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff);
25 CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs);
26 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29 vector<string> myArray;
30 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
34 m->errorOut(e, "ClassifyOtuCommand", "setParameters");
38 //**********************************************************************************************************************
39 string ClassifyOtuCommand::getHelpString(){
41 string helpString = "";
42 helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, cutoff, label, basis and probs. The taxonomy and list parameters are required unless you have a valid current file.\n";
43 helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. Providing it will keep the rankIDs in the summary file static.\n";
44 helpString += "The name parameter allows you add a names file with your taxonomy file.\n";
45 helpString += "The group parameter allows you provide a group file to use in creating the summary file breakdown.\n";
46 helpString += "The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n";
47 helpString += "For example consider the following basis=sequence could give Clostridiales 3 105 16 43 46, where 105 is the total number of sequences whose otu classified to Clostridiales.\n";
48 helpString += "16 is the number of sequences in the otus from groupA, 43 is the number of sequences in the otus from groupB, and 46 is the number of sequences in the otus from groupC.\n";
49 helpString += "Now for basis=otu could give Clostridiales 3 7 6 1 2, where 7 is the number of otus that classified to Clostridiales.\n";
50 helpString += "6 is the number of otus containing sequences from groupA, 1 is the number of otus containing sequences from groupB, and 2 is the number of otus containing sequences from groupC.\n";
51 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
52 helpString += "The default value for label is all labels in your inputfile.\n";
53 helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy. The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
54 helpString += "The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n";
55 helpString += "The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n";
56 helpString += "Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n";
57 helpString += "Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n";
61 m->errorOut(e, "ClassifyOtuCommand", "getHelpString");
65 //**********************************************************************************************************************
66 ClassifyOtuCommand::ClassifyOtuCommand(){
68 abort = true; calledHelp = true;
70 vector<string> tempOutNames;
71 outputTypes["constaxonomy"] = tempOutNames;
72 outputTypes["taxsummary"] = tempOutNames;
75 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
80 //**********************************************************************************************************************
81 ClassifyOtuCommand::ClassifyOtuCommand(string option) {
83 abort = false; calledHelp = false;
87 //allow user to run help
88 if (option == "help") {
89 help(); abort = true; calledHelp = true;
91 vector<string> myArray = setParameters();
93 OptionParser parser(option);
94 map<string, string> parameters = parser.getParameters();
96 ValidParameters validParameter;
97 map<string, string>::iterator it;
99 //check to make sure all parameters are valid for command
100 for (it = parameters.begin(); it != parameters.end(); it++) {
101 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
104 //initialize outputTypes
105 vector<string> tempOutNames;
106 outputTypes["constaxonomy"] = tempOutNames;
107 outputTypes["taxsummary"] = tempOutNames;
109 //if the user changes the input directory command factory will send this info to us in the output parameter
110 string inputDir = validParameter.validFile(parameters, "inputdir", false);
111 if (inputDir == "not found"){ inputDir = ""; }
114 it = parameters.find("list");
115 //user has given a template file
116 if(it != parameters.end()){
117 path = m->hasPath(it->second);
118 //if the user has not given a path then, add inputdir. else leave path alone.
119 if (path == "") { parameters["list"] = inputDir + it->second; }
122 it = parameters.find("name");
123 //user has given a template file
124 if(it != parameters.end()){
125 path = m->hasPath(it->second);
126 //if the user has not given a path then, add inputdir. else leave path alone.
127 if (path == "") { parameters["name"] = inputDir + it->second; }
130 it = parameters.find("taxonomy");
131 //user has given a template file
132 if(it != parameters.end()){
133 path = m->hasPath(it->second);
134 //if the user has not given a path then, add inputdir. else leave path alone.
135 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
138 it = parameters.find("reftaxonomy");
139 //user has given a template file
140 if(it != parameters.end()){
141 path = m->hasPath(it->second);
142 //if the user has not given a path then, add inputdir. else leave path alone.
143 if (path == "") { parameters["reftaxonomy"] = inputDir + it->second; }
146 it = parameters.find("group");
147 //user has given a template file
148 if(it != parameters.end()){
149 path = m->hasPath(it->second);
150 //if the user has not given a path then, add inputdir. else leave path alone.
151 if (path == "") { parameters["group"] = inputDir + it->second; }
156 //if the user changes the output directory command factory will send this info to us in the output parameter
157 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
159 //check for required parameters
160 listfile = validParameter.validFile(parameters, "list", true);
161 if (listfile == "not found") {
162 //if there is a current list file, use it
163 listfile = m->getListFile();
164 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
165 else { m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
167 else if (listfile == "not open") { abort = true; }
169 taxfile = validParameter.validFile(parameters, "taxonomy", true);
170 if (taxfile == "not found") { //if there is a current list file, use it
171 taxfile = m->getTaxonomyFile();
172 if (taxfile != "") { m->mothurOut("Using " + taxfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
173 else { m->mothurOut("You have no current taxonomy file and the taxonomy parameter is required."); m->mothurOutEndLine(); abort = true; }
175 else if (taxfile == "not open") { abort = true; }
177 refTaxonomy = validParameter.validFile(parameters, "reftaxonomy", true);
178 if (refTaxonomy == "not found") { refTaxonomy = ""; m->mothurOut("reftaxonomy is not required, but if given will keep the rankIDs in the summary file static."); m->mothurOutEndLine(); }
179 else if (refTaxonomy == "not open") { abort = true; }
181 namefile = validParameter.validFile(parameters, "name", true);
182 if (namefile == "not open") { abort = true; }
183 else if (namefile == "not found") { namefile = ""; }
185 groupfile = validParameter.validFile(parameters, "group", true);
186 if (groupfile == "not open") { abort = true; }
187 else if (groupfile == "not found") { groupfile = ""; }
189 //check for optional parameter and set defaults
190 // ...at some point should added some additional type checking...
191 label = validParameter.validFile(parameters, "label", false);
192 if (label == "not found") { label = ""; allLines = 1; }
194 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
195 else { allLines = 1; }
198 basis = validParameter.validFile(parameters, "basis", false);
199 if (basis == "not found") { basis = "otu"; }
201 if ((basis != "otu") && (basis != "sequence")) { m->mothurOut("Invalid option for basis. basis options are otu and sequence, using otu."); m->mothurOutEndLine(); }
203 string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; }
204 convert(temp, cutoff);
206 temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; }
207 probs = m->isTrue(temp);
210 if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; }
214 catch(exception& e) {
215 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
219 //**********************************************************************************************************************
221 int ClassifyOtuCommand::execute(){
224 if (abort == true) { if (calledHelp) { return 0; } return 2; }
226 //if user gave a namesfile then use it
227 if (namefile != "") { readNamesFile(); }
229 //read taxonomy file and save in map for easy access in building bin trees
232 if (m->control_pressed) { return 0; }
234 input = new InputData(listfile, "list");
235 list = input->getListVector();
236 string lastLabel = list->getLabel();
238 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
239 set<string> processedLabels;
240 set<string> userLabels = labels;
242 if (m->control_pressed) { outputTypes.clear(); delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
244 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
246 if (allLines == 1 || labels.count(list->getLabel()) == 1){
248 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
250 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; }
252 processedLabels.insert(list->getLabel());
253 userLabels.erase(list->getLabel());
256 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
257 string saveLabel = list->getLabel();
260 list = input->getListVector(lastLabel);
261 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
265 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; }
267 processedLabels.insert(list->getLabel());
268 userLabels.erase(list->getLabel());
270 //restore real lastlabel to save below
271 list->setLabel(saveLabel);
274 lastLabel = list->getLabel();
277 list = input->getListVector();
280 //output error messages about any remaining user labels
281 bool needToRun = false;
282 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
283 m->mothurOut("Your file does not include the label " + (*it));
284 if (processedLabels.count(lastLabel) != 1) {
285 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
288 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
292 //run last label if you need to
293 if (needToRun == true) {
294 if (list != NULL) { delete list; }
295 list = input->getListVector(lastLabel);
296 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
301 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; }
306 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
308 m->mothurOutEndLine();
309 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
310 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
311 m->mothurOutEndLine();
315 catch(exception& e) {
316 m->errorOut(e, "ClassifyOtuCommand", "execute");
321 //**********************************************************************************************************************
322 int ClassifyOtuCommand::readNamesFile() {
326 m->openInputFile(namefile, inNames);
330 while(!inNames.eof()){
331 inNames >> name; //read from first column A
332 inNames >> names; //read from second column A,B,C,D
335 //parse names into vector
336 vector<string> theseNames;
337 m->splitAtComma(names, theseNames);
339 for (int i = 0; i < theseNames.size(); i++) { nameMap[theseNames[i]] = name; }
341 if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; }
347 catch(exception& e) {
348 m->errorOut(e, "ClassifyOtuCommand", "readNamesFile");
352 //**********************************************************************************************************************
353 int ClassifyOtuCommand::readTaxonomyFile() {
357 m->openInputFile(taxfile, in);
365 //are there confidence scores, if so remove them
366 if (tax.find_first_of('(') != -1) { removeConfidences(tax); }
370 if (m->control_pressed) { in.close(); taxMap.clear(); return 0; }
376 catch(exception& e) {
377 m->errorOut(e, "ClassifyOtuCommand", "readTaxonomyFile");
381 //**********************************************************************************************************************
382 vector<string> ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* thisList, int& size, string& conTax) {
385 vector<string> names;
386 vector<string> allNames;
387 map<string, string>::iterator it;
388 map<string, string>::iterator it2;
390 //parse names into vector
391 string binnames = thisList->get(bin);
392 m->splitAtComma(binnames, names);
394 //create a tree containing sequences from this bin
395 PhyloTree* phylo = new PhyloTree();
398 for (int i = 0; i < names.size(); i++) {
400 //if namesfile include the names
401 if (namefile != "") {
403 //is this sequence in the name file - namemap maps seqName -> repSeqName
404 it2 = nameMap.find(names[i]);
406 if (it2 == nameMap.end()) { //this name is not in name file, skip it
407 m->mothurOut(names[i] + " is not in your name file. I will not include it in the consensus."); m->mothurOutEndLine();
410 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
411 it = taxMap.find(it2->second);
413 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
415 if (names[i] != it2->second) { m->mothurOut(names[i] + " is represented by " + it2->second + " and is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
416 else { m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
420 phylo->addSeqToTree(names[i], it->second);
422 allNames.push_back(names[i]);
427 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
428 it = taxMap.find(names[i]);
430 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
431 m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine();
434 phylo->addSeqToTree(names[i], it->second);
436 allNames.push_back(names[i]);
441 if (m->control_pressed) { delete phylo; return allNames; }
446 phylo->assignHeirarchyIDs(0);
448 TaxNode currentNode = phylo->get(0);
451 while (currentNode.children.size() != 0) { //you still have more to explore
454 int bestChildSize = 0;
456 //go through children
457 for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
459 TaxNode temp = phylo->get(itChild->second);
461 //select child with largest accesions - most seqs assigned to it
462 if (temp.accessions.size() > bestChildSize) {
463 bestChild = phylo->get(itChild->second);
464 bestChildSize = temp.accessions.size();
469 //is this taxonomy above cutoff
470 int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
472 if (consensusConfidence >= cutoff) { //if yes, add it
474 conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
476 conTax += bestChild.name + ";";
483 currentNode = bestChild;
487 if (conTax == "") { conTax = "no_consensus;"; }
494 catch(exception& e) {
495 m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy");
500 //**********************************************************************************************************************
501 int ClassifyOtuCommand::process(ListVector* processList) {
507 if (outputDir == "") { outputDir += m->hasPath(listfile); }
510 string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".cons.taxonomy";
511 m->openOutputFile(outputFile, out);
512 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
515 string outputSumFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".cons.tax.summary";
516 m->openOutputFile(outputSumFile, outSum);
517 outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
519 out << "OTU\tSize\tTaxonomy" << endl;
521 PhyloSummary* taxaSum;
522 if (refTaxonomy != "") {
523 taxaSum = new PhyloSummary(refTaxonomy, groupfile);
525 taxaSum = new PhyloSummary(groupfile);
528 //for each bin in the list vector
529 for (int i = 0; i < processList->getNumBins(); i++) {
531 if (m->control_pressed) { break; }
533 vector<string> names;
534 names = findConsensusTaxonomy(i, processList, size, conTax);
536 if (m->control_pressed) { out.close(); return 0; }
538 //output to new names file
539 out << (i+1) << '\t' << size << '\t' << conTax << endl;
541 string noConfidenceConTax = conTax;
542 removeConfidences(noConfidenceConTax);
544 //add this bins taxonomy to summary
545 if (basis == "sequence") {
546 for(int j = 0; j < names.size(); j++) { taxaSum->addSeqToTree(names[j], noConfidenceConTax); }
548 taxaSum->addSeqToTree(noConfidenceConTax, names);
555 taxaSum->print(outSum);
563 catch(exception& e) {
564 m->errorOut(e, "ClassifyOtuCommand", "process");
568 /**************************************************************************************************/
569 void ClassifyOtuCommand::removeConfidences(string& tax) {
575 while (tax.find_first_of(';') != -1) {
577 taxon = tax.substr(0,tax.find_first_of(';'));
579 int pos = taxon.find_first_of('(');
581 taxon = taxon.substr(0, pos); //rip off confidence
586 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
592 catch(exception& e) {
593 m->errorOut(e, "ClassifyOtuCommand", "removeConfidences");
597 //**********************************************************************************************************************