2 * classifyotucommand.cpp
5 * Created by westcott on 6/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "classifyotucommand.h"
11 #include "phylotree.h"
12 #include "phylosummary.h"
14 //**********************************************************************************************************************
15 vector<string> ClassifyOtuCommand::setParameters(){
17 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
18 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy);
19 CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(preftaxonomy);
20 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
21 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
22 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
23 CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "",false,false); parameters.push_back(pbasis);
24 CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff);
25 CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs);
26 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29 vector<string> myArray;
30 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
34 m->errorOut(e, "ClassifyOtuCommand", "setParameters");
38 //**********************************************************************************************************************
39 string ClassifyOtuCommand::getHelpString(){
41 string helpString = "";
42 helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, cutoff, label, basis and probs. The taxonomy and list parameters are required unless you have a valid current file.\n";
43 helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. Providing it will keep the rankIDs in the summary file static.\n";
44 helpString += "The name parameter allows you add a names file with your taxonomy file.\n";
45 helpString += "The group parameter allows you provide a group file to use in creating the summary file breakdown.\n";
46 helpString += "The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n";
47 helpString += "For example consider the following basis=sequence could give Clostridiales 3 105 16 43 46, where 105 is the total number of sequences whose otu classified to Clostridiales.\n";
48 helpString += "16 is the number of sequences in the otus from groupA, 43 is the number of sequences in the otus from groupB, and 46 is the number of sequences in the otus from groupC.\n";
49 helpString += "Now for basis=otu could give Clostridiales 3 7 6 1 2, where 7 is the number of otus that classified to Clostridiales.\n";
50 helpString += "6 is the number of otus containing sequences from groupA, 1 is the number of otus containing sequences from groupB, and 2 is the number of otus containing sequences from groupC.\n";
51 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
52 helpString += "The default value for label is all labels in your inputfile.\n";
53 helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy. The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
54 helpString += "The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n";
55 helpString += "The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n";
56 helpString += "Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n";
57 helpString += "Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n";
61 m->errorOut(e, "ClassifyOtuCommand", "getHelpString");
65 //**********************************************************************************************************************
66 ClassifyOtuCommand::ClassifyOtuCommand(){
68 abort = true; calledHelp = true;
70 vector<string> tempOutNames;
71 outputTypes["constaxonomy"] = tempOutNames;
72 outputTypes["taxsummary"] = tempOutNames;
75 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
80 //**********************************************************************************************************************
81 ClassifyOtuCommand::ClassifyOtuCommand(string option) {
83 abort = false; calledHelp = false;
87 //allow user to run help
88 if (option == "help") {
89 help(); abort = true; calledHelp = true;
90 }else if(option == "citation") { citation(); abort = true; calledHelp = true;}
92 vector<string> myArray = setParameters();
94 OptionParser parser(option);
95 map<string, string> parameters = parser.getParameters();
97 ValidParameters validParameter;
98 map<string, string>::iterator it;
100 //check to make sure all parameters are valid for command
101 for (it = parameters.begin(); it != parameters.end(); it++) {
102 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
105 //initialize outputTypes
106 vector<string> tempOutNames;
107 outputTypes["constaxonomy"] = tempOutNames;
108 outputTypes["taxsummary"] = tempOutNames;
110 //if the user changes the input directory command factory will send this info to us in the output parameter
111 string inputDir = validParameter.validFile(parameters, "inputdir", false);
112 if (inputDir == "not found"){ inputDir = ""; }
115 it = parameters.find("list");
116 //user has given a template file
117 if(it != parameters.end()){
118 path = m->hasPath(it->second);
119 //if the user has not given a path then, add inputdir. else leave path alone.
120 if (path == "") { parameters["list"] = inputDir + it->second; }
123 it = parameters.find("name");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["name"] = inputDir + it->second; }
131 it = parameters.find("taxonomy");
132 //user has given a template file
133 if(it != parameters.end()){
134 path = m->hasPath(it->second);
135 //if the user has not given a path then, add inputdir. else leave path alone.
136 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
139 it = parameters.find("reftaxonomy");
140 //user has given a template file
141 if(it != parameters.end()){
142 path = m->hasPath(it->second);
143 //if the user has not given a path then, add inputdir. else leave path alone.
144 if (path == "") { parameters["reftaxonomy"] = inputDir + it->second; }
147 it = parameters.find("group");
148 //user has given a template file
149 if(it != parameters.end()){
150 path = m->hasPath(it->second);
151 //if the user has not given a path then, add inputdir. else leave path alone.
152 if (path == "") { parameters["group"] = inputDir + it->second; }
157 //if the user changes the output directory command factory will send this info to us in the output parameter
158 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
160 //check for required parameters
161 listfile = validParameter.validFile(parameters, "list", true);
162 if (listfile == "not found") {
163 //if there is a current list file, use it
164 listfile = m->getListFile();
165 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
166 else { m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
168 else if (listfile == "not open") { abort = true; }
170 taxfile = validParameter.validFile(parameters, "taxonomy", true);
171 if (taxfile == "not found") { //if there is a current list file, use it
172 taxfile = m->getTaxonomyFile();
173 if (taxfile != "") { m->mothurOut("Using " + taxfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
174 else { m->mothurOut("You have no current taxonomy file and the taxonomy parameter is required."); m->mothurOutEndLine(); abort = true; }
176 else if (taxfile == "not open") { abort = true; }
178 refTaxonomy = validParameter.validFile(parameters, "reftaxonomy", true);
179 if (refTaxonomy == "not found") { refTaxonomy = ""; m->mothurOut("reftaxonomy is not required, but if given will keep the rankIDs in the summary file static."); m->mothurOutEndLine(); }
180 else if (refTaxonomy == "not open") { abort = true; }
182 namefile = validParameter.validFile(parameters, "name", true);
183 if (namefile == "not open") { abort = true; }
184 else if (namefile == "not found") { namefile = ""; }
186 groupfile = validParameter.validFile(parameters, "group", true);
187 if (groupfile == "not open") { abort = true; }
188 else if (groupfile == "not found") { groupfile = ""; }
190 //check for optional parameter and set defaults
191 // ...at some point should added some additional type checking...
192 label = validParameter.validFile(parameters, "label", false);
193 if (label == "not found") { label = ""; allLines = 1; }
195 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
196 else { allLines = 1; }
199 basis = validParameter.validFile(parameters, "basis", false);
200 if (basis == "not found") { basis = "otu"; }
202 if ((basis != "otu") && (basis != "sequence")) { m->mothurOut("Invalid option for basis. basis options are otu and sequence, using otu."); m->mothurOutEndLine(); }
204 string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; }
205 convert(temp, cutoff);
207 temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; }
208 probs = m->isTrue(temp);
211 if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; }
215 catch(exception& e) {
216 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
220 //**********************************************************************************************************************
222 int ClassifyOtuCommand::execute(){
225 if (abort == true) { if (calledHelp) { return 0; } return 2; }
227 //if user gave a namesfile then use it
228 if (namefile != "") { readNamesFile(); }
230 //read taxonomy file and save in map for easy access in building bin trees
233 if (m->control_pressed) { return 0; }
235 input = new InputData(listfile, "list");
236 list = input->getListVector();
237 string lastLabel = list->getLabel();
239 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
240 set<string> processedLabels;
241 set<string> userLabels = labels;
243 if (m->control_pressed) { outputTypes.clear(); delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
245 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
247 if (allLines == 1 || labels.count(list->getLabel()) == 1){
249 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
251 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; }
253 processedLabels.insert(list->getLabel());
254 userLabels.erase(list->getLabel());
257 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
258 string saveLabel = list->getLabel();
261 list = input->getListVector(lastLabel);
262 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
266 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; }
268 processedLabels.insert(list->getLabel());
269 userLabels.erase(list->getLabel());
271 //restore real lastlabel to save below
272 list->setLabel(saveLabel);
275 lastLabel = list->getLabel();
278 list = input->getListVector();
281 //output error messages about any remaining user labels
282 bool needToRun = false;
283 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
284 m->mothurOut("Your file does not include the label " + (*it));
285 if (processedLabels.count(lastLabel) != 1) {
286 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
289 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
293 //run last label if you need to
294 if (needToRun == true) {
295 if (list != NULL) { delete list; }
296 list = input->getListVector(lastLabel);
297 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
302 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; }
307 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
309 m->mothurOutEndLine();
310 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
311 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
312 m->mothurOutEndLine();
316 catch(exception& e) {
317 m->errorOut(e, "ClassifyOtuCommand", "execute");
322 //**********************************************************************************************************************
323 int ClassifyOtuCommand::readNamesFile() {
327 m->openInputFile(namefile, inNames);
331 while(!inNames.eof()){
332 inNames >> name; //read from first column A
333 inNames >> names; //read from second column A,B,C,D
336 //parse names into vector
337 vector<string> theseNames;
338 m->splitAtComma(names, theseNames);
340 for (int i = 0; i < theseNames.size(); i++) { nameMap[theseNames[i]] = name; }
342 if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; }
348 catch(exception& e) {
349 m->errorOut(e, "ClassifyOtuCommand", "readNamesFile");
353 //**********************************************************************************************************************
354 int ClassifyOtuCommand::readTaxonomyFile() {
358 m->openInputFile(taxfile, in);
366 //are there confidence scores, if so remove them
367 if (tax.find_first_of('(') != -1) { removeConfidences(tax); }
371 if (m->control_pressed) { in.close(); taxMap.clear(); return 0; }
377 catch(exception& e) {
378 m->errorOut(e, "ClassifyOtuCommand", "readTaxonomyFile");
382 //**********************************************************************************************************************
383 vector<string> ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* thisList, int& size, string& conTax) {
386 vector<string> names;
387 vector<string> allNames;
388 map<string, string>::iterator it;
389 map<string, string>::iterator it2;
391 //parse names into vector
392 string binnames = thisList->get(bin);
393 m->splitAtComma(binnames, names);
395 //create a tree containing sequences from this bin
396 PhyloTree* phylo = new PhyloTree();
399 for (int i = 0; i < names.size(); i++) {
401 //if namesfile include the names
402 if (namefile != "") {
404 //is this sequence in the name file - namemap maps seqName -> repSeqName
405 it2 = nameMap.find(names[i]);
407 if (it2 == nameMap.end()) { //this name is not in name file, skip it
408 m->mothurOut(names[i] + " is not in your name file. I will not include it in the consensus."); m->mothurOutEndLine();
411 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
412 it = taxMap.find(it2->second);
414 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
416 if (names[i] != it2->second) { m->mothurOut(names[i] + " is represented by " + it2->second + " and is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
417 else { m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
421 phylo->addSeqToTree(names[i], it->second);
423 allNames.push_back(names[i]);
428 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
429 it = taxMap.find(names[i]);
431 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
432 m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine();
435 phylo->addSeqToTree(names[i], it->second);
437 allNames.push_back(names[i]);
442 if (m->control_pressed) { delete phylo; return allNames; }
447 phylo->assignHeirarchyIDs(0);
449 TaxNode currentNode = phylo->get(0);
452 while (currentNode.children.size() != 0) { //you still have more to explore
455 int bestChildSize = 0;
457 //go through children
458 for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
460 TaxNode temp = phylo->get(itChild->second);
462 //select child with largest accesions - most seqs assigned to it
463 if (temp.accessions.size() > bestChildSize) {
464 bestChild = phylo->get(itChild->second);
465 bestChildSize = temp.accessions.size();
470 //is this taxonomy above cutoff
471 int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
473 if (consensusConfidence >= cutoff) { //if yes, add it
475 conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
477 conTax += bestChild.name + ";";
484 currentNode = bestChild;
488 if (conTax == "") { conTax = "no_consensus;"; }
495 catch(exception& e) {
496 m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy");
501 //**********************************************************************************************************************
502 int ClassifyOtuCommand::process(ListVector* processList) {
508 if (outputDir == "") { outputDir += m->hasPath(listfile); }
511 string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".cons.taxonomy";
512 m->openOutputFile(outputFile, out);
513 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
516 string outputSumFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".cons.tax.summary";
517 m->openOutputFile(outputSumFile, outSum);
518 outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
520 out << "OTU\tSize\tTaxonomy" << endl;
522 PhyloSummary* taxaSum;
523 if (refTaxonomy != "") {
524 taxaSum = new PhyloSummary(refTaxonomy, groupfile);
526 taxaSum = new PhyloSummary(groupfile);
529 //for each bin in the list vector
530 for (int i = 0; i < processList->getNumBins(); i++) {
532 if (m->control_pressed) { break; }
534 vector<string> names;
535 names = findConsensusTaxonomy(i, processList, size, conTax);
537 if (m->control_pressed) { out.close(); return 0; }
539 //output to new names file
540 out << (i+1) << '\t' << size << '\t' << conTax << endl;
542 string noConfidenceConTax = conTax;
543 removeConfidences(noConfidenceConTax);
545 //add this bins taxonomy to summary
546 if (basis == "sequence") {
547 for(int j = 0; j < names.size(); j++) { taxaSum->addSeqToTree(names[j], noConfidenceConTax); }
549 taxaSum->addSeqToTree(noConfidenceConTax, names);
556 taxaSum->print(outSum);
564 catch(exception& e) {
565 m->errorOut(e, "ClassifyOtuCommand", "process");
569 /**************************************************************************************************/
570 void ClassifyOtuCommand::removeConfidences(string& tax) {
576 while (tax.find_first_of(';') != -1) {
578 taxon = tax.substr(0,tax.find_first_of(';'));
580 int pos = taxon.find_first_of('(');
582 taxon = taxon.substr(0, pos); //rip off confidence
587 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
593 catch(exception& e) {
594 m->errorOut(e, "ClassifyOtuCommand", "removeConfidences");
598 //**********************************************************************************************************************