2 * classifyotucommand.cpp
5 * Created by westcott on 6/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "classifyotucommand.h"
11 #include "phylotree.h"
13 //**********************************************************************************************************************
14 vector<string> ClassifyOtuCommand::getValidParameters(){
16 string AlignArray[] = {"list","label","name","taxonomy","cutoff","probs","outputdir","inputdir"};
17 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
21 m->errorOut(e, "ClassifyOtuCommand", "getValidParameters");
25 //**********************************************************************************************************************
26 ClassifyOtuCommand::ClassifyOtuCommand(){
28 //initialize outputTypes
29 vector<string> tempOutNames;
30 outputTypes["constaxonomy"] = tempOutNames;
33 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
37 //**********************************************************************************************************************
38 vector<string> ClassifyOtuCommand::getRequiredParameters(){
40 string Array[] = {"list","taxonomy"};
41 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
45 m->errorOut(e, "ClassifyOtuCommand", "getRequiredParameters");
49 //**********************************************************************************************************************
50 vector<string> ClassifyOtuCommand::getRequiredFiles(){
52 vector<string> myArray;
56 m->errorOut(e, "ClassifyOtuCommand", "getRequiredFiles");
60 //**********************************************************************************************************************
61 ClassifyOtuCommand::ClassifyOtuCommand(string option) {
67 //allow user to run help
68 if (option == "help") {
71 //valid paramters for this command
72 string Array[] = {"list","label","name","taxonomy","cutoff","probs","outputdir","inputdir"};
73 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
75 OptionParser parser(option);
76 map<string, string> parameters = parser.getParameters();
78 ValidParameters validParameter;
79 map<string, string>::iterator it;
81 //check to make sure all parameters are valid for command
82 for (it = parameters.begin(); it != parameters.end(); it++) {
83 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
86 //initialize outputTypes
87 vector<string> tempOutNames;
88 outputTypes["constaxonomy"] = tempOutNames;
90 //if the user changes the input directory command factory will send this info to us in the output parameter
91 string inputDir = validParameter.validFile(parameters, "inputdir", false);
92 if (inputDir == "not found"){ inputDir = ""; }
95 it = parameters.find("list");
96 //user has given a template file
97 if(it != parameters.end()){
98 path = m->hasPath(it->second);
99 //if the user has not given a path then, add inputdir. else leave path alone.
100 if (path == "") { parameters["list"] = inputDir + it->second; }
103 it = parameters.find("name");
104 //user has given a template file
105 if(it != parameters.end()){
106 path = m->hasPath(it->second);
107 //if the user has not given a path then, add inputdir. else leave path alone.
108 if (path == "") { parameters["name"] = inputDir + it->second; }
111 it = parameters.find("taxonomy");
112 //user has given a template file
113 if(it != parameters.end()){
114 path = m->hasPath(it->second);
115 //if the user has not given a path then, add inputdir. else leave path alone.
116 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
121 //if the user changes the output directory command factory will send this info to us in the output parameter
122 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
124 //check for required parameters
125 listfile = validParameter.validFile(parameters, "list", true);
126 if (listfile == "not found") { m->mothurOut("list is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; }
127 else if (listfile == "not open") { abort = true; }
129 taxfile = validParameter.validFile(parameters, "taxonomy", true);
130 if (taxfile == "not found") { m->mothurOut("taxonomy is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; }
131 else if (taxfile == "not open") { abort = true; }
133 namefile = validParameter.validFile(parameters, "name", true);
134 if (namefile == "not open") { abort = true; }
135 else if (namefile == "not found") { namefile = ""; }
137 //check for optional parameter and set defaults
138 // ...at some point should added some additional type checking...
139 label = validParameter.validFile(parameters, "label", false);
140 if (label == "not found") { label = ""; allLines = 1; }
142 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
143 else { allLines = 1; }
146 string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; }
147 convert(temp, cutoff);
149 temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; }
150 probs = m->isTrue(temp);
153 if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; }
157 catch(exception& e) {
158 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
163 //**********************************************************************************************************************
165 void ClassifyOtuCommand::help(){
167 m->mothurOut("The classify.otu command parameters are list, taxonomy, name, cutoff, label and probs. The taxonomy and list parameters are required.\n");
168 m->mothurOut("The name parameter allows you add a names file with your taxonomy file.\n");
169 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
170 m->mothurOut("The default value for label is all labels in your inputfile.\n");
171 m->mothurOut("The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy. The default is 51, meaning 51%. Cutoff cannot be below 51.\n");
172 m->mothurOut("The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n");
173 m->mothurOut("The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n");
174 m->mothurOut("Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n");
175 m->mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n\n");
177 catch(exception& e) {
178 m->errorOut(e, "ClassifyOtuCommand", "help");
183 //**********************************************************************************************************************
185 ClassifyOtuCommand::~ClassifyOtuCommand(){}
187 //**********************************************************************************************************************
189 int ClassifyOtuCommand::execute(){
192 if (abort == true) { return 0; }
194 //if user gave a namesfile then use it
195 if (namefile != "") { readNamesFile(); }
197 //read taxonomy file and save in map for easy access in building bin trees
200 if (m->control_pressed) { return 0; }
202 input = new InputData(listfile, "list");
203 list = input->getListVector();
204 string lastLabel = list->getLabel();
206 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
207 set<string> processedLabels;
208 set<string> userLabels = labels;
210 if (m->control_pressed) { outputTypes.clear(); delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
212 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
214 if (allLines == 1 || labels.count(list->getLabel()) == 1){
216 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
218 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; }
220 processedLabels.insert(list->getLabel());
221 userLabels.erase(list->getLabel());
224 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
225 string saveLabel = list->getLabel();
228 list = input->getListVector(lastLabel);
229 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
233 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; }
235 processedLabels.insert(list->getLabel());
236 userLabels.erase(list->getLabel());
238 //restore real lastlabel to save below
239 list->setLabel(saveLabel);
242 lastLabel = list->getLabel();
245 list = input->getListVector();
248 //output error messages about any remaining user labels
249 bool needToRun = false;
250 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
251 m->mothurOut("Your file does not include the label " + (*it));
252 if (processedLabels.count(lastLabel) != 1) {
253 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
256 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
260 //run last label if you need to
261 if (needToRun == true) {
262 if (list != NULL) { delete list; }
263 list = input->getListVector(lastLabel);
264 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
269 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; }
274 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
276 m->mothurOutEndLine();
277 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
278 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
279 m->mothurOutEndLine();
283 catch(exception& e) {
284 m->errorOut(e, "ClassifyOtuCommand", "execute");
289 //**********************************************************************************************************************
290 int ClassifyOtuCommand::readNamesFile() {
294 m->openInputFile(namefile, inNames);
298 while(!inNames.eof()){
299 inNames >> name; //read from first column A
300 inNames >> names; //read from second column A,B,C,D
303 //parse names into vector
304 vector<string> theseNames;
305 m->splitAtComma(names, theseNames);
307 for (int i = 0; i < theseNames.size(); i++) { nameMap[theseNames[i]] = name; }
309 if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; }
315 catch(exception& e) {
316 m->errorOut(e, "ClassifyOtuCommand", "readNamesFile");
320 //**********************************************************************************************************************
321 int ClassifyOtuCommand::readTaxonomyFile() {
325 m->openInputFile(taxfile, in);
333 //are there confidence scores, if so remove them
334 if (tax.find_first_of('(') != -1) { removeConfidences(tax); }
338 if (m->control_pressed) { in.close(); taxMap.clear(); return 0; }
344 catch(exception& e) {
345 m->errorOut(e, "ClassifyOtuCommand", "readTaxonomyFile");
349 //**********************************************************************************************************************
350 string ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* thisList, int& size) {
353 vector<string> names;
354 map<string, string>::iterator it;
355 map<string, string>::iterator it2;
357 //parse names into vector
358 string binnames = thisList->get(bin);
359 m->splitAtComma(binnames, names);
361 //create a tree containing sequences from this bin
362 PhyloTree* phylo = new PhyloTree();
365 for (int i = 0; i < names.size(); i++) {
367 //if namesfile include the names
368 if (namefile != "") {
370 //is this sequence in the name file - namemap maps seqName -> repSeqName
371 it2 = nameMap.find(names[i]);
373 if (it2 == nameMap.end()) { //this name is not in name file, skip it
374 m->mothurOut(names[i] + " is not in your name file. I will not include it in the consensus."); m->mothurOutEndLine();
377 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
378 it = taxMap.find(it2->second);
380 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
382 if (names[i] != it2->second) { m->mothurOut(names[i] + " is represented by " + it2->second + " and is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
383 else { m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
387 phylo->addSeqToTree(names[i], it->second);
393 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
394 it = taxMap.find(names[i]);
396 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
397 m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine();
400 phylo->addSeqToTree(names[i], it->second);
406 if (m->control_pressed) { delete phylo; return conTax; }
411 phylo->assignHeirarchyIDs(0);
413 TaxNode currentNode = phylo->get(0);
416 while (currentNode.children.size() != 0) { //you still have more to explore
419 int bestChildSize = 0;
421 //go through children
422 for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
424 TaxNode temp = phylo->get(itChild->second);
426 //select child with largest accesions - most seqs assigned to it
427 if (temp.accessions.size() > bestChildSize) {
428 bestChild = phylo->get(itChild->second);
429 bestChildSize = temp.accessions.size();
434 //is this taxonomy above cutoff
435 int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
437 if (consensusConfidence >= cutoff) { //if yes, add it
439 conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
441 conTax += bestChild.name + ";";
448 currentNode = bestChild;
452 if (conTax == "") { conTax = "no_consensus;"; }
459 catch(exception& e) {
460 m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy");
465 //**********************************************************************************************************************
466 int ClassifyOtuCommand::process(ListVector* processList) {
472 if (outputDir == "") { outputDir += m->hasPath(listfile); }
475 string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".cons.taxonomy";
476 m->openOutputFile(outputFile, out);
477 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
479 out << "OTU\tSize\tTaxonomy" << endl;
481 //for each bin in the list vector
482 for (int i = 0; i < processList->getNumBins(); i++) {
484 conTax = findConsensusTaxonomy(i, processList, size);
486 if (m->control_pressed) { out.close(); return 0; }
488 //output to new names file
489 out << (i+1) << '\t' << size << '\t' << conTax << endl;
497 catch(exception& e) {
498 m->errorOut(e, "ClassifyOtuCommand", "process");
502 /**************************************************************************************************/
503 void ClassifyOtuCommand::removeConfidences(string& tax) {
509 while (tax.find_first_of(';') != -1) {
511 taxon = tax.substr(0,tax.find_first_of(';'));
513 int pos = taxon.find_first_of('(');
515 taxon = taxon.substr(0, pos); //rip off confidence
520 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
526 catch(exception& e) {
527 m->errorOut(e, "ClassifyOtuCommand", "removeConfidences");
531 //**********************************************************************************************************************