From: westcott Date: Wed, 2 Jun 2010 14:03:39 +0000 (+0000) Subject: added classify.otu command X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=fc9c3ccee1fba93d9df72c26baa22628268fe79f added classify.otu command --- diff --git a/classifyotucommand.cpp b/classifyotucommand.cpp new file mode 100644 index 0000000..5e7e787 --- /dev/null +++ b/classifyotucommand.cpp @@ -0,0 +1,469 @@ +/* + * classifyotucommand.cpp + * Mothur + * + * Created by westcott on 6/1/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "classifyotucommand.h" +#include "phylotree.h" + + +//********************************************************************************************************************** +ClassifyOtuCommand::ClassifyOtuCommand(string option) { + try{ + abort = false; + allLines = 1; + labels.clear(); + + //allow user to run help + if (option == "help") { + help(); abort = true; + } else { + //valid paramters for this command + string Array[] = {"list","label","name","taxonomy","cutoff","probs","outputdir","inputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("list"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["list"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + + it = parameters.find("taxonomy"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["taxonomy"] = inputDir + it->second; } + } + } + + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + //check for required parameters + listfile = validParameter.validFile(parameters, "list", true); + if (listfile == "not found") { m->mothurOut("list is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; } + else if (listfile == "not open") { abort = true; } + + taxfile = validParameter.validFile(parameters, "taxonomy", true); + if (taxfile == "not found") { m->mothurOut("taxonomy is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; } + else if (taxfile == "not open") { abort = true; } + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { abort = true; } + else if (namefile == "not found") { namefile = ""; } + + //check for optional parameter and set defaults + // ...at some point should added some additional type checking... + label = validParameter.validFile(parameters, "label", false); + if (label == "not found") { label = ""; allLines = 1; } + else { + if(label != "all") { splitAtDash(label, labels); allLines = 0; } + else { allLines = 1; } + } + + string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; } + convert(temp, cutoff); + + temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; } + probs = isTrue(temp); + + + if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; } + + } + } + catch(exception& e) { + m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand"); + exit(1); + } +} + +//********************************************************************************************************************** + +void ClassifyOtuCommand::help(){ + try { + m->mothurOut("The classify.otu command parameters are list, taxonomy, name, cutoff, and label. The taxonomy and list parameters are required.\n"); + m->mothurOut("The name parameter allows you add a names file with your taxonomy file.\n"); + m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n"); + m->mothurOut("The default value for label is all labels in your inputfile.\n"); + m->mothurOut("The cutoff parameter allows you to specify a concensus confidence threshold for your taxonomy. The default is 51, meaning 51%. Cutoff cannot be below 51.\n"); + m->mothurOut("The probs parameter shuts off the outputting of the concensus confidence results. The default is true, meaning you want the confidence to be shown.\n"); + m->mothurOut("The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n"); + m->mothurOut("Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n\n"); + } + catch(exception& e) { + m->errorOut(e, "ClassifyOtuCommand", "help"); + exit(1); + } +} + +//********************************************************************************************************************** + +ClassifyOtuCommand::~ClassifyOtuCommand(){} + +//********************************************************************************************************************** + +int ClassifyOtuCommand::execute(){ + try { + + if (abort == true) { return 0; } + + //if user gave a namesfile then use it + if (namefile != "") { readNamesFile(); } + + //read taxonomy file and save in map for easy access in building bin trees + readTaxonomyFile(); + + if (m->control_pressed) { return 0; } + + input = new InputData(listfile, "list"); + list = input->getListVector(); + string lastLabel = list->getLabel(); + + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. + set processedLabels; + set userLabels = labels; + + if (m->control_pressed) { delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + + while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { + + if (allLines == 1 || labels.count(list->getLabel()) == 1){ + + m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine(); + process(list); + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; } + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + } + + if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = list->getLabel(); + + delete list; + list = input->getListVector(lastLabel); + m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine(); + process(list); + + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; } + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + + //restore real lastlabel to save below + list->setLabel(saveLabel); + } + + lastLabel = list->getLabel(); + + delete list; + list = input->getListVector(); + } + + //output error messages about any remaining user labels + bool needToRun = false; + for (set::iterator it = userLabels.begin(); it != userLabels.end(); it++) { + m->mothurOut("Your file does not include the label " + (*it)); + if (processedLabels.count(lastLabel) != 1) { + m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine(); + needToRun = true; + }else { + m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + if (list != NULL) { delete list; } + list = input->getListVector(lastLabel); + m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine(); + + process(list); + delete list; + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; } + } + + delete input; + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ClassifyOtuCommand", "execute"); + exit(1); + } +} + +//********************************************************************************************************************** +int ClassifyOtuCommand::readNamesFile() { + try { + + ifstream inNames; + openInputFile(namefile, inNames); + + string name, names; + + while(inNames){ + inNames >> name; //read from first column A + inNames >> names; //read from second column A,B,C,D + gobble(inNames); + + nameMap[name] = names; + + if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; } + } + inNames.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ClassifyOtuCommand", "readNamesFile"); + exit(1); + } +} +//********************************************************************************************************************** +int ClassifyOtuCommand::readTaxonomyFile() { + try { + + ifstream in; + openInputFile(taxfile, in); + + string name, tax; + + while(!in.eof()){ + in >> name >> tax; + gobble(in); + + //are there confidence scores, if so remove them + if (tax.find_first_of('(') != -1) { removeConfidences(tax); } + + taxMap[name] = tax; + + if (m->control_pressed) { in.close(); taxMap.clear(); return 0; } + } + in.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ClassifyOtuCommand", "readTaxonomyFile"); + exit(1); + } +} +//********************************************************************************************************************** +string ClassifyOtuCommand::findConcensusTaxonomy(int bin, ListVector* thisList, int& size) { + try{ + string conTax = ""; + vector names; + map::iterator it; + map::iterator it2; + + //parse names into vector + string binnames = thisList->get(bin); + splitAtComma(binnames, names); + + //create a tree containing sequences from this bin + PhyloTree* phylo = new PhyloTree(); + + size = 0; + for (int i = 0; i < names.size(); i++) { + + if (m->control_pressed) { delete phylo; return conTax; } + + //is this sequence in the taxonomy file + it = taxMap.find(names[i]); + + if (it == taxMap.end()) { //this name is not in taxonomy file, skip it + m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the concensus."); m->mothurOutEndLine(); + }else{ + + //if namesfile include the names + if (namefile != "") { + //is this sequence in the name file + it2 = nameMap.find(names[i]); + + if (it2 == nameMap.end()) { //this name is not in name file, skip it + m->mothurOut(names[i] + " is not in your name file. I will not include it in the concensus."); m->mothurOutEndLine(); + }else{ + + vector nameFileNames; + splitAtComma(it2->second, nameFileNames); + + for (int j = 0; j < nameFileNames.size(); j++) { + //add seq to tree + phylo->addSeqToTree(nameFileNames[j], it->second); + size++; + } + } + + }else{ + //add seq to tree + phylo->addSeqToTree(names[i], it->second); + size++; + } + } + } + + //build tree + phylo->assignHeirarchyIDs(0); + + TaxNode currentNode = phylo->get(0); + + //at each level + while (currentNode.children.size() != 0) { //you still have more to explore + + TaxNode bestChild; + int bestChildSize = 0; + + //go through children + for (map::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) { + + TaxNode temp = phylo->get(itChild->second); + + //select child with largest accesions - most seqs assigned to it + if (temp.accessions.size() > bestChildSize) { + bestChild = phylo->get(itChild->second); + bestChildSize = temp.accessions.size(); + } + + } + + //is this taxonomy above cutoff + int concensusConfidence = ceil((bestChildSize / (float) size) * 100); + + if (concensusConfidence >= cutoff) { //if yes, add it + if (probs) { + conTax += bestChild.name + "(" + toString(concensusConfidence) + ");"; + }else{ + conTax += bestChild.name + ";"; + } + }else{ //if no, quit + break; + } + + //move down a level + currentNode = bestChild; + } + + + if (conTax == "") { conTax = "unclassified;"; } + + delete phylo; + + return conTax; + + } + catch(exception& e) { + m->errorOut(e, "ClassifyOtuCommand", "findConcensusTaxonomy"); + exit(1); + } +} + +//********************************************************************************************************************** +int ClassifyOtuCommand::process(ListVector* processList) { + try{ + string conTax; + int size; + + //create output file + if (outputDir == "") { outputDir += hasPath(listfile); } + + ofstream out; + string outputFile = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".cons.taxonomy"; + openOutputFile(outputFile, out); + outputNames.push_back(outputFile); + + //for each bin in the list vector + for (int i = 0; i < processList->getNumBins(); i++) { + conTax = findConcensusTaxonomy(i, processList, size); + + if (m->control_pressed) { out.close(); return 0; } + + //output to new names file + out << (i+1) << '\t' << size << '\t' << conTax << endl; + } + + out.close(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ClassifyOtuCommand", "process"); + exit(1); + } +} +/**************************************************************************************************/ +void ClassifyOtuCommand::removeConfidences(string& tax) { + try { + + string taxon; + string newTax = ""; + + while (tax.find_first_of(';') != -1) { + //get taxon + taxon = tax.substr(0,tax.find_first_of(';')); + + int pos = taxon.find_first_of('('); + if (pos != -1) { + taxon = taxon.substr(0, pos); //rip off confidence + } + + taxon += ";"; + + tax = tax.substr(tax.find_first_of(';')+1, tax.length()); + newTax += taxon; + } + + tax = newTax; + } + catch(exception& e) { + m->errorOut(e, "ClassifyOtuCommand", "removeConfidences"); + exit(1); + } +} +//********************************************************************************************************************** + + diff --git a/classifyotucommand.h b/classifyotucommand.h new file mode 100644 index 0000000..0ac0288 --- /dev/null +++ b/classifyotucommand.h @@ -0,0 +1,49 @@ +#ifndef CLASSIFYOTUSCOMMAND_H +#define CLASSIFYOTUSCOMMAND_H + +/* + * classifyotucommand.h + * Mothur + * + * Created by westcott on 6/1/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "command.hpp" +#include "listvector.hpp" +#include "inputdata.h" + + +class ClassifyOtuCommand : public Command { + +public: + ClassifyOtuCommand(string); + ~ClassifyOtuCommand(); + int execute(); + void help(); + +private: + + ListVector* list; + InputData* input; + string listfile, namefile, taxfile, label, outputDir; + bool abort, allLines, probs; + int cutoff; + set labels; //holds labels to be used + vector outputNames; + map nameMap; + map taxMap; + + int readNamesFile(); + int readTaxonomyFile(); + void removeConfidences(string&); + int process(ListVector*); + string findConcensusTaxonomy(int, ListVector*, int&); // returns the name of the "representative" taxonomy of given bin + + +}; + +#endif + +