]> git.donarmstrong.com Git - mothur.git/commitdiff
added classify.otu command
authorwestcott <westcott>
Wed, 2 Jun 2010 14:03:39 +0000 (14:03 +0000)
committerwestcott <westcott>
Wed, 2 Jun 2010 14:03:39 +0000 (14:03 +0000)
classifyotucommand.cpp [new file with mode: 0644]
classifyotucommand.h [new file with mode: 0644]

diff --git a/classifyotucommand.cpp b/classifyotucommand.cpp
new file mode 100644 (file)
index 0000000..5e7e787
--- /dev/null
@@ -0,0 +1,469 @@
+/*
+ *  classifyotucommand.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 6/1/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "classifyotucommand.h"
+#include "phylotree.h"
+
+
+//**********************************************************************************************************************
+ClassifyOtuCommand::ClassifyOtuCommand(string option)  {
+       try{
+               abort = false;
+               allLines = 1;
+               labels.clear();
+                               
+               //allow user to run help
+               if (option == "help") { 
+                       help(); abort = true;
+               } else {
+                       //valid paramters for this command
+                       string Array[] =  {"list","label","name","taxonomy","cutoff","probs","outputdir","inputdir"};
+                       vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+                       
+                       OptionParser parser(option);
+                       map<string, string> parameters = parser.getParameters();
+                       
+                       ValidParameters validParameter;
+                       map<string, string>::iterator it;
+               
+                       //check to make sure all parameters are valid for command
+                       for (it = parameters.begin(); it != parameters.end(); it++) { 
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       //if the user changes the input directory command factory will send this info to us in the output parameter 
+                       string inputDir = validParameter.validFile(parameters, "inputdir", false);              
+                       if (inputDir == "not found"){   inputDir = "";          }
+                       else {
+                               string path;
+                               it = parameters.find("list");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["list"] = inputDir + it->second;             }
+                               }
+                               
+                               it = parameters.find("name");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["name"] = inputDir + it->second;             }
+                               }
+                               
+                               it = parameters.find("taxonomy");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
+                               }
+                       }
+
+                       
+                       //if the user changes the output directory command factory will send this info to us in the output parameter 
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
+                       
+                       //check for required parameters
+                       listfile = validParameter.validFile(parameters, "list", true);
+                       if (listfile == "not found") { m->mothurOut("list is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; }
+                       else if (listfile == "not open") { abort = true; }      
+                       
+                       taxfile = validParameter.validFile(parameters, "taxonomy", true);
+                       if (taxfile == "not found") {  m->mothurOut("taxonomy is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; }
+                       else if (taxfile == "not open") { abort = true; }       
+       
+                       namefile = validParameter.validFile(parameters, "name", true);
+                       if (namefile == "not open") { abort = true; }   
+                       else if (namefile == "not found") { namefile = ""; }
+                       
+                       //check for optional parameter and set defaults
+                       // ...at some point should added some additional type checking...
+                       label = validParameter.validFile(parameters, "label", false);                   
+                       if (label == "not found") { label = ""; allLines = 1;  }
+                       else { 
+                               if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
+                               else { allLines = 1;  }
+                       }
+                       
+                       string temp = validParameter.validFile(parameters, "cutoff", false);                    if (temp == "not found") { temp = "51"; }
+                       convert(temp, cutoff); 
+                       
+                       temp = validParameter.validFile(parameters, "probs", false);                                    if (temp == "not found"){       temp = "true";                  }
+                       probs = isTrue(temp);
+                       
+                       
+                       if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true;  }
+                       
+               }
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
+               exit(1);
+       }
+}
+
+//**********************************************************************************************************************
+
+void ClassifyOtuCommand::help(){
+       try {
+               m->mothurOut("The classify.otu command parameters are list, taxonomy, name, cutoff, and label.  The taxonomy and list parameters are required.\n");
+               m->mothurOut("The name parameter allows you add a names file with your taxonomy file.\n");
+               m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
+               m->mothurOut("The default value for label is all labels in your inputfile.\n");
+               m->mothurOut("The cutoff parameter allows you to specify a concensus confidence threshold for your taxonomy.  The default is 51, meaning 51%. Cutoff cannot be below 51.\n");
+               m->mothurOut("The probs parameter shuts off the outputting of the concensus confidence results. The default is true, meaning you want the confidence to be shown.\n");
+               m->mothurOut("The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n");
+               m->mothurOut("Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n");
+               m->mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n\n");
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClassifyOtuCommand", "help");
+               exit(1);
+       }
+}
+
+//**********************************************************************************************************************
+
+ClassifyOtuCommand::~ClassifyOtuCommand(){}
+
+//**********************************************************************************************************************
+
+int ClassifyOtuCommand::execute(){
+       try {
+       
+               if (abort == true) { return 0; }
+               
+               //if user gave a namesfile then use it
+               if (namefile != "") {   readNamesFile();        }
+               
+               //read taxonomy file and save in map for easy access in building bin trees
+               readTaxonomyFile();
+               
+               if (m->control_pressed) { return 0; }
+               
+               input = new InputData(listfile, "list");
+               list = input->getListVector();
+               string lastLabel = list->getLabel();
+
+               //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+               set<string> processedLabels;
+               set<string> userLabels = labels;
+               
+               if (m->control_pressed) { delete input; delete list; for (int i = 0; i < outputNames.size(); i++) {     remove(outputNames[i].c_str());  }  return 0; }
+       
+               while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+                       
+                       if (allLines == 1 || labels.count(list->getLabel()) == 1){
+                       
+                                       m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
+                                       process(list);
+                                       if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());  } delete input; delete list; return 0; }
+                                                                               
+                                       processedLabels.insert(list->getLabel());
+                                       userLabels.erase(list->getLabel());
+                       }
+                       
+                       if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+                                       string saveLabel = list->getLabel();
+                                       
+                                       delete list;
+                                       list = input->getListVector(lastLabel);
+                                       m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
+                                       process(list);
+                               
+                                       
+                                       if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());  } delete input; delete list; return 0; }
+                                                                               
+                                       processedLabels.insert(list->getLabel());
+                                       userLabels.erase(list->getLabel());
+                                       
+                                       //restore real lastlabel to save below
+                                       list->setLabel(saveLabel);
+                       }
+                       
+                       lastLabel = list->getLabel();
+       
+                       delete list;
+                       list = input->getListVector();
+               }
+               
+               //output error messages about any remaining user labels
+               bool needToRun = false;
+               for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
+                       m->mothurOut("Your file does not include the label " + (*it)); 
+                       if (processedLabels.count(lastLabel) != 1) {
+                               m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+                               needToRun = true;
+                       }else {
+                               m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+                       }
+               }
+               
+               //run last label if you need to
+               if (needToRun == true)  {
+                       if (list != NULL) {     delete list;    }
+                       list = input->getListVector(lastLabel);
+                       m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
+                       
+                       process(list);
+                       delete list;
+                       
+                       if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());  } delete input; delete list; return 0; }
+               }
+               
+               delete input;  
+                               
+               if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str());  } return 0; }
+               
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
+               m->mothurOutEndLine();
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClassifyOtuCommand", "execute");
+               exit(1);
+       }
+}
+
+//**********************************************************************************************************************
+int ClassifyOtuCommand::readNamesFile() {
+       try {
+               
+               ifstream inNames;
+               openInputFile(namefile, inNames);
+               
+               string name, names;
+       
+               while(inNames){
+                       inNames >> name;                        //read from first column  A
+                       inNames >> names;               //read from second column  A,B,C,D
+                       gobble(inNames);
+                       
+                       nameMap[name] = names;
+                       
+                       if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; }
+               }
+               inNames.close();
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClassifyOtuCommand", "readNamesFile");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int ClassifyOtuCommand::readTaxonomyFile() {
+       try {
+               
+               ifstream in;
+               openInputFile(taxfile, in);
+               
+               string name, tax;
+       
+               while(!in.eof()){
+                       in >> name >> tax;              
+                       gobble(in);
+                       
+                       //are there confidence scores, if so remove them
+                       if (tax.find_first_of('(') != -1) {  removeConfidences(tax);    }
+                       
+                       taxMap[name] = tax;
+                       
+                       if (m->control_pressed) { in.close(); taxMap.clear(); return 0; }
+               }
+               in.close();
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClassifyOtuCommand", "readTaxonomyFile");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+string ClassifyOtuCommand::findConcensusTaxonomy(int bin, ListVector* thisList, int& size) {
+       try{
+               string conTax = "";
+               vector<string> names;
+               map<string, string>::iterator it;
+               map<string, string>::iterator it2;
+
+               //parse names into vector
+               string binnames = thisList->get(bin);
+               splitAtComma(binnames, names);
+
+               //create a tree containing sequences from this bin
+               PhyloTree* phylo = new PhyloTree();
+               
+               size = 0;
+               for (int i = 0; i < names.size(); i++) {
+                       
+                       if (m->control_pressed) { delete phylo; return conTax; }
+                       
+                       //is this sequence in the taxonomy file
+                       it = taxMap.find(names[i]);
+                       
+                       if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
+                               m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the concensus."); m->mothurOutEndLine();
+                       }else{
+                               
+                               //if namesfile include the names
+                               if (namefile != "") {
+                                       //is this sequence in the name file
+                                       it2 = nameMap.find(names[i]);
+                                       
+                                       if (it2 == nameMap.end()) { //this name is not in name file, skip it
+                                               m->mothurOut(names[i] + " is not in your name file.  I will not include it in the concensus."); m->mothurOutEndLine();
+                                       }else{
+                                               
+                                               vector<string> nameFileNames;
+                                               splitAtComma(it2->second, nameFileNames);
+                                               
+                                               for (int j = 0; j < nameFileNames.size(); j++) {
+                                                       //add seq to tree
+                                                       phylo->addSeqToTree(nameFileNames[j], it->second);
+                                                       size++;
+                                               }
+                                       }
+                                       
+                               }else{
+                                       //add seq to tree
+                                       phylo->addSeqToTree(names[i], it->second);
+                                       size++;
+                               }
+                       }
+               }
+               
+               //build tree
+               phylo->assignHeirarchyIDs(0);
+               
+               TaxNode currentNode = phylo->get(0);
+               
+               //at each level
+               while (currentNode.children.size() != 0) { //you still have more to explore
+               
+                       TaxNode bestChild;
+                       int bestChildSize = 0;
+                       
+                       //go through children
+                       for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
+                               
+                               TaxNode temp = phylo->get(itChild->second);
+                               
+                               //select child with largest accesions - most seqs assigned to it
+                               if (temp.accessions.size() > bestChildSize) {
+                                       bestChild = phylo->get(itChild->second);
+                                       bestChildSize = temp.accessions.size();
+                               }
+                               
+                       }
+                               
+                       //is this taxonomy above cutoff
+                       int concensusConfidence = ceil((bestChildSize / (float) size) * 100);
+                       
+                       if (concensusConfidence >= cutoff) { //if yes, add it
+                               if (probs) {
+                                       conTax += bestChild.name + "(" + toString(concensusConfidence) + ");";
+                               }else{
+                                       conTax += bestChild.name + ";";
+                               }
+                       }else{ //if no, quit
+                               break;
+                       }
+                       
+                       //move down a level
+                       currentNode = bestChild;
+               }
+               
+                               
+               if (conTax == "") {  conTax = "unclassified;";  }
+               
+               delete phylo;   
+               
+               return conTax;
+                       
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClassifyOtuCommand", "findConcensusTaxonomy");
+               exit(1);
+       }
+}
+
+//**********************************************************************************************************************
+int ClassifyOtuCommand::process(ListVector* processList) {
+       try{
+               string conTax;
+               int size;
+               
+               //create output file
+               if (outputDir == "") { outputDir += hasPath(listfile); }
+                               
+               ofstream out;
+               string outputFile = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".cons.taxonomy";
+               openOutputFile(outputFile, out);
+               outputNames.push_back(outputFile);
+               
+               //for each bin in the list vector
+               for (int i = 0; i < processList->getNumBins(); i++) {
+                       conTax  = findConcensusTaxonomy(i, processList, size);
+                       
+                       if (m->control_pressed) { out.close();  return 0; }
+                       
+                       //output to new names file
+                       out << (i+1) << '\t' << size << '\t' << conTax << endl;
+               }
+
+               out.close();
+               
+               return 0;
+
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClassifyOtuCommand", "process");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+void ClassifyOtuCommand::removeConfidences(string& tax) {
+       try {
+               
+               string taxon;
+               string newTax = "";
+               
+               while (tax.find_first_of(';') != -1) {
+                       //get taxon
+                       taxon = tax.substr(0,tax.find_first_of(';'));
+                       
+                       int pos = taxon.find_first_of('(');
+                       if (pos != -1) {
+                               taxon = taxon.substr(0, pos); //rip off confidence 
+                       }
+                       
+                       taxon += ";";
+                       
+                       tax = tax.substr(tax.find_first_of(';')+1, tax.length());
+                       newTax += taxon;
+               }
+               
+               tax = newTax;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClassifyOtuCommand", "removeConfidences");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+
diff --git a/classifyotucommand.h b/classifyotucommand.h
new file mode 100644 (file)
index 0000000..0ac0288
--- /dev/null
@@ -0,0 +1,49 @@
+#ifndef CLASSIFYOTUSCOMMAND_H
+#define CLASSIFYOTUSCOMMAND_H
+
+/*
+ *  classifyotucommand.h
+ *  Mothur
+ *
+ *  Created by westcott on 6/1/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+#include "command.hpp"
+#include "listvector.hpp"
+#include "inputdata.h"
+
+
+class ClassifyOtuCommand : public Command {
+
+public:
+       ClassifyOtuCommand(string);
+       ~ClassifyOtuCommand();
+       int execute();
+       void help();
+
+private:
+
+       ListVector* list;
+       InputData* input;
+       string listfile, namefile, taxfile, label, outputDir;
+       bool abort, allLines, probs;
+       int cutoff;
+       set<string> labels; //holds labels to be used
+       vector<string> outputNames;
+       map<string, string> nameMap;
+       map<string, string> taxMap;
+
+       int readNamesFile();
+       int readTaxonomyFile();
+       void removeConfidences(string&);
+       int process(ListVector*);
+       string findConcensusTaxonomy(int, ListVector*, int&);   // returns the name of the "representative" taxonomy of given bin
+       
+                                                                                               
+};
+
+#endif
+
+