From: westcott Date: Fri, 23 Sep 2011 15:37:40 +0000 (+0000) Subject: added summary.tax command and fixed bug with root level totals in tax.summary file X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=763d07b1c215b1bdc9d5d63431f78cfecc60acf5 added summary.tax command and fixed bug with root level totals in tax.summary file --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 3ccc0b9..b09e93b 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -324,6 +324,7 @@ A7FE7C401330EA1000F7B327 /* getcurrentcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FE7C3F1330EA1000F7B327 /* getcurrentcommand.cpp */; }; A7FE7E6D13311EA400F7B327 /* setcurrentcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FE7E6C13311EA400F7B327 /* setcurrentcommand.cpp */; }; A7FF19F2140FFDA500AD216D /* trimoligos.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FF19F1140FFDA500AD216D /* trimoligos.cpp */; }; + A7FFB558142CA02C004884F2 /* summarytaxcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FFB557142CA02C004884F2 /* summarytaxcommand.cpp */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -1001,6 +1002,8 @@ A7FE7E6C13311EA400F7B327 /* setcurrentcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = setcurrentcommand.cpp; sourceTree = ""; }; A7FF19F0140FFDA500AD216D /* trimoligos.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = trimoligos.h; sourceTree = ""; }; A7FF19F1140FFDA500AD216D /* trimoligos.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = trimoligos.cpp; sourceTree = ""; }; + A7FFB556142CA02C004884F2 /* summarytaxcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = summarytaxcommand.h; sourceTree = ""; }; + A7FFB557142CA02C004884F2 /* summarytaxcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = summarytaxcommand.cpp; sourceTree = ""; }; C6A0FF2C0290799A04C91782 /* mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = mothur.1; sourceTree = ""; }; /* End PBXFileReference section */ @@ -1390,9 +1393,9 @@ A7E9B7E412D37EC400DA6239 /* sffinfocommand.h */, A7E9B7E312D37EC400DA6239 /* sffinfocommand.cpp */, A7E9B7F312D37EC400DA6239 /* sharedcommand.h */, + A7E9B7F212D37EC400DA6239 /* sharedcommand.cpp */, A7E9B82812D37EC400DA6239 /* shhhercommand.h */, A7E9B82712D37EC400DA6239 /* shhhercommand.cpp */, - A7E9B7F212D37EC400DA6239 /* sharedcommand.cpp */, A7E9B84012D37EC400DA6239 /* splitabundcommand.h */, A7E9B83F12D37EC400DA6239 /* splitabundcommand.cpp */, A7E9B84212D37EC400DA6239 /* splitgroupscommand.h */, @@ -1403,6 +1406,8 @@ A7E9B85712D37EC400DA6239 /* summarycommand.cpp */, A7E9B85A12D37EC400DA6239 /* summarysharedcommand.h */, A7E9B85912D37EC400DA6239 /* summarysharedcommand.cpp */, + A7FFB556142CA02C004884F2 /* summarytaxcommand.h */, + A7FFB557142CA02C004884F2 /* summarytaxcommand.cpp */, A7E9B85C12D37EC400DA6239 /* systemcommand.h */, A7E9B85B12D37EC400DA6239 /* systemcommand.cpp */, A7E9B86312D37EC400DA6239 /* treegroupscommand.h */, @@ -2149,6 +2154,7 @@ A795840D13F13CD900F201D5 /* countgroupscommand.cpp in Sources */, A7FF19F2140FFDA500AD216D /* trimoligos.cpp in Sources */, A7F9F5CF141A5E500032F693 /* sequenceparser.cpp in Sources */, + A7FFB558142CA02C004884F2 /* summarytaxcommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/commandfactory.cpp b/commandfactory.cpp index 8e61e6e..b826167 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -122,6 +122,7 @@ #include "countseqscommand.h" #include "countgroupscommand.h" #include "clearmemorycommand.h" +#include "summarytaxcommand.h" /*******************************************************/ @@ -265,6 +266,7 @@ CommandFactory::CommandFactory(){ commands["shhh.seqs"] = "MPIEnabled"; commands["sens.spec"] = "sens.spec"; commands["seq.error"] = "seq.error"; + commands["seq.error"] = "summary.tax"; commands["quit"] = "MPIEnabled"; } @@ -421,6 +423,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "count.seqs") { command = new CountSeqsCommand(optionString); } else if(commandName == "count.groups") { command = new CountGroupsCommand(optionString); } else if(commandName == "clear.memory") { command = new ClearMemoryCommand(optionString); } + else if(commandName == "summary.tax") { command = new SummaryTaxCommand(optionString); } else { command = new NoCommand(optionString); } return command; @@ -561,6 +564,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "count.seqs") { pipecommand = new CountSeqsCommand(optionString); } else if(commandName == "count.groups") { pipecommand = new CountGroupsCommand(optionString); } else if(commandName == "clear.memory") { pipecommand = new ClearMemoryCommand(optionString); } + else if(commandName == "summary.tax") { pipecommand = new SummaryTaxCommand(optionString); } else { pipecommand = new NoCommand(optionString); } return pipecommand; @@ -689,6 +693,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "count.seqs") { shellcommand = new CountSeqsCommand(); } else if(commandName == "count.groups") { shellcommand = new CountGroupsCommand(); } else if(commandName == "clear.memory") { shellcommand = new ClearMemoryCommand(); } + else if(commandName == "summary.tax") { shellcommand = new SummaryTaxCommand(); } else { shellcommand = new NoCommand(); } return shellcommand; diff --git a/mothurout.cpp b/mothurout.cpp index 45d1176..df1a1c0 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -1349,7 +1349,35 @@ int MothurOut::readNames(string namefile, map& nameMap) { exit(1); } } - +/**********************************************************************************************************************/ +int MothurOut::readNames(string namefile, map >& nameMap) { + try { + + //open input file + ifstream in; + openInputFile(namefile, in); + + while (!in.eof()) { + if (control_pressed) { break; } + + string firstCol, secondCol; + in >> firstCol >> secondCol; gobble(in); + + vector temp; + splitAtComma(secondCol, temp); + + nameMap[firstCol] = temp; + } + in.close(); + + return 0; + + } + catch(exception& e) { + errorOut(e, "MothurOut", "readNames"); + exit(1); + } +} /**********************************************************************************************************************/ map MothurOut::readNames(string namefile) { try { diff --git a/mothurout.h b/mothurout.h index 3019386..0c45a30 100644 --- a/mothurout.h +++ b/mothurout.h @@ -81,6 +81,7 @@ class MothurOut { void gobble(istringstream&); map readNames(string); int readNames(string, map&); + int readNames(string, map >&); int readNames(string, vector&, map&); void mothurRemove(string); diff --git a/phylosummary.cpp b/phylosummary.cpp index 58274b3..47591f6 100644 --- a/phylosummary.cpp +++ b/phylosummary.cpp @@ -71,7 +71,7 @@ PhyloSummary::PhyloSummary(string groupFile){ } /**************************************************************************************************/ -void PhyloSummary::summarize(string userTfile){ +int PhyloSummary::summarize(string userTfile){ try { ifstream in; @@ -79,14 +79,18 @@ void PhyloSummary::summarize(string userTfile){ //read in users taxonomy file and add sequences to tree string name, tax; + int numSeqs = 0; while(!in.eof()){ in >> name >> tax; m->gobble(in); addSeqToTree(name, tax); + numSeqs++; if (m->control_pressed) { break; } } in.close(); + + return numSeqs; } catch(exception& e) { m->errorOut(e, "PhyloSummary", "summarize"); @@ -126,6 +130,9 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ int level = 0; + //are there confidence scores, if so remove them + if (seqTaxonomy.find_first_of('(') != -1) { removeConfidences(seqTaxonomy); } + while (seqTaxonomy != "") { if (m->control_pressed) { return 0; } @@ -221,6 +228,9 @@ int PhyloSummary::addSeqToTree(string seqTaxonomy, vector names){ int level = 0; + //are there confidence scores, if so remove them + if (seqTaxonomy.find_first_of('(') != -1) { removeConfidences(seqTaxonomy); } + while (seqTaxonomy != "") { if (m->control_pressed) { return 0; } @@ -361,16 +371,25 @@ void PhyloSummary::print(ofstream& out){ out << endl; int totalChildrenInTree = 0; + map::iterator itGroup; map::iterator it; for(it=tree[0].children.begin();it!=tree[0].children.end();it++){ - if (tree[it->second].total != 0) { totalChildrenInTree++; } + if (tree[it->second].total != 0) { + totalChildrenInTree++; + tree[0].total += tree[it->second].total; + + if (groupmap != NULL) { + vector mGroups = groupmap->getNamesOfGroups(); + for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } + } + } } //print root out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t"; - map::iterator itGroup; + if (groupmap != NULL) { //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) { // out << itGroup->second << '\t'; @@ -472,11 +491,39 @@ void PhyloSummary::readTreeStruct(ifstream& in){ } catch(exception& e) { - m->errorOut(e, "PhyloSummary", "print"); + m->errorOut(e, "PhyloSummary", "readTreeStruct"); + exit(1); + } +} +/**************************************************************************************************/ +void PhyloSummary::removeConfidences(string& tax) { + try { + + string taxon; + string newTax = ""; + + while (tax.find_first_of(';') != -1) { + //get taxon + taxon = tax.substr(0,tax.find_first_of(';')); + + int pos = taxon.find_first_of('('); + if (pos != -1) { + taxon = taxon.substr(0, pos); //rip off confidence + } + + taxon += ";"; + + tax = tax.substr(tax.find_first_of(';')+1, tax.length()); + newTax += taxon; + } + + tax = newTax; + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "removeConfidences"); exit(1); } } - /**************************************************************************************************/ diff --git a/phylosummary.h b/phylosummary.h index 04ba65f..3adcc3f 100644 --- a/phylosummary.h +++ b/phylosummary.h @@ -36,7 +36,7 @@ public: PhyloSummary(string, string); ~PhyloSummary() { if (groupmap != NULL) { delete groupmap; } } - void summarize(string); //pass it a taxonomy file and a group file and it makes the tree + int summarize(string); //pass it a taxonomy file and a group file and it makes the tree int addSeqToTree(string, string); int addSeqToTree(string, vector); void print(ofstream&); @@ -55,6 +55,8 @@ private: int numSeqs; int maxLevel; MothurOut* m; + + void removeConfidences(string&); }; /**************************************************************************************************/ diff --git a/summarytaxcommand.cpp b/summarytaxcommand.cpp new file mode 100644 index 0000000..a19986a --- /dev/null +++ b/summarytaxcommand.cpp @@ -0,0 +1,247 @@ +/* + * summarytaxcommand.cpp + * Mothur + * + * Created by westcott on 9/23/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "summarytaxcommand.h" +#include "phylosummary.h" + +//********************************************************************************************************************** +vector SummaryTaxCommand::setParameters(){ + try { + CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy); + CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(preftaxonomy); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, "SummaryTaxCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string SummaryTaxCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The summary.tax command reads a taxonomy file and an optional name file, and summarizes the taxonomy information.\n"; + helpString += "The summary.tax command parameters are taxonomy, group and name. taxonomy is required, unless you have a valid current taxonomy file.\n"; + helpString += "The name parameter allows you to enter a name file associated with your taxonomy file. \n"; + helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n"; + helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. It is not required, but providing it will keep the rankIDs in the summary file static.\n"; + helpString += "The summary.tax command should be in the following format: \n"; + helpString += "summary.tax(taxonomy=yourTaxonomyFile) \n"; + helpString += "Note: No spaces between parameter labels (i.e. taxonomy), '=' and parameters (i.e.yourTaxonomyFile).\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "SummaryTaxCommand", "getHelpString"); + exit(1); + } +} + +//********************************************************************************************************************** +SummaryTaxCommand::SummaryTaxCommand(){ + try { + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["summary"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "SummaryTaxCommand", "SummaryTaxCommand"); + exit(1); + } +} +//*************************************************************************************************************** + +SummaryTaxCommand::SummaryTaxCommand(string option) { + try { + abort = false; calledHelp = false; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} + + else { + vector myArray = setParameters(); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("taxonomy"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["taxonomy"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } + + it = parameters.find("reftaxonomy"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["reftaxonomy"] = inputDir + it->second; } + } + + } + + //initialize outputTypes + vector tempOutNames; + outputTypes["summary"] = tempOutNames; + + //check for required parameters + taxfile = validParameter.validFile(parameters, "taxonomy", true); + if (taxfile == "not open") { abort = true; } + else if (taxfile == "not found") { + taxfile = m->getTaxonomyFile(); + if (taxfile != "") { m->mothurOut("Using " + taxfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current taxonomy file and the taxonomy parameter is required."); m->mothurOutEndLine(); abort = true; } + }else { m->setTaxonomyFile(taxfile); } + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { namefile = ""; abort = true; } + else if (namefile == "not found") { namefile = ""; } + else { m->setNameFile(namefile); } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not open") { groupfile = ""; abort = true; } + else if (groupfile == "not found") { groupfile = ""; } + else { m->setGroupFile(groupfile); } + + refTaxonomy = validParameter.validFile(parameters, "reftaxonomy", true); + if (refTaxonomy == "not found") { refTaxonomy = ""; m->mothurOut("reftaxonomy is not required, but if given will keep the rankIDs in the summary file static."); m->mothurOutEndLine(); } + else if (refTaxonomy == "not open") { refTaxonomy = ""; abort = true; } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = ""; + outputDir += m->hasPath(taxfile); //if user entered a file with a path then preserve it + } + } + } + catch(exception& e) { + m->errorOut(e, "SummaryTaxCommand", "SummaryTaxCommand"); + exit(1); + } +} +//*************************************************************************************************************** + +int SummaryTaxCommand::execute(){ + try{ + + if (abort == true) { if (calledHelp) { return 0; } return 2; } + int start = time(NULL); + + PhyloSummary* taxaSum; + if (refTaxonomy != "") { + taxaSum = new PhyloSummary(refTaxonomy, groupfile); + }else { + taxaSum = new PhyloSummary(groupfile); + } + + if (m->control_pressed) { delete taxaSum; return 0; } + + int numSeqs = 0; + if (namefile == "") { numSeqs = taxaSum->summarize(taxfile); } + else { + map > nameMap; + map >::iterator itNames; + m->readNames(namefile, nameMap); + + if (m->control_pressed) { delete taxaSum; return 0; } + + ifstream in; + m->openInputFile(taxfile, in); + + //read in users taxonomy file and add sequences to tree + string name, taxon; + + while(!in.eof()){ + in >> name >> taxon; m->gobble(in); + + itNames = nameMap.find(name); + + if (itNames == nameMap.end()) { + m->mothurOut("[ERROR]: " + name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1); + }else{ + for (int i = 0; i < itNames->second.size(); i++) { + numSeqs++; + taxaSum->addSeqToTree(itNames->second[i], taxon); //add it as many times as there are identical seqs + } + itNames->second.clear(); + nameMap.erase(itNames->first); + } + } + in.close(); + } + + if (m->control_pressed) { delete taxaSum; return 0; } + + //print summary file + ofstream outTaxTree; + string summaryFile = outputDir + m->getRootName(m->getSimpleName(taxfile)) + "tax.summary"; + m->openOutputFile(summaryFile, outTaxTree); + taxaSum->print(outTaxTree); + outTaxTree.close(); + + delete taxaSum; + + if (m->control_pressed) { m->mothurRemove(summaryFile); return 0; } + + m->mothurOutEndLine(); + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); + m->mothurOutEndLine(); + m->mothurOut("Output File Name: "); m->mothurOutEndLine(); + m->mothurOut(summaryFile); m->mothurOutEndLine(); outputNames.push_back(summaryFile); outputTypes["summary"].push_back(summaryFile); + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SummaryTaxCommand", "execute"); + exit(1); + } +} +/**************************************************************************************/ + + diff --git a/summarytaxcommand.h b/summarytaxcommand.h new file mode 100644 index 0000000..acef7ee --- /dev/null +++ b/summarytaxcommand.h @@ -0,0 +1,40 @@ +#ifndef SUMMARYTAXCOMMAND_H +#define SUMMARYTAXCOMMAND_H + +/* + * summarytaxcommand.h + * Mothur + * + * Created by westcott on 9/23/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "command.hpp" + +/**************************************************************************************************/ + +class SummaryTaxCommand : public Command { + public: + SummaryTaxCommand(string); + SummaryTaxCommand(); + ~SummaryTaxCommand(){} + + vector setParameters(); + string getCommandName() { return "summary.tax"; } + string getCommandCategory() { return "Phylotype Analysis"; } + string getHelpString(); + string getCitation() { return "http://www.mothur.org/wiki/Summary.tax"; } + string getDescription() { return "summarize the taxonomies of a set of sequences"; } + + int execute(); + void help() { m->mothurOut(getHelpString()); } + + private: + bool abort; + string taxfile, outputDir, namefile, groupfile, refTaxonomy; + vector outputNames; + map nameMap; +}; + +#endif