From cea46f0830d8d1a6dc26e3410fc283d26c355831 Mon Sep 17 00:00:00 2001 From: westcott Date: Tue, 21 Sep 2010 10:53:52 +0000 Subject: [PATCH] added split.groups command --- Mothur.xcodeproj/project.pbxproj | 4 + commandfactory.cpp | 3 + mothurout.cpp | 17 +- splitgroupscommand.cpp | 280 +++++++++++++++++++++++++++++++ splitgroupscommand.h | 51 ++++++ 5 files changed, 353 insertions(+), 2 deletions(-) create mode 100644 splitgroupscommand.cpp create mode 100644 splitgroupscommand.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index c59b637..1ac6e0b 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -486,6 +486,8 @@ A7DF0AE2121EBB14004A03EA /* prng.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = prng.h; sourceTree = ""; }; A7E8338B115BBDAA00739EC4 /* parsesffcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parsesffcommand.cpp; sourceTree = ""; }; A7E8338C115BBDAA00739EC4 /* parsesffcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsesffcommand.h; sourceTree = ""; }; + A7F139481247C3CB0033324C /* splitgroupscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = splitgroupscommand.h; sourceTree = ""; }; + A7F139491247C3CB0033324C /* splitgroupscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = splitgroupscommand.cpp; sourceTree = ""; }; A7F6C8E1124229F900299875 /* fisher2.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fisher2.c; sourceTree = ""; }; A7F6C8E2124229F900299875 /* fisher2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fisher2.h; sourceTree = ""; }; A7F6C8E3124229F900299875 /* metastats2.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = metastats2.c; sourceTree = ""; }; @@ -856,6 +858,8 @@ A7DA210F113FECD400BF472F /* sharedcommand.cpp */, A71D924511AEB42400D00CBC /* splitabundcommand.h */, A71D924411AEB42400D00CBC /* splitabundcommand.cpp */, + A7F139481247C3CB0033324C /* splitgroupscommand.h */, + A7F139491247C3CB0033324C /* splitgroupscommand.cpp */, A7DA2155113FECD400BF472F /* summarycommand.h */, A7DA2154113FECD400BF472F /* summarycommand.cpp */, A7DA2159113FECD400BF472F /* summarysharedcommand.h */, diff --git a/commandfactory.cpp b/commandfactory.cpp index bbdd616..45f8906 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -87,6 +87,7 @@ #include "seqerrorcommand.h" #include "normalizesharedcommand.h" #include "metastatscommand.h" +#include "splitgroupscommand.h" /*******************************************************/ @@ -178,6 +179,7 @@ CommandFactory::CommandFactory(){ commands["sffinfo"] = "sffinfo"; commands["normalize.shared"] = "normalize.shared"; commands["metastats"] = "metastats"; + commands["split.groups"] = "split.groups"; commands["classify.seqs"] = "MPIEnabled"; commands["dist.seqs"] = "MPIEnabled"; commands["filter.seqs"] = "MPIEnabled"; @@ -312,6 +314,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "sffinfo") { command = new SffInfoCommand(optionString); } else if(commandName == "normalize.shared") { command = new NormalizeSharedCommand(optionString); } else if(commandName == "metastats") { command = new MetaStatsCommand(optionString); } + else if(commandName == "split.groups") { command = new SplitGroupCommand(optionString); } else { command = new NoCommand(optionString); } return command; diff --git a/mothurout.cpp b/mothurout.cpp index 1231700..aab9945 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -1182,7 +1182,20 @@ void MothurOut::getNumSeqs(ifstream& file, int& numSeqs){ //This function parses the estimator options and puts them in a vector void MothurOut::splitAtChar(string& estim, vector& container, char symbol) { try { - string individual; + string individual = ""; + int estimLength = estim.size(); + for(int i=0;i& container, char symbo } } //get last one - container.push_back(estim); + container.push_back(estim); */ } catch(exception& e) { errorOut(e, "MothurOut", "splitAtChar"); diff --git a/splitgroupscommand.cpp b/splitgroupscommand.cpp new file mode 100644 index 0000000..6f8788a --- /dev/null +++ b/splitgroupscommand.cpp @@ -0,0 +1,280 @@ +/* + * splitgroupscommand.cpp + * Mothur + * + * Created by westcott on 9/20/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "splitgroupscommand.h" +#include "sharedutilities.h" + +//********************************************************************************************************************** +SplitGroupCommand::SplitGroupCommand(string option) { + try { + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"name","group","groups","fasta","outputdir","inputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } + + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + + } + + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { abort = true; } + else if (namefile == "not found") { namefile = ""; } + + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not open") { abort = true; } + else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the split.group command. "); m->mothurOutEndLine(); abort = true; } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not open") { groupfile = ""; abort = true; } + else if (groupfile == "not found") { groupfile = ""; m->mothurOut("group is a required parameter for the split.group command. "); m->mothurOutEndLine(); abort = true; } + + groups = validParameter.validFile(parameters, "groups", false); + if (groups == "not found") { groups = ""; } + else { m->splitAtDash(groups, Groups); } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(groupfile); } + } + + } + catch(exception& e) { + m->errorOut(e, "SplitGroupCommand", "SplitAbundCommand"); + exit(1); + } +} +//********************************************************************************************************************** +void SplitGroupCommand::help(){ + try { + m->mothurOut("The split.group command reads a group file, and parses your fasta and names files by groups. \n"); + m->mothurOut("The split.group command parameters are fasta, name, group and groups.\n"); + m->mothurOut("The fasta and group parameters are required.\n"); + m->mothurOut("The groups parameter allows you to select groups to create files for. \n"); + m->mothurOut("For example if you set groups=A-B-C, you will get a .A.fasta, .A.names, .B.fasta, .B.names, .C.fasta, .C.names files. \n"); + m->mothurOut("If you want .fasta and .names files for all groups, set groups=all. \n"); + m->mothurOut("The split.group command should be used in the following format: split.group(fasta=yourFasta, group=yourGroupFile).\n"); + m->mothurOut("Example: split.group(fasta=abrecovery.fasta, group=abrecovery.groups).\n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n"); + + } + catch(exception& e) { + m->errorOut(e, "SplitGroupCommand", "help"); + exit(1); + } +} +//********************************************************************************************************************** +SplitGroupCommand::~SplitGroupCommand(){ } +//********************************************************************************************************************** +int SplitGroupCommand::execute(){ + try { + + if (abort == true) { return 0; } + + groupMap = new GroupMap(groupfile); + groupMap->readMap(); + + SharedUtil util; util.setGroups(Groups, groupMap->namesOfGroups); + + if (namefile != "") { readNames(); } + splitFasta(); + + delete groupMap; + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SplitGroupCommand", "execute"); + exit(1); + } +} +/**********************************************************************************************************************/ +int SplitGroupCommand::readNames() { + try { + //open input file + ifstream in; + m->openInputFile(namefile, in); + + while (!in.eof()) { + if (m->control_pressed) { break; } + + string firstCol, secondCol; + in >> firstCol >> secondCol; m->gobble(in); + + vector names; + m->splitAtComma(secondCol, names); + + nameMap[firstCol] = names; + } + in.close(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SplitGroupCommand", "readNames"); + exit(1); + } +} + +/**********************************************************************************************************************/ +int SplitGroupCommand::splitFasta() { + try { + + string filerootFasta = outputDir + m->getRootName(m->getSimpleName(fastafile)); + string filerootName = outputDir + m->getRootName(m->getSimpleName(namefile)); + ofstream* temp; + ofstream* temp2; + map filehandles; + map::iterator it3; + + for (int i=0; iopenOutputFile(filerootFasta + Groups[i] + ".fasta", *(filehandles[Groups[i]+"fasta"])); + outputNames.push_back(filerootFasta + Groups[i] + ".fasta"); + + if (namefile != "") { + temp2 = new ofstream; + filehandles[Groups[i]+"name"] = temp2; + m->openOutputFile(filerootName + Groups[i] + ".names", *(filehandles[Groups[i]+"name"])); + outputNames.push_back(filerootName + Groups[i] + ".names"); + } + } + + //open input file + ifstream in; + m->openInputFile(fastafile, in); + + while (!in.eof()) { + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + + if (seq.getName() != "") { + + itNames = nameMap.find(seq.getName()); + + if (itNames == nameMap.end()) { + if (namefile != "") { m->mothurOut(seq.getName() + " is not in your namesfile, ignoring."); m->mothurOutEndLine(); } + else { //no names file given + string group = groupMap->getGroup(seq.getName()); + + if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want + seq.printSequence(*(filehandles[group+"fasta"])); + }else if(group == "not found") { + m->mothurOut(seq.getName() + " is not in your groupfile. Ignoring."); m->mothurOutEndLine(); + } + } + }else{ + vector names = itNames->second; + map group2Names; + map::iterator it; + + for (int i = 0; i < names.size(); i++) { //build strings for new group names file, will select rep below + string group = groupMap->getGroup(names[i]); + + if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want + it = group2Names.find(group); + if (it == group2Names.end()) { + group2Names[group] = names[i] + ","; + }else{ + group2Names[group] += names[i] + ","; + } + }else if(group == "not found") { + m->mothurOut(names[i] + " is not in your groupfile. Ignoring."); m->mothurOutEndLine(); + } + } + + for (map::iterator itGroups = group2Names.begin(); itGroups != group2Names.end(); itGroups++) { + //edit names string, by grabbing the first guy as the rep and removing the last comma + string newNames = itGroups->second; + newNames = newNames.substr(0, newNames.length()-1); + string repName = ""; + + int pos = newNames.find_first_of(','); + if (pos == newNames.npos) { //only one sequence so it represents itself + repName = newNames; + }else{ + repName = newNames.substr(0, pos); + } + + *(filehandles[itGroups->first+"name"]) << repName << '\t' << newNames << endl; + *(filehandles[itGroups->first+"fasta"]) << ">" << repName << endl << seq.getAligned() << endl; + } + } + } + } + + in.close(); + + for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { + (*(filehandles[it3->first])).close(); + delete it3->second; + } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SplitGroupCommand", "splitFasta"); + exit(1); + } +} +/**********************************************************************************************************************/ + diff --git a/splitgroupscommand.h b/splitgroupscommand.h new file mode 100644 index 0000000..f6e8760 --- /dev/null +++ b/splitgroupscommand.h @@ -0,0 +1,51 @@ +#ifndef SPLITGROUPSCOMMAND_H +#define SPLITGROUPSCOMMAND_H + +/* + * splitgroupscommand.h + * Mothur + * + * Created by westcott on 9/20/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + + +/* split.groups - given a group file, split sequences and names files in to separate files *.group1.fasta and .group1.names. */ + + +#include "command.hpp" +#include "groupmap.h" +#include "sequence.hpp" + +/***************************************************************************************/ + +class SplitGroupCommand : public Command { + +public: + SplitGroupCommand(string); + ~SplitGroupCommand(); + int execute(); + void help(); + + +private: + int readNames(); + int splitFasta(); + + vector outputNames; + map > nameMap; + map >::iterator itNames; + GroupMap* groupMap; + + string outputDir, namefile, groupfile, groups, fastafile; + vector Groups; + bool abort; +}; + +/***************************************************************************************/ + +#endif + + + -- 2.39.2