From: westcott Date: Mon, 29 Nov 2010 14:21:39 +0000 (+0000) Subject: added consensus.seqs command X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=3fd6dd6e4f19a458ac2966ee5458787e998a1bde;p=mothur.git added consensus.seqs command --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 7d9b315..86fdbc3 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -37,6 +37,8 @@ A71D924511AEB42400D00CBC /* splitabundcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = splitabundcommand.h; sourceTree = ""; }; A71D924611AEB42400D00CBC /* splitmatrix.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = splitmatrix.cpp; sourceTree = ""; }; A71D924711AEB42400D00CBC /* splitmatrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = splitmatrix.h; sourceTree = ""; }; + A71FF438129BDC35004397E6 /* consensusseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = consensusseqscommand.h; sourceTree = ""; }; + A71FF439129BDC35004397E6 /* consensusseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = consensusseqscommand.cpp; sourceTree = ""; }; A724C2B61254961E006BB1C7 /* parsefastaqcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsefastaqcommand.h; sourceTree = ""; }; A724C2B71254961E006BB1C7 /* parsefastaqcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parsefastaqcommand.cpp; sourceTree = ""; }; A72B3A62118B37FD004B9F8D /* phylodiversitycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylodiversitycommand.h; sourceTree = ""; }; @@ -792,6 +794,8 @@ A7DA2029113FECD400BF472F /* collectsharedcommand.cpp */, A7DA2032113FECD400BF472F /* consensuscommand.h */, A7DA2031113FECD400BF472F /* consensuscommand.cpp */, + A71FF438129BDC35004397E6 /* consensusseqscommand.h */, + A71FF439129BDC35004397E6 /* consensusseqscommand.cpp */, A7118EE511CFCAC000CFDE03 /* degapseqscommand.h */, A7118EE611CFCAC000CFDE03 /* degapseqscommand.cpp */, A7DA203B113FECD400BF472F /* deconvolutecommand.h */, diff --git a/catchallcommand.cpp b/catchallcommand.cpp index 7f740bd..8f28413 100644 --- a/catchallcommand.cpp +++ b/catchallcommand.cpp @@ -26,8 +26,8 @@ vector CatchAllCommand::getValidParameters(){ CatchAllCommand::CatchAllCommand(){ try { //initialize outputTypes - - //need to determine outputFIles and types + vector tempOutNames; + outputTypes["csv"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "CatchAllCommand", "CatchAllCommand"); @@ -83,10 +83,9 @@ CatchAllCommand::CatchAllCommand(string option) { if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } } - //initialize outputTypes - //need to determine outputFIles and types - + vector tempOutNames; + outputTypes["csv"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -199,10 +198,10 @@ int CatchAllCommand::execute() { filename = m->getRootName(filename); filename = filename.substr(0, filename.length()-1); //rip off extra . - outputNames.push_back(filename + "_Analysis.csv"); - outputNames.push_back(filename + "_BestModelsAnalysis.csv"); - outputNames.push_back(filename + "_BestModelsFits.csv"); - outputNames.push_back(filename + "_BubblePlot.csv"); + outputNames.push_back(filename + "_Analysis.csv"); outputTypes["csv"].push_back(filename + "_Analysis.csv"); + outputNames.push_back(filename + "_BestModelsAnalysis.csv"); outputTypes["csv"].push_back(filename + "_BestModelsAnalysis.csv"); + outputNames.push_back(filename + "_BestModelsFits.csv"); outputTypes["csv"].push_back(filename + "_BestModelsFits.csv"); + outputNames.push_back(filename + "_BubblePlot.csv"); outputTypes["csv"].push_back(filename + "_BubblePlot.csv"); if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {remove(outputNames[i].c_str()); } delete read; delete input; globaldata->ginput = NULL; delete sabund; return 0; } @@ -234,11 +233,11 @@ int CatchAllCommand::execute() { filename = m->getRootName(filename); filename = filename.substr(0, filename.length()-1); //rip off extra . - outputNames.push_back(filename + "_Analysis.csv"); - outputNames.push_back(filename + "_BestModelsAnalysis.csv"); - outputNames.push_back(filename + "_BestModelsFits.csv"); - outputNames.push_back(filename + "_BubblePlot.csv"); - + outputNames.push_back(filename + "_Analysis.csv"); outputTypes["csv"].push_back(filename + "_Analysis.csv"); + outputNames.push_back(filename + "_BestModelsAnalysis.csv"); outputTypes["csv"].push_back(filename + "_BestModelsAnalysis.csv"); + outputNames.push_back(filename + "_BestModelsFits.csv"); outputTypes["csv"].push_back(filename + "_BestModelsFits.csv"); + outputNames.push_back(filename + "_BubblePlot.csv"); outputTypes["csv"].push_back(filename + "_BubblePlot.csv"); + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {remove(outputNames[i].c_str()); } delete read; delete input; globaldata->ginput = NULL; delete sabund; return 0; } processedLabels.insert(sabund->getLabel()); @@ -289,11 +288,10 @@ int CatchAllCommand::execute() { filename = m->getRootName(filename); filename = filename.substr(0, filename.length()-1); //rip off extra . - outputNames.push_back(filename + "_Analysis.csv"); - outputNames.push_back(filename + "_BestModelsAnalysis.csv"); - outputNames.push_back(filename + "_BestModelsFits.csv"); - outputNames.push_back(filename + "_BubblePlot.csv"); - + outputNames.push_back(filename + "_Analysis.csv"); outputTypes["csv"].push_back(filename + "_Analysis.csv"); + outputNames.push_back(filename + "_BestModelsAnalysis.csv"); outputTypes["csv"].push_back(filename + "_BestModelsAnalysis.csv"); + outputNames.push_back(filename + "_BestModelsFits.csv"); outputTypes["csv"].push_back(filename + "_BestModelsFits.csv"); + outputNames.push_back(filename + "_BubblePlot.csv"); outputTypes["csv"].push_back(filename + "_BubblePlot.csv"); delete sabund; } diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 3876c6c..2d38387 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -559,16 +559,6 @@ int ClassifySeqsCommand::execute(){ numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); - rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str()); - rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str()); - - for(int i=1;icontrol_pressed) { return 0; } Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA); - + if (candidateSeq->getName() != "") { taxonomy = classify->getTaxonomy(candidateSeq); @@ -863,6 +863,7 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT //report progress if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + } //report progress if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index 85a183a..85c70f0 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -37,6 +37,7 @@ ClusterSplitCommand::ClusterSplitCommand(){ outputTypes["list"] = tempOutNames; outputTypes["rabund"] = tempOutNames; outputTypes["sabund"] = tempOutNames; + outputTypes["column"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand"); @@ -100,6 +101,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { outputTypes["list"] = tempOutNames; outputTypes["rabund"] = tempOutNames; outputTypes["sabund"] = tempOutNames; + outputTypes["column"] = tempOutNames; globaldata->newRead(); @@ -360,6 +362,10 @@ int ClusterSplitCommand::execute(){ vector< map > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size delete split; + //output a merged distance file + if (splitmethod == "fasta") { createMergedDistanceFile(distName); } + + if (m->control_pressed) { return 0; } m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine(); @@ -1077,5 +1083,45 @@ vector ClusterSplitCommand::cluster(vector< map > distNa } +//********************************************************************************************************************** +int ClusterSplitCommand::createMergedDistanceFile(vector< map > distNames) { + try{ + +#ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid != 0) { +#endif + + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist"; + remove(outputFileName.c_str()); + + + for (int i = 0; i < distNames.size(); i++) { + if (m->control_pressed) { return 0; } + + string thisDistFile = distNames[i].begin()->first; + + m->appendFiles(thisDistFile, outputFileName); + } + + outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName); + +#ifdef USE_MPI + } +#endif + + + + + } + catch(exception& e) { + m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile"); + exit(1); + } +} //********************************************************************************************************************** diff --git a/clustersplitcommand.h b/clustersplitcommand.h index 07ddaf9..9bce099 100644 --- a/clustersplitcommand.h +++ b/clustersplitcommand.h @@ -50,6 +50,7 @@ private: vector cluster(vector< map >, set&); int mergeLists(vector, map, ListVector*); map completeListFile(vector, string, set&, ListVector*&); + int createMergedDistanceFile(vector< map >); }; #endif diff --git a/commandfactory.cpp b/commandfactory.cpp index 25d6cfe..b7b4992 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -102,6 +102,7 @@ #include "getotuscommand.h" #include "removeotuscommand.h" #include "indicatorcommand.h" +#include "consensusseqscommand.h" /*******************************************************/ @@ -208,6 +209,7 @@ CommandFactory::CommandFactory(){ commands["get.otus"] = "get.otus"; commands["remove.otus"] = "remove.otus"; commands["indicator"] = "indicator"; + commands["consensus.seqs"] = "consensus.seqs"; commands["pairwise.seqs"] = "MPIEnabled"; commands["pipeline.pds"] = "MPIEnabled"; commands["classify.seqs"] = "MPIEnabled"; @@ -361,6 +363,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "cluster.classic") { command = new ClusterDoturCommand(optionString); } else if(commandName == "sub.sample") { command = new SubSampleCommand(optionString); } else if(commandName == "indicator") { command = new IndicatorCommand(optionString); } + else if(commandName == "consensus.seqs") { command = new ConsensusSeqsCommand(optionString); } else { command = new NoCommand(optionString); } return command; @@ -481,6 +484,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "cluster.classic") { pipecommand = new ClusterDoturCommand(optionString); } else if(commandName == "sub.sample") { pipecommand = new SubSampleCommand(optionString); } else if(commandName == "indicator") { pipecommand = new IndicatorCommand(optionString); } + else if(commandName == "consensus.seqs") { pipecommand = new ConsensusSeqsCommand(optionString); } else { pipecommand = new NoCommand(optionString); } return pipecommand; @@ -589,6 +593,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "cluster.classic") { shellcommand = new ClusterDoturCommand(); } else if(commandName == "sub.sample") { shellcommand = new SubSampleCommand(); } else if(commandName == "indicator") { shellcommand = new IndicatorCommand(); } + else if(commandName == "consensus.seqs") { shellcommand = new ConsensusSeqsCommand(); } else { shellcommand = new NoCommand(); } return shellcommand; diff --git a/consensusseqscommand.cpp b/consensusseqscommand.cpp new file mode 100644 index 0000000..84519b8 --- /dev/null +++ b/consensusseqscommand.cpp @@ -0,0 +1,626 @@ +/* + * consensusseqscommand.cpp + * Mothur + * + * Created by westcott on 11/23/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "consensusseqscommand.h" +#include "sequence.hpp" +#include "inputdata.h" + +//********************************************************************************************************************** +vector ConsensusSeqsCommand::getValidParameters(){ + try { + string Array[] = {"fasta", "list", "name", "label","outputdir","inputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "ConsensusSeqsCommand", "getValidParameters"); + exit(1); + } +} +//********************************************************************************************************************** +ConsensusSeqsCommand::ConsensusSeqsCommand(){ + try { + abort = true; + //initialize outputTypes + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["summary"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "ConsensusSeqsCommand", "ConsensusSeqsCommand"); + exit(1); + } +} +//********************************************************************************************************************** +vector ConsensusSeqsCommand::getRequiredParameters(){ + try { + string Array[] = {"fasta", "list"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "ConsensusSeqsCommand", "getRequiredParameters"); + exit(1); + } +} +//********************************************************************************************************************** +vector ConsensusSeqsCommand::getRequiredFiles(){ + try { + vector myArray; + return myArray; + } + catch(exception& e) { + m->errorOut(e, "DegapSeqsCommand", "getRequiredFiles"); + exit(1); + } +} +//*************************************************************************************************************** +ConsensusSeqsCommand::ConsensusSeqsCommand(string option) { + try { + abort = false; + allLines = 1; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"fasta","list","name","label", "outputdir","inputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //initialize outputTypes + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["summary"] = tempOutNames; + + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + + it = parameters.find("list"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["list"] = inputDir + it->second; } + } + } + + + //check for parameters + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not open") { abort = true; } + else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the consensus.seqs command."); m->mothurOutEndLine(); abort = true; } + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { abort = true; } + else if (namefile == "not found") { namefile = ""; } + + listfile = validParameter.validFile(parameters, "list", true); + if (listfile == "not open") { abort = true; } + else if (listfile == "not found") { listfile = ""; m->mothurOut("list is a required parameter for the consensus.seqs command."); m->mothurOutEndLine(); abort = true; } + + label = validParameter.validFile(parameters, "label", false); + if (label == "not found") { label = ""; } + else { + if(label != "all") { m->splitAtDash(label, labels); allLines = 0; } + else { allLines = 1; } + } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); } + + } + } + catch(exception& e) { + m->errorOut(e, "ConsensusSeqsCommand", "ConsensusSeqsCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void ConsensusSeqsCommand::help(){ + try { + m->mothurOut("The consensus.seqs command reads a fastafile and listfile and creates a consensus sequence for each otu. Sequences must be aligned.\n"); + m->mothurOut("The consensus.seqs command parameters are fasta, list, name and label.\n"); + m->mothurOut("The fasta parameter allows you to enter the fasta file containing your sequences, and is required. \n"); + m->mothurOut("The list parameter allows you to enter a your list file. \n"); + m->mothurOut("The name parameter allows you to enter a names file associated with the fasta file. \n"); + m->mothurOut("The label parameter allows you to select what distance levels you would like output files for, and are separated by dashes.\n"); + m->mothurOut("The consensus.seqs command should be in the following format: \n"); + m->mothurOut("consensus.seqs(fasta=yourFastaFile, list=yourListFile) \n"); + m->mothurOut("Example: consensus.seqs(fasta=abrecovery.align, list=abrecovery.fn.list) \n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n"); + } + catch(exception& e) { + m->errorOut(e, "ConsensusSeqsCommand", "help"); + exit(1); + } +} + +//*************************************************************************************************************** + +ConsensusSeqsCommand::~ConsensusSeqsCommand(){ /* do nothing */ } + +//*************************************************************************************************************** + +int ConsensusSeqsCommand::execute(){ + try{ + + if (abort == true) { return 0; } + + readFasta(); + + if (m->control_pressed) { return 0; } + + if (namefile != "") { readNames(); } + + if (m->control_pressed) { return 0; } + + InputData* input = new InputData(listfile, "list"); + ListVector* list = input->getListVector(); + + string lastLabel = list->getLabel(); + set processedLabels; + set userLabels = labels; + + //as long as you are not at the end of the file or done wih the lines you want + while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete list; delete input; return 0; } + + if(allLines == 1 || labels.count(list->getLabel()) == 1){ + + m->mothurOut(list->getLabel()); m->mothurOutEndLine(); + + processList(list); + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + } + + if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = list->getLabel(); + + delete list; + + list = input->getListVector(lastLabel); + m->mothurOut(list->getLabel()); m->mothurOutEndLine(); + + processList(list); + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + + //restore real lastlabel to save below + list->setLabel(saveLabel); + } + + lastLabel = list->getLabel(); + + delete list; list = NULL; + + //get next line to process + list = input->getListVector(); + } + + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } if (list != NULL) { delete list; } delete input; return 0; } + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + m->mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine(); + needToRun = true; + }else { + m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + if (list != NULL) { delete list; } + + list = input->getListVector(lastLabel); + + m->mothurOut(list->getLabel()); m->mothurOutEndLine(); + + processList(list); + + delete list; list = NULL; + } + + if (list != NULL) { delete list; } + delete input; + + m->mothurOutEndLine(); + m->mothurOut("Output File Name: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ConsensusSeqsCommand", "execute"); + exit(1); + } +} +//*************************************************************************************************************** + +int ConsensusSeqsCommand::processList(ListVector*& list){ + try{ + + ofstream outSummary; + string outputSummaryFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + list->getLabel() + ".cons.summary"; + m->openOutputFile(outputSummaryFile, outSummary); + outSummary.setf(ios::fixed, ios::floatfield); outSummary.setf(ios::showpoint); + outputNames.push_back(outputSummaryFile); outputTypes["summary"].push_back(outputSummaryFile); + + ofstream outName; + string outputNameFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + list->getLabel() + ".cons.names"; + m->openOutputFile(outputNameFile, outName); + outputNames.push_back(outputNameFile); outputTypes["name"].push_back(outputNameFile); + + ofstream outFasta; + string outputFastaFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + list->getLabel() + ".cons.fasta"; + m->openOutputFile(outputFastaFile, outFasta); + outputNames.push_back(outputFastaFile); outputTypes["fasta"].push_back(outputFastaFile); + + for (int i = 0; i < list->getNumBins(); i++) { + + if (m->control_pressed) { outSummary.close(); outName.close(); outFasta.close(); return 0; } + + string bin = list->get(i); + + string newName = ""; + string consSeq = getConsSeq(bin, outSummary, newName, i); + + outFasta << ">seq" << (i+1) << endl << consSeq << endl; + outName << "seq" << (i+1) << '\t' << "seq" << (i+1) << "," << newName << endl; + } + + outSummary.close(); outName.close(); outFasta.close(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ConsensusSeqsCommand", "processList"); + exit(1); + } +} + +//*************************************************************************************************************** +//made this smart enough to owrk with unique or non unique list file +string ConsensusSeqsCommand::getConsSeq(string bin, ofstream& outSummary, string& name, int binNumber){ + try{ + + string consSeq = ""; + bool error = false; + + //the whole bin is the second column if no names file, otherwise build it + name = bin; + if (namefile != "") { name = ""; } + + vector binNames; + m->splitAtComma(bin, binNames); + + //get sequence strings for each name in the bin + vector seqs; + + set addedAlready; + int seqLength = 0; + for (int i = 0; i < binNames.size(); i++) { + + map::iterator it; + + it = nameMap.find(binNames[i]); + if (it == nameMap.end()) { + if (namefile == "") { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta file, please correct."); m->mothurOutEndLine(); error = true; } + else { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta or name file, please correct."); m->mothurOutEndLine(); error = true; } + break; + }else { + + //add sequence string to seqs vector to process below + string seq = fastaMap[it->second]; + seqs.push_back(seq); + + if (seqLength == 0) { seqLength = seq.length(); } + else if (seqLength != seq.length()) { m->mothurOut("[ERROR]: sequence are not the same length, please correct."); m->mothurOutEndLine(); error = true; break; } + + if (namefile != "") { + //did we add this line from name file already? + if (addedAlready.count(it->second) == 0) { + name += "," + nameFileMap[it->second]; + addedAlready.insert(it->second); + } + } + + } + } + + if (error) { m->control_pressed = true; return consSeq; } + + if (namefile != "") { name = name.substr(1); } + + vector< vector > percentages; percentages.resize(5); + for (int j = 0; j < percentages.size(); j++) { percentages[j].resize(seqLength, 0.0); } + + //get counts + for (int j = 0; j < seqLength; j++) { + + if (m->control_pressed) { return consSeq; } + + vector counts; counts.resize(5, 0); //A,T,G,C,Gap + int numDots = 0; + + for (int i = 0; i < seqs.size(); i++) { + + if (seqs[i][j] == '.') { numDots++; } + + char base = toupper(seqs[i][j]); + if (base == 'A') { counts[0]++; } + else if (base == 'T') { counts[1]++; } + else if (base == 'G') { counts[2]++; } + else if (base == 'C') { counts[3]++; } + else { counts[4]++; } + } + + char conBase = '.'; + if (numDots != seqs.size()) { conBase = getBase(counts); } + + consSeq += conBase; + + percentages[0][j] = counts[0] / (float) seqs.size(); + percentages[1][j] = counts[1] / (float) seqs.size(); + percentages[2][j] = counts[2] / (float) seqs.size(); + percentages[3][j] = counts[3] / (float) seqs.size(); + percentages[4][j] = counts[4] / (float) seqs.size(); + + } + + outSummary << ">seq" << (binNumber + 1) << endl; + outSummary << "A" << '\t'; + for (int j = 0; j < seqLength; j++) { outSummary << percentages[0][j] << '\t'; } + outSummary << endl; + outSummary << "T" << '\t'; + for (int j = 0; j < seqLength; j++) { outSummary << percentages[1][j] << '\t'; } + outSummary << endl; + outSummary << "G" << '\t'; + for (int j = 0; j < seqLength; j++) { outSummary << percentages[2][j] << '\t'; } + outSummary << endl; + outSummary << "C" << '\t'; + for (int j = 0; j < seqLength; j++) { outSummary << percentages[3][j] << '\t'; } + outSummary << endl; + outSummary << "Gap" << '\t'; + for (int j = 0; j < seqLength; j++) { outSummary << percentages[4][j] << '\t'; } + outSummary << endl; + + + return consSeq; + + } + catch(exception& e) { + m->errorOut(e, "ConsensusSeqsCommand", "getConsSeq"); + exit(1); + } +} +//*************************************************************************************************************** + +char ConsensusSeqsCommand::getBase(vector counts){ //A,T,G,C,Gap + try{ + /* A = adenine + * C = cytosine + * G = guanine + * T = thymine + * R = G A (purine) + * Y = T C (pyrimidine) + * K = G T (keto) + * M = A C (amino) + * S = G C (strong bonds) + * W = A T (weak bonds) + * B = G T C (all but A) + * D = G A T (all but C) + * H = A C T (all but G) + * V = G C A (all but T) + * N = A G C T (any) */ + + char conBase = 'N'; + + //any + if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'n'; } + //any no gap + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'N'; } + //all but T + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'v'; } + //all but T no gap + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'V'; } + //all but G + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'h'; } + //all but G no gap + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'H'; } + //all but C + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 'd'; } + //all but C no gap + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'D'; } + //all but A + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'b'; } + //all but A no gap + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'B'; } + //W = A T (weak bonds) + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 'w'; } + //W = A T (weak bonds) no gap + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'W'; } + //S = G C (strong bonds) + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 's'; } + //S = G C (strong bonds) no gap + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'S'; } + //M = A C (amino) + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'm'; } + //M = A C (amino) no gap + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'M'; } + //K = G T (keto) + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 'k'; } + //K = G T (keto) no gap + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'K'; } + //Y = T C (pyrimidine) + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'y'; } + //Y = T C (pyrimidine) no gap + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'Y'; } + //R = G A (purine) + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 'r'; } + //R = G A (purine) no gap + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'R'; } + //only A + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 'a'; } + //only A no gap + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'A'; } + //only T + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 't'; } + //only T no gap + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'T'; } + //only G + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 'g'; } + //only G no gap + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'G'; } + //only C + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'c'; } + //only C no gap + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'C'; } + //only gap + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = '-'; } + else{ m->mothurOut("[ERROR]: cannot find consensus base."); m->mothurOutEndLine(); } + + return conBase; + + } + catch(exception& e) { + m->errorOut(e, "ConsensusSeqsCommand", "getBase"); + exit(1); + } +} + +//*************************************************************************************************************** + +int ConsensusSeqsCommand::readFasta(){ + try{ + + ifstream in; + m->openInputFile(fastafile, in); + + while (!in.eof()) { + + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + string name = seq.getName(); + + if (name != "") { + fastaMap[name] = seq.getAligned(); + nameMap[name] = name; //set nameMap incase no names file + nameFileMap[name] = name; + } + } + + in.close(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ConsensusSeqsCommand", "readFasta"); + exit(1); + } +} +//*************************************************************************************************************** + +int ConsensusSeqsCommand::readNames(){ + try{ + + ifstream in; + m->openInputFile(namefile, in); + + string thisname, repnames; + map::iterator it; + + bool error = false; + + while(!in.eof()){ + + if (m->control_pressed) { break; } + + in >> thisname; m->gobble(in); //read from first column + in >> repnames; //read from second column + + it = nameMap.find(thisname); + if (it != nameMap.end()) { //then this sequence was in the fastafile + + vector splitRepNames; + m->splitAtComma(repnames, splitRepNames); + + nameFileMap[thisname] = repnames; //for later when outputting the new namesFile if the list file is unique + for (int i = 0; i < splitRepNames.size(); i++) { nameMap[splitRepNames[i]] = thisname; } + + }else{ m->mothurOut("[ERROR]: " + thisname + " is not in the fasta file, please correct."); m->mothurOutEndLine(); error = true; } + + m->gobble(in); + } + + in.close(); + + if (error) { m->control_pressed = true; } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ConsensusSeqsCommand", "readNames"); + exit(1); + } + } + +//*************************************************************************************************************** + + diff --git a/consensusseqscommand.h b/consensusseqscommand.h new file mode 100644 index 0000000..555b62c --- /dev/null +++ b/consensusseqscommand.h @@ -0,0 +1,51 @@ +#ifndef CONSENSUSSEQSCOMMAND_H +#define CONSENSUSSEQSCOMMAND_H + +/* + * consensusseqscommand.h + * Mothur + * + * Created by westcott on 11/23/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + + +#include "command.hpp" +#include "listvector.hpp" + +class ConsensusSeqsCommand : public Command { +public: + ConsensusSeqsCommand(string); + ConsensusSeqsCommand(); + ~ConsensusSeqsCommand(); + vector getRequiredParameters(); + vector getValidParameters(); + vector getRequiredFiles(); + map > getOutputFiles() { return outputTypes; } + int execute(); + void help(); + +private: + + bool abort, allLines; + string fastafile, listfile, namefile, label, outputDir; + set labels; + vector outputNames; + map > outputTypes; + map fastaMap; + map nameMap; + map nameFileMap; + + int readFasta(); + int readNames(); + int processList(ListVector*&); + string getConsSeq(string, ofstream&, string&, int); + char getBase(vector); +}; + +#endif + + + + diff --git a/makefile b/makefile index 1b8e046..36d0d77 100644 --- a/makefile +++ b/makefile @@ -89,6 +89,8 @@ OBJECTS+=$(patsubst %.c,%.o,$(wildcard *.c)) mothur : $(OBJECTS) $(CXX) $(LDFLAGS) $(TARGET_ARCH) -o $@ $(OBJECTS) $(LIBS) + + strip mothur install : mothur cp mothur ../Release/mothur diff --git a/mothur b/mothur index d83c006..6bdaec6 100755 Binary files a/mothur and b/mothur differ