From d0939d3ab83988cc068f9ebe60596cf5decb65e5 Mon Sep 17 00:00:00 2001 From: westcott Date: Mon, 6 Apr 2009 18:21:45 +0000 Subject: [PATCH] added get.oturep command --- Mothur.xcodeproj/project.pbxproj | 6 + binsequencecommand.cpp | 7 +- commandfactory.cpp | 2 + errorchecking.cpp | 18 ++- getoturepcommand.cpp | 237 +++++++++++++++++++++++++++++++ getoturepcommand.h | 56 ++++++++ helpcommand.cpp | 9 ++ validcommands.cpp | 1 + validparameter.cpp | 3 + 9 files changed, 336 insertions(+), 3 deletions(-) create mode 100644 getoturepcommand.cpp create mode 100644 getoturepcommand.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 619737b..b63f108 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -7,6 +7,7 @@ objects = { /* Begin PBXBuildFile section */ + 370B88070F8A4EE4005AB382 /* getoturepcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 370B88060F8A4EE4005AB382 /* getoturepcommand.cpp */; }; 372E12700F26365B0095CF7E /* readotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E126F0F26365B0095CF7E /* readotucommand.cpp */; }; 372E12960F263D5A0095CF7E /* readdistcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12950F263D5A0095CF7E /* readdistcommand.cpp */; }; 372E12ED0F264D320095CF7E /* commandfactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12EC0F264D320095CF7E /* commandfactory.cpp */; }; @@ -134,6 +135,8 @@ /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ + 370B88050F8A4EE4005AB382 /* getoturepcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getoturepcommand.h; sourceTree = ""; }; + 370B88060F8A4EE4005AB382 /* getoturepcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getoturepcommand.cpp; sourceTree = ""; }; 372E126E0F26365B0095CF7E /* readotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readotucommand.h; sourceTree = ""; }; 372E126F0F26365B0095CF7E /* readotucommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readotucommand.cpp; sourceTree = ""; }; 372E12940F263D5A0095CF7E /* readdistcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readdistcommand.h; sourceTree = ""; }; @@ -575,6 +578,8 @@ A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */, A70B53A90F4CD7AD0064797E /* getlinecommand.h */, A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */, + 370B88050F8A4EE4005AB382 /* getoturepcommand.h */, + 370B88060F8A4EE4005AB382 /* getoturepcommand.cpp */, 375873F10F7D64800040F377 /* heatmapcommand.h */, 375873F00F7D64800040F377 /* heatmapcommand.cpp */, 37D927E40F21331F001D4494 /* helpcommand.h */, @@ -829,6 +834,7 @@ 37519AA10F810D0200FED5E8 /* venncommand.cpp in Sources */, 37519AB50F810FAE00FED5E8 /* venn.cpp in Sources */, 37C1D9730F86506E0059E3F0 /* binsequencecommand.cpp in Sources */, + 370B88070F8A4EE4005AB382 /* getoturepcommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/binsequencecommand.cpp b/binsequencecommand.cpp index 4db41bf..5db2932 100644 --- a/binsequencecommand.cpp +++ b/binsequencecommand.cpp @@ -153,12 +153,15 @@ void BinSeqCommand::readNamesFile() { } catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the BinSeqCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + cout << "Standard Error: " << e.what() << " has occurred in the BinSeqCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; exit(1); } catch(...) { - cout << "An unknown error has occurred in the BinSeqCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + cout << "An unknown error has occurred in the BinSeqCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; exit(1); } } //********************************************************************************************************************** + + + diff --git a/commandfactory.cpp b/commandfactory.cpp index 354b5a3..5155095 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -35,6 +35,7 @@ #include "mothur.h" #include "nocommands.h" #include "binsequencecommand.h" +#include "getoturepcommand.h" /***********************************************************/ @@ -81,6 +82,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "heatmap") { command = new HeatMapCommand(); } else if(commandName == "venn") { command = new VennCommand(); } else if(commandName == "bin.seqs") { command = new BinSeqCommand(); } + else if(commandName == "get.oturep") { command = new GetOTURepCommand(); } else { command = new NoCommand(); } return command; diff --git a/errorchecking.cpp b/errorchecking.cpp index da1d1bb..abd2a9f 100644 --- a/errorchecking.cpp +++ b/errorchecking.cpp @@ -233,6 +233,16 @@ bool ErrorCheck::checkInput(string input) { if ((globaldata->getListFile() == "")) { cout << "You must read a list file before you can use the bin.seqs command." << endl; return false; } validateBinFiles(); } + + if ((commandName == "get.oturep")) { + if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) { + cout << "Before you use the get.oturep command, you first need to read in a distance matrix." << endl; + errorFree = false; + } + if (listfile == "") { cout << "list is a required parameter for the get.oturep command." << endl; errorFree = false; } + validateBinFiles(); + } + return errorFree; } @@ -484,13 +494,19 @@ void ErrorCheck::validateBinFiles() { int ableToOpen; if (fastafile == "") { - cout << "fasta is a required parameter for bin.seqs." << endl; errorFree = false; + cout << "fasta is a required parameter for bin.seqs and get.oturep commands." << endl; errorFree = false; }else if (fastafile != "") { //is it a valid filename' ableToOpen = openInputFile(fastafile, filehandle); filehandle.close(); //unable to open if (ableToOpen == 1) { errorFree = false; } + }else if (listfile != "") { + //is it a valid filename' + ableToOpen = openInputFile(listfile, filehandle); + filehandle.close(); + //unable to open + if (ableToOpen == 1) { errorFree = false; } }else if (globaldata->getNameFile() != "") { //is it a valid filename' ifstream filehandle; diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp new file mode 100644 index 0000000..ea53f25 --- /dev/null +++ b/getoturepcommand.cpp @@ -0,0 +1,237 @@ +/* + * getoturepcommand.cpp + * Mothur + * + * Created by Sarah Westcott on 4/6/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + +#include "getoturepcommand.h" + +//********************************************************************************************************************** +GetOTURepCommand::GetOTURepCommand(){ + try{ + globaldata = GlobalData::getInstance(); + + if(globaldata->gSparseMatrix != NULL) { matrix = new SparseMatrix(*globaldata->gSparseMatrix); } + + //listOfNames bin 0 = first name read in distance matrix, listOfNames bin 1 = second name read in distance matrix + if(globaldata->gListVector != NULL) { + listOfNames = new ListVector(*globaldata->gListVector); + + //map names to rows in sparsematrix + for (int i = 0; i < listOfNames->size(); i++) { + nameToIndex[listOfNames->get(i)] = i; + } + }else { cout << "error" << endl; } + + + fastafile = globaldata->getFastaFile(); + namesfile = globaldata->getNameFile(); + openInputFile(fastafile, in); + + fasta = new FastaMap(); + + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} +//********************************************************************************************************************** + +GetOTURepCommand::~GetOTURepCommand(){ + delete matrix; + delete list; + delete input; + delete read; + delete fasta; +} + +//********************************************************************************************************************** + +int GetOTURepCommand::execute(){ + try { + int count = 1; + string nameRep, name, sequence; + + //read fastafile + fasta->readFastaFile(in); + + //set format to list so input can get listvector + globaldata->setFormat("list"); + + //if user gave a namesfile then use it + if (namesfile != "") { + readNamesFile(); + } + + //read list file + read = new ReadPhilFile(globaldata->getListFile()); + read->read(&*globaldata); + + input = globaldata->ginput; + list = globaldata->gListVector; + + while(list != NULL){ + + if(globaldata->allLines == 1 || globaldata->lines.count(count) == 1 || globaldata->labels.count(list->getLabel()) == 1){ + + //create output file + string outputFileName = getRootName(globaldata->getListFile()) + list->getLabel() + ".fastarep"; + openOutputFile(outputFileName, out); + + cout << list->getLabel() << '\t' << count << endl; + + //for each bin in the list vector + for (int i = 0; i < list->size(); i++) { + nameRep = FindRep(i); + + //print out name and sequence for that bin + sequence = fasta->getSequence(nameRep); + + if (sequence != "not found") { + nameRep = nameRep + "bin" + toString(i+1); + out << ">" << nameRep << endl; + out << sequence << endl; + }else { + cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl; + remove(outputFileName.c_str()); + return 0; + } + } + } + + list = input->getListVector(); + count++; + } + + + return 0; + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + +} + +//********************************************************************************************************************** +void GetOTURepCommand::readNamesFile() { + try { + vector dupNames; + openInputFile(namesfile, inNames); + + string name, names, sequence; + + while(inNames){ + inNames >> name; //read from first column A + inNames >> names; //read from second column A,B,C,D + + dupNames.clear(); + + //parse names into vector + splitAtComma(names, dupNames); + + //store names in fasta map + sequence = fasta->getSequence(name); + for (int i = 0; i < dupNames.size(); i++) { + fasta->push_back(dupNames[i], sequence); + } + + gobble(inNames); + } + inNames.close(); + + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} +//********************************************************************************************************************** +string GetOTURepCommand::FindRep(int bin) { + try{ + vector names; + map sums; + map::iterator it4; + map binMap; //subset of namesToIndex - just member of this bin + string binnames; + float min = 10000; + string minName; + + binnames = list->get(bin); + + //parse names into vector + splitAtComma(binnames, names); + + //if only 1 sequence in bin then that's the rep + if (names.size() == 1) { return names[0]; } + else { + //fill binMap + for (int i = 0; i < names.size(); i++) { + for (it3 = nameToIndex.begin(); it3 != nameToIndex.end(); it3++) { + if (it3->first == names[i]) { + binMap[it3->second] = it3->first; + + //initialize sums map + sums[it3->first] = 0.0; + break; + } + } + } + + //go through each cell in the sparsematrix + for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){ + //is this a distance between 2 members of this bin + it = binMap.find(currentCell->row); + it2 = binMap.find(currentCell->column); + + //sum the distance of the sequences in the bin to eachother + if ((it != binMap.end()) && (it2 != binMap.end())) { + //this is a cell that repesents the distance between to of this bins members + sums[it->second] += currentCell->dist; + sums[it2->second] += currentCell->dist; + } + } + + //smallest sum is the representative + for (it4 = sums.begin(); it4 != sums.end(); it4++) { + if (it4->second < min) { + min = it4->second; + minName = it4->first; + } + + } + + return minName; + } + + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} + + + + + diff --git a/getoturepcommand.h b/getoturepcommand.h new file mode 100644 index 0000000..d0343da --- /dev/null +++ b/getoturepcommand.h @@ -0,0 +1,56 @@ +#ifndef GETOTUREPCOMMAND_H +#define GETOTUREPCOMMAND_H +/* + * getoturepcommand.h + * Mothur + * + * Created by Sarah Westcott on 4/6/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + +#include "command.hpp" +#include "globaldata.hpp" +#include "sparsematrix.hpp" +#include "listvector.hpp" +#include "inputdata.h" +#include "readmatrix.hpp" +#include "fastamap.h" + + +class GlobalData; + +typedef list::iterator MatData; + +class GetOTURepCommand : public Command { + +public: + GetOTURepCommand(); + ~GetOTURepCommand(); + int execute(); + +private: + GlobalData* globaldata; + SparseMatrix* matrix; + ListVector* list; + ListVector* listOfNames; + ReadMatrix* read; + InputData* input; + FastaMap* fasta; + string filename, fastafile, namesfile; + ofstream out; + ifstream in, inNames; + + + map nameToIndex; //maps sequence name to index in sparsematrix + map::iterator it; + map::iterator it2; + map::iterator it3; + + void readNamesFile(); + string FindRep(int); // returns name of "representative" sequence of given bin. + +}; + +#endif + diff --git a/helpcommand.cpp b/helpcommand.cpp index 9f5b415..8e28cde 100644 --- a/helpcommand.cpp +++ b/helpcommand.cpp @@ -218,6 +218,15 @@ int HelpCommand::execute(){ cout << "The default value for line and label are all lines in your inputfile." << "\n"; cout << "The bin.seqs command outputs a .fasta file for each distance you specify appending the OTU number to each name." << "\n"; cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile)." << "\n" << "\n"; + }else if (globaldata->helpRequest == "get.oturep") { + cout << "The get.oturep command can only be executed after a successful read.dist command." << "\n"; + cout << "The get.oturep command parameters are list, fasta, name, line and label. The fasta and list parameters are required, and you may not use line and label at the same time." << "\n"; + cout << "The line and label allow you to select what distance levels you would like a output files created for, and are separated by dashes." << "\n"; + cout << "The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, line=yourLines, label=yourLabels)." << "\n"; + cout << "Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, line=1-3-5, name=amazon.names)." << "\n"; + cout << "The default value for line and label are all lines in your inputfile." << "\n"; + cout << "The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin." << "\n"; + cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile)." << "\n" << "\n"; }else if (globaldata->helpRequest == "quit") { cout << "The quit command will terminate Dotur and should be in the following format: " << "\n"; cout << "quit()" << "\n" << "\n"; diff --git a/validcommands.cpp b/validcommands.cpp index 2adbc0e..fed10ca 100644 --- a/validcommands.cpp +++ b/validcommands.cpp @@ -18,6 +18,7 @@ ValidCommands::ValidCommands() { commands["read.otu"] = "read.otu"; commands["read.tree"] = "read.tree"; commands["bin.seqs"] = "bin.seqs"; + commands["get.oturep"] = "get.oturep"; commands["cluster"] = "cluster"; commands["deconvolute"] = "deconvolute"; commands["collect.single"] = "collect.single"; diff --git a/validparameter.cpp b/validparameter.cpp index 5278d51..cb6db16 100644 --- a/validparameter.cpp +++ b/validparameter.cpp @@ -292,6 +292,9 @@ void ValidParameters::initCommandParameters() { string binseqsArray[] = {"fasta","line","label","name"}; commandParameters["bin.seqs"] = addParameters(binseqsArray, sizeof(binseqsArray)/sizeof(string)); + string getOTURepArray[] = {"fasta","list","line","label","name"}; + commandParameters["get.oturep"] = addParameters(getOTURepArray, sizeof(getOTURepArray)/sizeof(string)); + string quitArray[] = {}; commandParameters["quit"] = addParameters(quitArray, sizeof(quitArray)/sizeof(string)); -- 2.39.2