From: westcott Date: Tue, 19 Jan 2010 18:15:34 +0000 (+0000) Subject: added out.hierarchy command X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=ee4dd201fa4f2c4ede5b2e525c82cce0a37de363 added out.hierarchy command --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 8e21f64..a44e1f3 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -159,6 +159,7 @@ 7EC3D4560FA0FFF900338DA5 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7EC3D4530FA0FFF900338DA5 /* whittaker.cpp */; }; 8DD76F6A0486A84900D96B5E /* Mothur.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = C6859E8B029090EE04C91782 /* Mothur.1 */; }; A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */; }; + A704E8A31106045D00870157 /* otuhierarchycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A704E8A21106045D00870157 /* otuhierarchycommand.cpp */; }; A70B53AA0F4CD7AD0064797E /* getgroupcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */; }; A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; }; A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; }; @@ -532,6 +533,8 @@ 8DD76F6C0486A84900D96B5E /* mothur */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = mothur; sourceTree = BUILT_PRODUCTS_DIR; }; A7027F2610DFF86D00BF45FE /* preclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = preclustercommand.h; sourceTree = SOURCE_ROOT; }; A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = preclustercommand.cpp; sourceTree = SOURCE_ROOT; }; + A704E8A11106045D00870157 /* otuhierarchycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = otuhierarchycommand.h; sourceTree = SOURCE_ROOT; }; + A704E8A21106045D00870157 /* otuhierarchycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = otuhierarchycommand.cpp; sourceTree = SOURCE_ROOT; }; A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getgroupcommand.cpp; sourceTree = SOURCE_ROOT; }; A70B53A50F4CD7AD0064797E /* getgroupcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getgroupcommand.h; sourceTree = SOURCE_ROOT; }; A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlabelcommand.cpp; sourceTree = SOURCE_ROOT; }; @@ -945,6 +948,8 @@ A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */, 375873F70F7D649C0040F377 /* nocommands.h */, 375873F60F7D649C0040F377 /* nocommands.cpp */, + A704E8A11106045D00870157 /* otuhierarchycommand.h */, + A704E8A21106045D00870157 /* otuhierarchycommand.cpp */, 3792946E0F2E191800B9034A /* parsimonycommand.h */, 3792946F0F2E191800B9034A /* parsimonycommand.cpp */, A7D176CD10F2558500159497 /* pcacommand.h */, @@ -1325,6 +1330,7 @@ A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */, A7D86C7D10FE09AB00865661 /* formatcolumn.cpp in Sources */, A7D86C8C10FE09FE00865661 /* formatphylip.cpp in Sources */, + A704E8A31106045D00870157 /* otuhierarchycommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/collectsharedcommand.cpp b/collectsharedcommand.cpp index 4254950..e68890c 100644 --- a/collectsharedcommand.cpp +++ b/collectsharedcommand.cpp @@ -103,7 +103,7 @@ CollectSharedCommand::CollectSharedCommand(string option){ temp = validParameter.validFile(parameters, "freq", false); if (temp == "not found") { temp = "100"; } convert(temp, freq); - temp = validParameter.validFile(parameters, "all", false); if (temp == "not found") { temp = "true"; } + temp = validParameter.validFile(parameters, "all", false); if (temp == "not found") { temp = "false"; } all = isTrue(temp); if (abort == false) { @@ -184,7 +184,7 @@ void CollectSharedCommand::help(){ mothurOut("The default value for groups is all the groups in your groupfile.\n"); validCalculator->printCalc("shared", cout); mothurOut("The label parameter is used to analyze specific labels in your input.\n"); - mothurOut("The all parameter is used to specify if you want the estimate of all your groups together. This estimate can only be made for sharedsobs and sharedchao calculators. The default is true.\n"); + mothurOut("The all parameter is used to specify if you want the estimate of all your groups together. This estimate can only be made for sharedsobs and sharedchao calculators. The default is false.\n"); mothurOut("If you use sharedchao and run into memory issues, set all to false. \n"); mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n"); mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListfile).\n\n"); diff --git a/commandfactory.cpp b/commandfactory.cpp index 2417d8d..0da5986 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -63,6 +63,7 @@ #include "mgclustercommand.h" #include "preclustercommand.h" #include "pcacommand.h" +#include "otuhierarchycommand.h" /*******************************************************/ @@ -135,6 +136,7 @@ CommandFactory::CommandFactory(){ commands["mgcluster"] = "mgcluster"; commands["pre.cluster"] = "pre.cluster"; commands["pca"] = "pca"; + commands["otu.hierarchy"] = "otu.hierarchy"; } /***********************************************************/ @@ -205,6 +207,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "mgcluster") { command = new MGClusterCommand(optionString); } else if(commandName == "pre.cluster") { command = new PreClusterCommand(optionString); } else if(commandName == "pca") { command = new PCACommand(optionString); } + else if(commandName == "otu.hierarchy") { command = new OtuHierarchyCommand(optionString); } else { command = new NoCommand(optionString); } return command; diff --git a/hcluster.cpp b/hcluster.cpp index 259036f..db93785 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -503,18 +503,21 @@ vector HCluster::getSeqsAN(){ /***********************************************************************/ void HCluster::combineFile() { try { - int bufferSize = 64000; //512k - this should be a variable that the user can set to optimize code to their hardware - char* inputBuffer; - inputBuffer = new char[bufferSize]; - size_t numRead; + //int bufferSize = 64000; //512k - this should be a variable that the user can set to optimize code to their hardware + //char* inputBuffer; + //inputBuffer = new char[bufferSize]; + //size_t numRead; string tempDistFile = distfile + ".temp"; ofstream out; openOutputFile(tempDistFile, out); - FILE* in; - in = fopen(distfile.c_str(), "rb"); + //FILE* in; + //in = fopen(distfile.c_str(), "rb"); + ifstream in; + openInputFile(distfile, in); + int first, second; float dist; @@ -524,28 +527,30 @@ void HCluster::combineFile() { //go through file pulling out distances related to rows merging //if mergedMin contains distances add those back into file - bool done = false; - partialDist = ""; - while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) { + //bool done = false; + //partialDist = ""; + //while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) { //cout << "number of char read = " << numRead << endl; //cout << inputBuffer << endl; - if (numRead < bufferSize) { done = true; } + //if (numRead < bufferSize) { done = true; } //parse input into individual distances - int spot = 0; - string outputString = ""; - while(spot < numRead) { + //int spot = 0; + //string outputString = ""; + //while(spot < numRead) { //cout << "spot = " << spot << endl; - seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize); + // seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize); //you read a partial distance - if (nextDist.seq1 == -1) { break; } - - first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist; + // if (nextDist.seq1 == -1) { break; } + while (!in.eof()) { + //first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist; //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl; //since file is sorted and mergedMin is sorted //you can put the smallest distance from each through the code below and keep the file sorted + in >> first >> second >> dist; gobble(in); + //while there are still values in mergedMin that are smaller than the distance read from file while (count < mergedMin.size()) { @@ -563,7 +568,8 @@ void HCluster::combineFile() { }else if (mergedMin[count].seq2 == smallRow) { smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist; }else { //if no, write to temp file - outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n'; + //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n'; + out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl; } count++; }else{ break; } @@ -582,14 +588,16 @@ void HCluster::combineFile() { smallRowColValues[0][first] = dist; }else { //if no, write to temp file - outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n'; + //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n'; + out << first << '\t' << second << '\t' << dist << endl; } } - out << outputString; - if(done) { break; } - } - fclose(in); + //out << outputString; + //if(done) { break; } + //} + //fclose(in); + in.close(); //if values in mergedMin are larger than the the largest in file then while (count < mergedMin.size()) { @@ -644,7 +652,7 @@ void HCluster::combineFile() { exit(1); } } -/***********************************************************************/ +/*********************************************************************** seqDist HCluster::getNextDist(char* buffer, int& index, int size){ try { seqDist next; diff --git a/hcluster.h b/hcluster.h index bff4328..da7769d 100644 --- a/hcluster.h +++ b/hcluster.h @@ -50,7 +50,7 @@ protected: vector getSeqsAN(); void combineFile(); void processFile(); - seqDist getNextDist(char*, int&, int); + //seqDist getNextDist(char*, int&, int); RAbundVector* rabund; ListVector* list; diff --git a/otuhierarchycommand.cpp b/otuhierarchycommand.cpp new file mode 100644 index 0000000..d095138 --- /dev/null +++ b/otuhierarchycommand.cpp @@ -0,0 +1,279 @@ +/* + * otuhierarchycommand.cpp + * Mothur + * + * Created by westcott on 1/19/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "otuhierarchycommand.h" + +//********************************************************************************************************************** +OtuHierarchyCommand::OtuHierarchyCommand(string option){ + try { + abort = false; + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"list","label"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + + //check to make sure all parameters are valid for command + for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + listFile = validParameter.validFile(parameters, "list", true); + if (listFile == "not found") { mothurOut("list is a required parameter for the otu.hierarchy command."); mothurOutEndLine(); abort = true; } + else if (listFile == "not open") { abort = true; } + + + //check for optional parameter and set defaults + // ...at some point should added some additional type checking... + label = validParameter.validFile(parameters, "label", false); + if (label == "not found") { mothurOut("label is a required parameter for the otu.hierarchy command."); mothurOutEndLine(); abort = true; } + else { + splitAtDash(label, labels); + if (labels.size() != 2) { mothurOut("You must provide 2 labels."); mothurOutEndLine(); abort = true; } + } + } + + } + catch(exception& e) { + errorOut(e, "OtuHierarchyCommand", "OtuHierarchyCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void OtuHierarchyCommand::help(){ + try { + mothurOut("The otu.hierarchy command is used to see how otus relate at two distances. \n"); + mothurOut("The otu.hierarchy command parameters are list and label. Both parameters are required. \n"); + mothurOut("The otu.hierarchy command should be in the following format: \n"); + mothurOut("otu.hierarchy(list=yourListFile, label=yourLabels).\n"); + mothurOut("Example otu.hierarchy(list=amazon.fn.list, label=0.01-0.03).\n"); + mothurOut("The otu.hierarchy command outputs a .otu.hierarchy file which is described on the wiki.\n"); + mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n\n"); + } + catch(exception& e) { + errorOut(e, "OtuHierarchyCommand", "help"); + exit(1); + } +} + +//********************************************************************************************************************** + +OtuHierarchyCommand::~OtuHierarchyCommand(){} + +//********************************************************************************************************************** + +int OtuHierarchyCommand::execute(){ + try { + + if (abort == true) { return 0; } + + //get listvectors that correspond to labels requested, (or use smart distancing to get closest listvector) + vector lists = getListVectors(); + + //determine which is little and which is big, putting little first + if (lists.size() == 2) { + //if big is first swap them + if (lists[0].getNumBins() < lists[1].getNumBins()) { + reverse(lists.begin(), lists.end()); + } + }else{ + mothurOut("error getting listvectors, unable to read 2 different vectors, check your label inputs."); mothurOutEndLine(); return 0; + } + + //map sequences to bin number in the "little" otu + map littleBins; + for (int i = 0; i < lists[0].getNumBins(); i++) { + string names = lists[0].get(i); + + //parse bin + while (names.find_first_of(',') != -1) { + string name = names.substr(0,names.find_first_of(',')); + names = names.substr(names.find_first_of(',')+1, names.length()); + littleBins[name] = i; + } + + //get last name + littleBins[names] = i; + } + + ofstream out; + string outputFileName = getRootName(listFile) + lists[0].getLabel() + "-" + lists[1].getLabel() + ".otu.hierarchy"; + openOutputFile(outputFileName, out); + + //go through each bin in "big" otu and output the bins in "little" otu which created it + for (int i = 0; i < lists[1].getNumBins(); i++) { + + string names = lists[1].get(i); + + //output column 1 + out << names << '\t'; + + map bins; //bin numbers in little that are in this bin in big + map::iterator it; + + //parse bin + while (names.find_first_of(',') != -1) { + string name = names.substr(0,names.find_first_of(',')); + names = names.substr(names.find_first_of(',')+1, names.length()); + bins[littleBins[name]] = littleBins[name]; + } + + //get last name + bins[littleBins[names]] = littleBins[names]; + + string col2 = ""; + for (it = bins.begin(); it != bins.end(); it++) { + col2 += lists[0].get(it->first) + "\t"; + } + + //output column 2 + out << col2 << endl; + } + + out.close(); + + return 0; + } + catch(exception& e) { + errorOut(e, "OtuHierarchyCommand", "execute"); + exit(1); + } +} + +//********************************************************************************************************************** +//returns a vector of listVectors where "little" vector is first +vector OtuHierarchyCommand::getListVectors() { + try { + + int pos; //to use in smart distancing, position of last read in file + int lastPos; + vector lists; + + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. + set processedLabels; + set userLabels = labels; + + //open file + ifstream in; + openInputFile(listFile, in); + + //get first list vector in file + ListVector* list = NULL; + string lastLabel = ""; + if (!in.eof()) { + pos = in.tellg(); + lastPos = pos; + list = new ListVector(in); + gobble(in); + lastLabel = list->getLabel(); + } + + while ((list != NULL) && (userLabels.size() != 0)) { + + //is this a listvector that we want? + if(labels.count(list->getLabel()) == 1){ + + //make copy of listvector + ListVector temp(*list); + lists.push_back(temp); + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + } + + //you have a label the user want that is smaller than this label and the last label has not already been processed + if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = list->getLabel(); + int savePos = in.tellg(); + + //get smart distance line + delete list; + in.seekg(lastPos); + if (!in.eof()) { + list = new ListVector(in); + }else { list = NULL; } + + //make copy of listvector + ListVector temp(*list); + lists.push_back(temp); + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + + //restore real lastlabel to save below + list->setLabel(saveLabel); + in.seekg(savePos); + } + + lastLabel = list->getLabel(); + lastPos = pos; + + //get next line + delete list; + if (!in.eof()) { + pos = in.tellg(); + list = new ListVector(in); + gobble(in); + }else { list = NULL; } + } + + + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine(); + needToRun = true; + }else { + mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + if (list != NULL) { delete list; } + + in.seekg(lastPos); + if (!in.eof()) { + list = new ListVector(in); + + //make copy of listvector + ListVector temp(*list); + lists.push_back(temp); + + delete list; + } + } + + + return lists; + } + catch(exception& e) { + errorOut(e, "OtuHierarchyCommand", "getListVectors"); + exit(1); + } +} + +//********************************************************************************************************************** + + + + + diff --git a/otuhierarchycommand.h b/otuhierarchycommand.h new file mode 100644 index 0000000..39795dc --- /dev/null +++ b/otuhierarchycommand.h @@ -0,0 +1,38 @@ +#ifndef OTUHIERARCHYCOMMAND_H +#define OTUHIERARCHYCOMMAND_H +/* + * otuhierarchycommand.h + * Mothur + * + * Created by westcott on 1/19/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "command.hpp" +#include "listvector.hpp" + +//********************************************************************************************************************** + +class OtuHierarchyCommand : public Command { + +public: + OtuHierarchyCommand(string); + ~OtuHierarchyCommand(); + int execute(); + void help(); + +private: + bool abort; + set labels; //holds labels to be used + string label, listFile; + + vector getListVectors(); + +}; + +//********************************************************************************************************************** + +#endif + + diff --git a/summarysharedcommand.cpp b/summarysharedcommand.cpp index 65fcc9a..e39d70b 100644 --- a/summarysharedcommand.cpp +++ b/summarysharedcommand.cpp @@ -95,7 +95,7 @@ SummarySharedCommand::SummarySharedCommand(string option){ globaldata->Groups = Groups; } - string temp = validParameter.validFile(parameters, "all", false); if (temp == "not found") { temp = "true"; } + string temp = validParameter.validFile(parameters, "all", false); if (temp == "not found") { temp = "false"; } all = isTrue(temp); if (abort == false) { @@ -176,7 +176,7 @@ void SummarySharedCommand::help(){ mothurOut("The default value for calc is sharedsobs-sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan\n"); mothurOut("The default value for groups is all the groups in your groupfile.\n"); mothurOut("The label parameter is used to analyze specific labels in your input.\n"); - mothurOut("The all parameter is used to specify if you want the estimate of all your groups together. This estimate can only be made for sharedsobs and sharedchao calculators. The default is true.\n"); + mothurOut("The all parameter is used to specify if you want the estimate of all your groups together. This estimate can only be made for sharedsobs and sharedchao calculators. The default is false.\n"); mothurOut("If you use sharedchao and run into memory issues, set all to false. \n"); mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n"); mothurOut("Note: No spaces between parameter labels (i.e. label), '=' and parameters (i.e.yourLabel).\n\n");