From f06b339c5fc4b6d1b9d2a08fe16bf7670bf7aeb4 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Tue, 26 Jun 2012 11:25:18 -0400 Subject: [PATCH] made make.table alias to count.seqs command. added large parameter to count.seqs to allow for creating the table without storing files in ram. added subsampling to rarefaction.shared. added countable current type. --- commandfactory.cpp | 7 +- countseqscommand.cpp | 297 +++++++++++++++++++++++++++++++++++--- countseqscommand.h | 10 +- getcurrentcommand.cpp | 2 + mothurout.cpp | 18 +-- mothurout.h | 5 +- optionparser.cpp | 2 + rarefactsharedcommand.cpp | 206 +++++++++++++++++++++++++- rarefactsharedcommand.h | 5 +- 9 files changed, 510 insertions(+), 42 deletions(-) diff --git a/commandfactory.cpp b/commandfactory.cpp index 54ff4a1..02af676 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -289,6 +289,7 @@ CommandFactory::CommandFactory(){ commands["remove.otulabels"] = "remove.otulabels"; commands["make.contigs"] = "make.contigs"; commands["load.logfile"] = "load.logfile"; + commands["make.table"] = "make.table"; commands["quit"] = "MPIEnabled"; } @@ -482,7 +483,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "make.shared") { command = new SharedCommand(optionString); } else if(commandName == "get.commandinfo") { command = new GetCommandInfoCommand(optionString); } else if(commandName == "deunique.tree") { command = new DeuniqueTreeCommand(optionString); } - else if(commandName == "count.seqs") { command = new CountSeqsCommand(optionString); } + else if((commandName == "count.seqs") || (commandName == "make.table")) { command = new CountSeqsCommand(optionString); } else if(commandName == "count.groups") { command = new CountGroupsCommand(optionString); } else if(commandName == "clear.memory") { command = new ClearMemoryCommand(optionString); } else if(commandName == "summary.tax") { command = new SummaryTaxCommand(optionString); } @@ -636,7 +637,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "make.shared") { pipecommand = new SharedCommand(optionString); } else if(commandName == "get.commandinfo") { pipecommand = new GetCommandInfoCommand(optionString); } else if(commandName == "deunique.tree") { pipecommand = new DeuniqueTreeCommand(optionString); } - else if(commandName == "count.seqs") { pipecommand = new CountSeqsCommand(optionString); } + else if((commandName == "count.seqs") || (commandName == "make.table")) { pipecommand = new CountSeqsCommand(optionString); } else if(commandName == "count.groups") { pipecommand = new CountGroupsCommand(optionString); } else if(commandName == "clear.memory") { pipecommand = new ClearMemoryCommand(optionString); } else if(commandName == "summary.tax") { pipecommand = new SummaryTaxCommand(optionString); } @@ -776,7 +777,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "make.shared") { shellcommand = new SharedCommand(); } else if(commandName == "get.commandinfo") { shellcommand = new GetCommandInfoCommand(); } else if(commandName == "deunique.tree") { shellcommand = new DeuniqueTreeCommand(); } - else if(commandName == "count.seqs") { shellcommand = new CountSeqsCommand(); } + else if((commandName == "count.seqs") || (commandName == "make.table")) { shellcommand = new CountSeqsCommand(); } else if(commandName == "count.groups") { shellcommand = new CountGroupsCommand(); } else if(commandName == "clear.memory") { shellcommand = new ClearMemoryCommand(); } else if(commandName == "summary.tax") { shellcommand = new SummaryTaxCommand(); } diff --git a/countseqscommand.cpp b/countseqscommand.cpp index 8a18fc7..210dd96 100644 --- a/countseqscommand.cpp +++ b/countseqscommand.cpp @@ -16,6 +16,7 @@ vector CountSeqsCommand::setParameters(){ try { CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname); CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -33,11 +34,12 @@ vector CountSeqsCommand::setParameters(){ string CountSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The count.seqs command reads a name file and outputs a .seq.count file. You may also provide a group file to get the counts broken down by group.\n"; + helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count.table file. You may also provide a group file to get the counts broken down by group.\n"; helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n"; + helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n"; helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n"; helpString += "The count.seqs command should be in the following format: count.seqs(name=yourNameFile).\n"; - helpString += "Example count.seqs(name=amazon.names).\n"; + helpString += "Example count.seqs(name=amazon.names) or make.table(name=amazon.names).\n"; helpString += "Note: No spaces between parameter labels (i.e. name), '=' and parameters (i.e.yourNameFile).\n"; return helpString; } @@ -56,7 +58,7 @@ string CountSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ it = outputTypes.find(type); if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } else { - if (type == "summary") { outputFileName = "seq.count"; } + if (type == "counttable") { outputFileName = "count.table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -72,7 +74,7 @@ CountSeqsCommand::CountSeqsCommand(){ abort = true; calledHelp = true; setParameters(); vector tempOutNames; - outputTypes["summary"] = tempOutNames; + outputTypes["counttable"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand"); @@ -104,7 +106,7 @@ CountSeqsCommand::CountSeqsCommand(string option) { //initialize outputTypes vector tempOutNames; - outputTypes["summary"] = tempOutNames; + outputTypes["counttable"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter @@ -146,6 +148,9 @@ CountSeqsCommand::CountSeqsCommand(string option) { groups = validParameter.validFile(parameters, "groups", false); if (groups == "not found") { groups = "all"; } m->splitAtDash(groups, Groups); + + string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; } + large = m->isTrue(temp); //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(namefile); } @@ -165,12 +170,45 @@ int CountSeqsCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } - ofstream out; - string outputFileName = outputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("summary"); - m->openOutputFile(outputFileName, out); outputTypes["summary"].push_back(outputFileName); - out << "Representative_Sequence\ttotal\t"; + string outputFileName = outputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("counttable"); + + int total = 0; + if (!large) { total = processSmall(outputFileName); } + else { total = processLarge(outputFileName); } + + if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; } + + //set rabund file as new current rabundfile + itTypes = outputTypes.find("counttable"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { string current = (itTypes->second)[0]; m->setCountTableFile(current); } + } + + m->mothurOutEndLine(); + m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine(); + m->mothurOutEndLine(); + m->mothurOut("Output File Name: "); m->mothurOutEndLine(); + m->mothurOut(outputFileName); m->mothurOutEndLine(); + m->mothurOutEndLine(); - GroupMap* groupMap; + return 0; + } + + catch(exception& e) { + m->errorOut(e, "CountSeqsCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** + +int CountSeqsCommand::processSmall(string outputFileName){ + try { + ofstream out; + m->openOutputFile(outputFileName, out); outputTypes["counttable"].push_back(outputFileName); + outputNames.push_back(outputFileName); outputTypes["counttable"].push_back(outputFileName); + out << "Representative_Sequence\ttotal\t"; + + GroupMap* groupMap; if (groupfile != "") { groupMap = new GroupMap(groupfile); groupMap->readMap(); @@ -199,7 +237,7 @@ int CountSeqsCommand::execute(){ if (m->control_pressed) { break; } string firstCol, secondCol; - in >> firstCol >> secondCol; m->gobble(in); + in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in); vector names; m->splitAtChar(secondCol, names, ','); @@ -240,24 +278,241 @@ int CountSeqsCommand::execute(){ total += names.size(); } in.close(); + out.close(); if (groupfile != "") { delete groupMap; } + + return total; + } + catch(exception& e) { + m->errorOut(e, "CountSeqsCommand", "processSmall"); + exit(1); + } +} +//********************************************************************************************************************** + +int CountSeqsCommand::processLarge(string outputFileName){ + try { + set namesOfGroups; + map initial; + for (set::iterator it = namesOfGroups.begin(); it != namesOfGroups.end(); it++) { initial[(*it)] = 0; } + ofstream out; + m->openOutputFile(outputFileName, out); + outputNames.push_back(outputFileName); outputTypes["counttable"].push_back(outputFileName); + out << "Representative_Sequence\ttotal\t"; + if (groupfile == "") { out << endl; } + + map namesToIndex; + string outfile = m->getRootName(groupfile) + "sorted.groups.temp"; + string outName = m->getRootName(namefile) + "sorted.name.temp"; + map indexToName; + map indexToGroup; + if (groupfile != "") { + time_t estart = time(NULL); + //convert name file to redundant -> unique. set unique name equal to index so we can use vectors, save name for later. + string newNameFile = m->getRootName(namefile) + ".name.temp"; + string newGroupFile = m->getRootName(groupfile) + ".group.temp"; + indexToName = processNameFile(newNameFile); + indexToGroup = getGroupNames(newGroupFile, namesOfGroups); + + //sort file by first column so the names of sequences will be easier to find + //use the unix sort + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + string command = "sort -n " + newGroupFile + " -o " + outfile; + system(command.c_str()); + command = "sort -n " + newNameFile + " -o " + outName; + system(command.c_str()); + #else //sort using windows sort + string command = "sort " + newGroupFile + " /O " + outfile; + system(command.c_str()); + command = "sort " + newNameFile + " /O " + outName; + system(command.c_str()); + #endif + m->mothurRemove(newNameFile); + m->mothurRemove(newGroupFile); + + m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to sort and index the group and name files. "); m->mothurOutEndLine(); + }else { outName = namefile; } + + time_t estart = time(NULL); + //open input file + ifstream in; + m->openInputFile(outName, in); + + //open input file + ifstream in2; - if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; } + int total = 0; + vector< vector > nameMapCount; + if (groupfile != "") { + m->openInputFile(outfile, in2); + nameMapCount.resize(indexToName.size()); + for (int i = 0; i < nameMapCount.size(); i++) { + nameMapCount[i].resize(indexToGroup.size(), 0); + } + } + + while (!in.eof()) { + if (m->control_pressed) { break; } + + string firstCol; + in >> firstCol; m->gobble(in); + + if (groupfile != "") { + int uniqueIndex; + in >> uniqueIndex; m->gobble(in); + + string name; int groupIndex; + in2 >> name >> groupIndex; m->gobble(in2); + + if (name != firstCol) { m->mothurOut("[ERROR]: found " + name + " in your groupfile, but " + firstCol + " was in your namefile, please correct.\n"); m->control_pressed = true; } + + nameMapCount[uniqueIndex][groupIndex]++; + total++; + }else { + string secondCol; + in >> secondCol; m->gobble(in); + int num = m->getNumNames(secondCol); + out << firstCol << '\t' << num << endl; + total += num; + } + } + in.close(); + + if (groupfile != "") { + m->mothurRemove(outfile); + m->mothurRemove(outName); + in2.close(); + for (map::iterator it = indexToGroup.begin(); it != indexToGroup.end(); it++) { out << it->second << '\t'; } + out << endl; + for (int i = 0; i < nameMapCount.size(); i++) { + string totalsLine = ""; + int seqTotal = 0; + for (int j = 0; j < nameMapCount[i].size(); j++) { + seqTotal += nameMapCount[i][j]; + totalsLine += toString(nameMapCount[i][j]) + '\t'; + } + out << indexToName[i] << '\t' << seqTotal << '\t' << totalsLine << endl; + } + } + + out.close(); + + m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to create the count table file. "); m->mothurOutEndLine(); + + return total; + } + catch(exception& e) { + m->errorOut(e, "CountSeqsCommand", "processLarge"); + exit(1); + } +} +/**************************************************************************************************/ +map CountSeqsCommand::processNameFile(string name) { + try { + map indexToNames; + + ofstream out; + m->openOutputFile(name, out); + + //open input file + ifstream in; + m->openInputFile(namefile, in); + + string rest = ""; + char buffer[4096]; + bool pairDone = false; + bool columnOne = true; + string firstCol, secondCol; + int count = 0; + + while (!in.eof()) { + if (m->control_pressed) { break; } + + in.read(buffer, 4096); + vector pieces = m->splitWhiteSpace(rest, buffer, in.gcount()); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + //parse names into vector + vector theseNames; + m->splitAtComma(secondCol, theseNames); + for (int i = 0; i < theseNames.size(); i++) { out << theseNames[i] << '\t' << count << endl; } + indexToNames[count] = firstCol; + pairDone = false; + count++; + } + } + } + in.close(); + out.close(); - m->mothurOutEndLine(); - m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine(); - m->mothurOutEndLine(); - m->mothurOut("Output File Name: "); m->mothurOutEndLine(); - m->mothurOut(outputFileName); m->mothurOutEndLine(); - m->mothurOutEndLine(); + return indexToNames; + } + catch(exception& e) { + m->errorOut(e, "CountSeqsCommand", "processNameFile"); + exit(1); + } +} +/**************************************************************************************************/ +map CountSeqsCommand::getGroupNames(string filename, set& namesOfGroups) { + try { + map indexToGroups; + map groupIndex; + map::iterator it; + + ofstream out; + m->openOutputFile(filename, out); + + //open input file + ifstream in; + m->openInputFile(groupfile, in); + + string rest = ""; + char buffer[4096]; + bool pairDone = false; + bool columnOne = true; + string firstCol, secondCol; + int count = 0; + + while (!in.eof()) { + if (m->control_pressed) { break; } + + in.read(buffer, 4096); + vector pieces = m->splitWhiteSpace(rest, buffer, in.gcount()); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + it = groupIndex.find(secondCol); + if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above + groupIndex[secondCol] = count; + count++; + } + out << firstCol << '\t' << groupIndex[secondCol] << endl; + namesOfGroups.insert(secondCol); + pairDone = false; + } + } + } + in.close(); + out.close(); - return 0; + for (it = groupIndex.begin(); it != groupIndex.end(); it++) { indexToGroups[it->second] = it->first; } + + return indexToGroups; } - catch(exception& e) { - m->errorOut(e, "CountSeqsCommand", "execute"); + m->errorOut(e, "CountSeqsCommand", "getGroupNames"); exit(1); } } //********************************************************************************************************************** + + + diff --git a/countseqscommand.h b/countseqscommand.h index 54982c1..555d6a7 100644 --- a/countseqscommand.h +++ b/countseqscommand.h @@ -34,8 +34,14 @@ public: private: string namefile, groupfile, outputDir, groups; - bool abort; - vector Groups; + bool abort, large; + vector Groups, outputNames; + + int processSmall(string); + int processLarge(string); + map processNameFile(string); + map getGroupNames(string, set&); + }; #endif diff --git a/getcurrentcommand.cpp b/getcurrentcommand.cpp index ca83231..c3b5f0a 100644 --- a/getcurrentcommand.cpp +++ b/getcurrentcommand.cpp @@ -140,6 +140,8 @@ int GetCurrentCommand::execute(){ m->setFlowFile(""); }else if (types[i] == "biom") { m->setBiomFile(""); + }else if (types[i] == "counttable") { + m->setCountTableFile(""); }else if (types[i] == "processors") { m->setProcessors("1"); }else if (types[i] == "all") { diff --git a/mothurout.cpp b/mothurout.cpp index 64f0bc8..14840c1 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -43,6 +43,7 @@ set MothurOut::getCurrentTypes() { types.insert("tree"); types.insert("flow"); types.insert("biom"); + types.insert("counttable"); types.insert("processors"); return types; @@ -78,6 +79,7 @@ void MothurOut::printCurrentFiles() { if (treefile != "") { mothurOut("tree=" + treefile); mothurOutEndLine(); } if (flowfile != "") { mothurOut("flow=" + flowfile); mothurOutEndLine(); } if (biomfile != "") { mothurOut("biom=" + biomfile); mothurOutEndLine(); } + if (counttablefile != "") { mothurOut("counttable=" + counttablefile); mothurOutEndLine(); } if (processors != "1") { mothurOut("processors=" + processors); mothurOutEndLine(); } } @@ -112,6 +114,7 @@ bool MothurOut::hasCurrentFiles() { if (treefile != "") { return true; } if (flowfile != "") { return true; } if (biomfile != "") { return true; } + if (counttablefile != "") { return true; } if (processors != "1") { return true; } return hasCurrent; @@ -147,6 +150,7 @@ void MothurOut::clearCurrentFiles() { taxonomyfile = ""; flowfile = ""; biomfile = ""; + counttablefile = ""; processors = "1"; } catch(exception& e) { @@ -1291,16 +1295,6 @@ vector MothurOut::setFilePosEachLine(string filename, int& n positions.push_back(0); while(!in.eof()){ - //unsigned long long lastpos = in.tellg(); - //input = getline(in); - //if (input.length() != 0) { - //unsigned long long pos = in.tellg(); - //if (pos != -1) { positions.push_back(pos - input.length() - 1); } - //else { positions.push_back(lastpos); } - //} - //gobble(in); //has to be here since windows line endings are 2 characters and mess up the positions - - //getline counting reads char d = in.get(); count++; while ((d != '\n') && (d != '\r') && (d != '\f') && (d != in.eof())) { @@ -1503,7 +1497,7 @@ vector MothurOut::splitWhiteSpace(string& rest, char buffer[], int size) for (int i = 0; i < size; i++) { if (!isspace(buffer[i])) { rest += buffer[i]; } else { - pieces.push_back(rest); rest = ""; + if (rest != "") { pieces.push_back(rest); rest = ""; } while (i < size) { //gobble white space if (isspace(buffer[i])) { i++; } else { rest = buffer[i]; break; } //cout << "next piece buffer = " << nextPiece << endl; @@ -1527,7 +1521,7 @@ vector MothurOut::splitWhiteSpace(string input){ for (int i = 0; i < input.length(); i++) { if (!isspace(input[i])) { rest += input[i]; } else { - pieces.push_back(rest); rest = ""; + if (rest != "") { pieces.push_back(rest); rest = ""; } while (i < input.length()) { //gobble white space if (isspace(input[i])) { i++; } else { rest = input[i]; break; } //cout << "next piece buffer = " << nextPiece << endl; diff --git a/mothurout.h b/mothurout.h index 968ff97..98565dc 100644 --- a/mothurout.h +++ b/mothurout.h @@ -174,6 +174,7 @@ class MothurOut { string getTaxonomyFile() { return taxonomyfile; } string getFlowFile() { return flowfile; } string getBiomFile() { return biomfile; } + string getCountTableFile() { return counttablefile; } string getProcessors() { return processors; } void setListFile(string f) { listfile = getFullPathName(f); } @@ -197,6 +198,7 @@ class MothurOut { void setTaxonomyFile(string f) { taxonomyfile = getFullPathName(f); } void setFlowFile(string f) { flowfile = getFullPathName(f); } void setBiomFile(string f) { biomfile = getFullPathName(f); } + void setCountTableFile(string f) { counttablefile = getFullPathName(f); } void setProcessors(string p) { processors = p; mothurOut("\nUsing " + toString(p) + " processors.\n"); } void printCurrentFiles(); @@ -232,6 +234,7 @@ class MothurOut { processors = "1"; flowfile = ""; biomfile = ""; + counttablefile = ""; gui = false; printedHeaders = false; commandInputsConvertError = false; @@ -246,7 +249,7 @@ class MothurOut { string releaseDate, version; string accnosfile, phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, designfile, taxonomyfile, biomfile; - string orderfile, treefile, sharedfile, ordergroupfile, relabundfile, fastafile, qualfile, sfffile, oligosfile, processors, flowfile; + string orderfile, treefile, sharedfile, ordergroupfile, relabundfile, fastafile, qualfile, sfffile, oligosfile, processors, flowfile, counttablefile; vector Groups; vector namesOfGroups; diff --git a/optionparser.cpp b/optionparser.cpp index 2d13cd0..e3850e5 100644 --- a/optionparser.cpp +++ b/optionparser.cpp @@ -124,6 +124,8 @@ map OptionParser::getParameters() { it->second = m->getTaxonomyFile(); }else if (it->first == "biom") { it->second = m->getBiomFile(); + }else if (it->first == "counttable") { + it->second = m->getCountTableFile(); }else { m->mothurOut("[ERROR]: mothur does not save a current file for " + it->first); m->mothurOutEndLine(); } diff --git a/rarefactsharedcommand.cpp b/rarefactsharedcommand.cpp index a312df0..6451041 100644 --- a/rarefactsharedcommand.cpp +++ b/rarefactsharedcommand.cpp @@ -11,6 +11,7 @@ #include "sharedsobs.h" #include "sharednseqs.h" #include "sharedutilities.h" +#include "subsample.h" //********************************************************************************************************************** vector RareFactSharedCommand::setParameters(){ @@ -21,6 +22,8 @@ vector RareFactSharedCommand::setParameters(){ CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter pcalc("calc", "Multiple", "sharednseqs-sharedobserved", "sharedobserved", "", "", "",true,false); parameters.push_back(pcalc); + CommandParameter psubsampleiters("subsampleiters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(psubsampleiters); + CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample); CommandParameter pjumble("jumble", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pjumble); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter psets("sets", "String", "", "", "", "", "",false,false); parameters.push_back(psets); @@ -50,6 +53,8 @@ string RareFactSharedCommand::getHelpString(){ helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n"; helpString += "Example rarefaction.shared(label=unique-0.01-0.03, iters=10000, groups=B-C, jumble=T, calc=sharedobserved).\n"; helpString += "The default values for iters is 1000, freq is 100, and calc is sharedobserved which calculates the shared rarefaction curve for the observed richness.\n"; + helpString += "The subsampleiters parameter allows you to choose the number of times you would like to run the subsample.\n"; + helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group.\n"; helpString += "The default value for groups is all the groups in your groupfile, and jumble is true.\n"; helpString += validCalculator.printCalc("sharedrarefaction"); helpString += "The label parameter is used to analyze specific labels in your input.\n"; @@ -219,6 +224,18 @@ RareFactSharedCommand::RareFactSharedCommand(string option) { temp = validParameter.validFile(parameters, "groupmode", false); if (temp == "not found") { temp = "T"; } groupMode = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "subsampleiters", false); if (temp == "not found") { temp = "1000"; } + m->mothurConvert(temp, iters); + + temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; } + if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; } + else { + if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later + else { subsample = false; } + } + + if (subsample == false) { iters = 1; } } @@ -301,7 +318,35 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ } } } - + + /******************************************************/ + if (subsample) { + if (subsampleSize == -1) { //user has not set size, set size = smallest samples size + subsampleSize = subset[0]->getNumSeqs(); + for (int i = 1; i < subset.size(); i++) { + int thisSize = subset[i]->getNumSeqs(); + + if (thisSize < subsampleSize) { subsampleSize = thisSize; } + } + }else { + newGroups.clear(); + vector temp; + for (int i = 0; i < subset.size(); i++) { + if (subset[i]->getNumSeqs() < subsampleSize) { + m->mothurOut(subset[i]->getGroup() + " contains " + toString(subset[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine(); + delete subset[i]; + }else { + newGroups.push_back(subset[i]->getGroup()); + temp.push_back(subset[i]); + } + } + subset = temp; + } + + if (subset.size() < 2) { m->mothurOut("You have not provided enough valid groups. I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; return 0; } + } + /******************************************************/ + ValidCalculators validCalculator; for (int i=0; igetSharedCurve(freq, nIters); delete rCurve; + if (subsample) { subsampleLookup(subset, fileNameRoot); } + processedLabels.insert(subset[0]->getLabel()); userLabels.erase(subset[0]->getLabel()); } @@ -372,6 +419,8 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ rCurve->getSharedCurve(freq, nIters); delete rCurve; + if (subsample) { subsampleLookup(subset, fileNameRoot); } + processedLabels.insert(subset[0]->getLabel()); userLabels.erase(subset[0]->getLabel()); @@ -444,6 +493,9 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ rCurve = new Rarefact(subset, rDisplays); rCurve->getSharedCurve(freq, nIters); delete rCurve; + + if (subsample) { subsampleLookup(subset, fileNameRoot); } + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } } @@ -458,6 +510,158 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ } } //********************************************************************************************************************** +int RareFactSharedCommand::subsampleLookup(vector& thisLookup, string fileNameRoot) { + try { + + map > filenames; + for (int thisIter = 0; thisIter < iters; thisIter++) { + + vector thisItersLookup = thisLookup; + + //we want the summary results for the whole dataset, then the subsampling + SubSample sample; + vector tempLabels; //dont need since we arent printing the sampled sharedRabunds + + //make copy of lookup so we don't get access violations + vector newLookup; + for (int k = 0; k < thisItersLookup.size(); k++) { + SharedRAbundVector* temp = new SharedRAbundVector(); + temp->setLabel(thisItersLookup[k]->getLabel()); + temp->setGroup(thisItersLookup[k]->getGroup()); + newLookup.push_back(temp); + } + + //for each bin + for (int k = 0; k < thisItersLookup[0]->getNumBins(); k++) { + if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; } + for (int j = 0; j < thisItersLookup.size(); j++) { newLookup[j]->push_back(thisItersLookup[j]->getAbundance(k), thisItersLookup[j]->getGroup()); } + } + + tempLabels = sample.getSample(newLookup, subsampleSize); + thisItersLookup = newLookup; + + + Rarefact* rCurve; + vector rDisplays; + + string thisfileNameRoot = fileNameRoot + toString(thisIter); + + ValidCalculators validCalculator; + for (int i=0; igetSharedCurve(freq, nIters); + delete rCurve; + + //clean up memory + for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; } + thisItersLookup.clear(); + for(int i=0;i > > results; //iter -> numSampled -> data + for (map >::iterator it = filenames.begin(); it != filenames.end(); it++) { + vector thisTypesFiles = it->second; + vector columnHeaders; + for (int i = 0; i < thisTypesFiles.size(); i++) { + ifstream in; + m->openInputFile(thisTypesFiles[i], in); + + string headers = m->getline(in); m->gobble(in); + columnHeaders = m->splitWhiteSpace(headers); + int numCols = columnHeaders.size(); + + vector > thisFilesLines; + while (!in.eof()) { + if (m->control_pressed) { break; } + vector data; data.resize(numCols, 0); + //read numSampled line + for (int j = 0; j < numCols; j++) { in >> data[j]; m->gobble(in); } + thisFilesLines.push_back(data); + } + in.close(); + results.push_back(thisFilesLines); + m->mothurRemove(thisTypesFiles[i]); + } + + if (!m->control_pressed) { + //process results + string outputFile = fileNameRoot + "ave-std." + thisLookup[0]->getLabel() + "." + getOutputFileNameTag(it->first); + ofstream out; + m->openOutputFile(outputFile, out); + outputNames.push_back(outputFile); outputTypes[it->first].push_back(outputFile); + + out << columnHeaders[0] << '\t' << "method\t"; + for (int i = 1; i < columnHeaders.size(); i++) { out << columnHeaders[i] << '\t'; } + out << endl; + + vector< vector > aveResults; aveResults.resize(results[0].size()); + for (int i = 0; i < aveResults.size(); i++) { aveResults[i].resize(results[0][i].size(), 0.0); } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator + for (int i = 0; i < aveResults.size(); i++) { //initialize sums to zero. + aveResults[i][0] = results[thisIter][i][0]; + for (int j = 1; j < aveResults[i].size(); j++) { + aveResults[i][j] += results[thisIter][i][j]; + } + } + } + + for (int i = 0; i < aveResults.size(); i++) { //finds average. + for (int j = 1; j < aveResults[i].size(); j++) { + aveResults[i][j] /= (float) iters; + } + } + + //standard deviation + vector< vector > stdResults; stdResults.resize(results[0].size()); + for (int i = 0; i < stdResults.size(); i++) { stdResults[i].resize(results[0][i].size(), 0.0); } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each + for (int i = 0; i < stdResults.size(); i++) { + stdResults[i][0] = aveResults[i][0]; + for (int j = 1; j < stdResults[i].size(); j++) { + stdResults[i][j] += ((results[thisIter][i][j] - aveResults[i][j]) * (results[thisIter][i][j] - aveResults[i][j])); + } + } + } + + for (int i = 0; i < stdResults.size(); i++) { //finds average. + out << aveResults[i][0] << '\t' << "ave\t"; + for (int j = 1; j < aveResults[i].size(); j++) { out << aveResults[i][j] << '\t'; } + out << endl; + out << stdResults[i][0] << '\t' << "std\t"; + for (int j = 1; j < stdResults[i].size(); j++) { + stdResults[i][j] /= (float) iters; + stdResults[i][j] = sqrt(stdResults[i][j]); + out << stdResults[i][j] << '\t'; + } + out << endl; + } + out.close(); + } + } + + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RareFactSharedCommand", "subsample"); + exit(1); + } +} +//********************************************************************************************************************** vector RareFactSharedCommand::createGroupFile(vector& outputNames) { try { diff --git a/rarefactsharedcommand.h b/rarefactsharedcommand.h index a5b4546..af73e13 100644 --- a/rarefactsharedcommand.h +++ b/rarefactsharedcommand.h @@ -37,18 +37,19 @@ public: private: vector lookup; - int nIters; + int nIters, subsampleSize, iters; string format; float freq; map file2Group; //index in outputNames[i] -> group - bool abort, allLines, jumble, groupMode; + bool abort, allLines, jumble, groupMode, subsample; set labels; //holds labels to be used string label, calc, groups, outputDir, sharedfile, designfile; vector Estimators, Groups, outputNames, Sets; int process(GroupMap&, string); vector createGroupFile(vector&); + int subsampleLookup(vector&, string); }; -- 2.39.2