From: Sarah Westcott Date: Tue, 26 Jun 2012 15:25:18 +0000 (-0400) Subject: made make.table alias to count.seqs command. added large parameter to count.seqs... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=f06b339c5fc4b6d1b9d2a08fe16bf7670bf7aeb4 made make.table alias to count.seqs command. added large parameter to count.seqs to allow for creating the table without storing files in ram. added subsampling to rarefaction.shared. added countable current type. --- diff --git a/commandfactory.cpp b/commandfactory.cpp index 54ff4a1..02af676 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -289,6 +289,7 @@ CommandFactory::CommandFactory(){ commands["remove.otulabels"] = "remove.otulabels"; commands["make.contigs"] = "make.contigs"; commands["load.logfile"] = "load.logfile"; + commands["make.table"] = "make.table"; commands["quit"] = "MPIEnabled"; } @@ -482,7 +483,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "make.shared") { command = new SharedCommand(optionString); } else if(commandName == "get.commandinfo") { command = new GetCommandInfoCommand(optionString); } else if(commandName == "deunique.tree") { command = new DeuniqueTreeCommand(optionString); } - else if(commandName == "count.seqs") { command = new CountSeqsCommand(optionString); } + else if((commandName == "count.seqs") || (commandName == "make.table")) { command = new CountSeqsCommand(optionString); } else if(commandName == "count.groups") { command = new CountGroupsCommand(optionString); } else if(commandName == "clear.memory") { command = new ClearMemoryCommand(optionString); } else if(commandName == "summary.tax") { command = new SummaryTaxCommand(optionString); } @@ -636,7 +637,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "make.shared") { pipecommand = new SharedCommand(optionString); } else if(commandName == "get.commandinfo") { pipecommand = new GetCommandInfoCommand(optionString); } else if(commandName == "deunique.tree") { pipecommand = new DeuniqueTreeCommand(optionString); } - else if(commandName == "count.seqs") { pipecommand = new CountSeqsCommand(optionString); } + else if((commandName == "count.seqs") || (commandName == "make.table")) { pipecommand = new CountSeqsCommand(optionString); } else if(commandName == "count.groups") { pipecommand = new CountGroupsCommand(optionString); } else if(commandName == "clear.memory") { pipecommand = new ClearMemoryCommand(optionString); } else if(commandName == "summary.tax") { pipecommand = new SummaryTaxCommand(optionString); } @@ -776,7 +777,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "make.shared") { shellcommand = new SharedCommand(); } else if(commandName == "get.commandinfo") { shellcommand = new GetCommandInfoCommand(); } else if(commandName == "deunique.tree") { shellcommand = new DeuniqueTreeCommand(); } - else if(commandName == "count.seqs") { shellcommand = new CountSeqsCommand(); } + else if((commandName == "count.seqs") || (commandName == "make.table")) { shellcommand = new CountSeqsCommand(); } else if(commandName == "count.groups") { shellcommand = new CountGroupsCommand(); } else if(commandName == "clear.memory") { shellcommand = new ClearMemoryCommand(); } else if(commandName == "summary.tax") { shellcommand = new SummaryTaxCommand(); } diff --git a/countseqscommand.cpp b/countseqscommand.cpp index 8a18fc7..210dd96 100644 --- a/countseqscommand.cpp +++ b/countseqscommand.cpp @@ -16,6 +16,7 @@ vector CountSeqsCommand::setParameters(){ try { CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname); CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -33,11 +34,12 @@ vector CountSeqsCommand::setParameters(){ string CountSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The count.seqs command reads a name file and outputs a .seq.count file. You may also provide a group file to get the counts broken down by group.\n"; + helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count.table file. You may also provide a group file to get the counts broken down by group.\n"; helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n"; + helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n"; helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n"; helpString += "The count.seqs command should be in the following format: count.seqs(name=yourNameFile).\n"; - helpString += "Example count.seqs(name=amazon.names).\n"; + helpString += "Example count.seqs(name=amazon.names) or make.table(name=amazon.names).\n"; helpString += "Note: No spaces between parameter labels (i.e. name), '=' and parameters (i.e.yourNameFile).\n"; return helpString; } @@ -56,7 +58,7 @@ string CountSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ it = outputTypes.find(type); if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } else { - if (type == "summary") { outputFileName = "seq.count"; } + if (type == "counttable") { outputFileName = "count.table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -72,7 +74,7 @@ CountSeqsCommand::CountSeqsCommand(){ abort = true; calledHelp = true; setParameters(); vector tempOutNames; - outputTypes["summary"] = tempOutNames; + outputTypes["counttable"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand"); @@ -104,7 +106,7 @@ CountSeqsCommand::CountSeqsCommand(string option) { //initialize outputTypes vector tempOutNames; - outputTypes["summary"] = tempOutNames; + outputTypes["counttable"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter @@ -146,6 +148,9 @@ CountSeqsCommand::CountSeqsCommand(string option) { groups = validParameter.validFile(parameters, "groups", false); if (groups == "not found") { groups = "all"; } m->splitAtDash(groups, Groups); + + string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; } + large = m->isTrue(temp); //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(namefile); } @@ -165,12 +170,45 @@ int CountSeqsCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } - ofstream out; - string outputFileName = outputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("summary"); - m->openOutputFile(outputFileName, out); outputTypes["summary"].push_back(outputFileName); - out << "Representative_Sequence\ttotal\t"; + string outputFileName = outputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("counttable"); + + int total = 0; + if (!large) { total = processSmall(outputFileName); } + else { total = processLarge(outputFileName); } + + if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; } + + //set rabund file as new current rabundfile + itTypes = outputTypes.find("counttable"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { string current = (itTypes->second)[0]; m->setCountTableFile(current); } + } + + m->mothurOutEndLine(); + m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine(); + m->mothurOutEndLine(); + m->mothurOut("Output File Name: "); m->mothurOutEndLine(); + m->mothurOut(outputFileName); m->mothurOutEndLine(); + m->mothurOutEndLine(); - GroupMap* groupMap; + return 0; + } + + catch(exception& e) { + m->errorOut(e, "CountSeqsCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** + +int CountSeqsCommand::processSmall(string outputFileName){ + try { + ofstream out; + m->openOutputFile(outputFileName, out); outputTypes["counttable"].push_back(outputFileName); + outputNames.push_back(outputFileName); outputTypes["counttable"].push_back(outputFileName); + out << "Representative_Sequence\ttotal\t"; + + GroupMap* groupMap; if (groupfile != "") { groupMap = new GroupMap(groupfile); groupMap->readMap(); @@ -199,7 +237,7 @@ int CountSeqsCommand::execute(){ if (m->control_pressed) { break; } string firstCol, secondCol; - in >> firstCol >> secondCol; m->gobble(in); + in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in); vector names; m->splitAtChar(secondCol, names, ','); @@ -240,24 +278,241 @@ int CountSeqsCommand::execute(){ total += names.size(); } in.close(); + out.close(); if (groupfile != "") { delete groupMap; } + + return total; + } + catch(exception& e) { + m->errorOut(e, "CountSeqsCommand", "processSmall"); + exit(1); + } +} +//********************************************************************************************************************** + +int CountSeqsCommand::processLarge(string outputFileName){ + try { + set namesOfGroups; + map initial; + for (set::iterator it = namesOfGroups.begin(); it != namesOfGroups.end(); it++) { initial[(*it)] = 0; } + ofstream out; + m->openOutputFile(outputFileName, out); + outputNames.push_back(outputFileName); outputTypes["counttable"].push_back(outputFileName); + out << "Representative_Sequence\ttotal\t"; + if (groupfile == "") { out << endl; } + + map namesToIndex; + string outfile = m->getRootName(groupfile) + "sorted.groups.temp"; + string outName = m->getRootName(namefile) + "sorted.name.temp"; + map indexToName; + map indexToGroup; + if (groupfile != "") { + time_t estart = time(NULL); + //convert name file to redundant -> unique. set unique name equal to index so we can use vectors, save name for later. + string newNameFile = m->getRootName(namefile) + ".name.temp"; + string newGroupFile = m->getRootName(groupfile) + ".group.temp"; + indexToName = processNameFile(newNameFile); + indexToGroup = getGroupNames(newGroupFile, namesOfGroups); + + //sort file by first column so the names of sequences will be easier to find + //use the unix sort + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + string command = "sort -n " + newGroupFile + " -o " + outfile; + system(command.c_str()); + command = "sort -n " + newNameFile + " -o " + outName; + system(command.c_str()); + #else //sort using windows sort + string command = "sort " + newGroupFile + " /O " + outfile; + system(command.c_str()); + command = "sort " + newNameFile + " /O " + outName; + system(command.c_str()); + #endif + m->mothurRemove(newNameFile); + m->mothurRemove(newGroupFile); + + m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to sort and index the group and name files. "); m->mothurOutEndLine(); + }else { outName = namefile; } + + time_t estart = time(NULL); + //open input file + ifstream in; + m->openInputFile(outName, in); + + //open input file + ifstream in2; - if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; } + int total = 0; + vector< vector > nameMapCount; + if (groupfile != "") { + m->openInputFile(outfile, in2); + nameMapCount.resize(indexToName.size()); + for (int i = 0; i < nameMapCount.size(); i++) { + nameMapCount[i].resize(indexToGroup.size(), 0); + } + } + + while (!in.eof()) { + if (m->control_pressed) { break; } + + string firstCol; + in >> firstCol; m->gobble(in); + + if (groupfile != "") { + int uniqueIndex; + in >> uniqueIndex; m->gobble(in); + + string name; int groupIndex; + in2 >> name >> groupIndex; m->gobble(in2); + + if (name != firstCol) { m->mothurOut("[ERROR]: found " + name + " in your groupfile, but " + firstCol + " was in your namefile, please correct.\n"); m->control_pressed = true; } + + nameMapCount[uniqueIndex][groupIndex]++; + total++; + }else { + string secondCol; + in >> secondCol; m->gobble(in); + int num = m->getNumNames(secondCol); + out << firstCol << '\t' << num << endl; + total += num; + } + } + in.close(); + + if (groupfile != "") { + m->mothurRemove(outfile); + m->mothurRemove(outName); + in2.close(); + for (map::iterator it = indexToGroup.begin(); it != indexToGroup.end(); it++) { out << it->second << '\t'; } + out << endl; + for (int i = 0; i < nameMapCount.size(); i++) { + string totalsLine = ""; + int seqTotal = 0; + for (int j = 0; j < nameMapCount[i].size(); j++) { + seqTotal += nameMapCount[i][j]; + totalsLine += toString(nameMapCount[i][j]) + '\t'; + } + out << indexToName[i] << '\t' << seqTotal << '\t' << totalsLine << endl; + } + } + + out.close(); + + m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to create the count table file. "); m->mothurOutEndLine(); + + return total; + } + catch(exception& e) { + m->errorOut(e, "CountSeqsCommand", "processLarge"); + exit(1); + } +} +/**************************************************************************************************/ +map CountSeqsCommand::processNameFile(string name) { + try { + map indexToNames; + + ofstream out; + m->openOutputFile(name, out); + + //open input file + ifstream in; + m->openInputFile(namefile, in); + + string rest = ""; + char buffer[4096]; + bool pairDone = false; + bool columnOne = true; + string firstCol, secondCol; + int count = 0; + + while (!in.eof()) { + if (m->control_pressed) { break; } + + in.read(buffer, 4096); + vector pieces = m->splitWhiteSpace(rest, buffer, in.gcount()); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + //parse names into vector + vector theseNames; + m->splitAtComma(secondCol, theseNames); + for (int i = 0; i < theseNames.size(); i++) { out << theseNames[i] << '\t' << count << endl; } + indexToNames[count] = firstCol; + pairDone = false; + count++; + } + } + } + in.close(); + out.close(); - m->mothurOutEndLine(); - m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine(); - m->mothurOutEndLine(); - m->mothurOut("Output File Name: "); m->mothurOutEndLine(); - m->mothurOut(outputFileName); m->mothurOutEndLine(); - m->mothurOutEndLine(); + return indexToNames; + } + catch(exception& e) { + m->errorOut(e, "CountSeqsCommand", "processNameFile"); + exit(1); + } +} +/**************************************************************************************************/ +map CountSeqsCommand::getGroupNames(string filename, set& namesOfGroups) { + try { + map indexToGroups; + map groupIndex; + map::iterator it; + + ofstream out; + m->openOutputFile(filename, out); + + //open input file + ifstream in; + m->openInputFile(groupfile, in); + + string rest = ""; + char buffer[4096]; + bool pairDone = false; + bool columnOne = true; + string firstCol, secondCol; + int count = 0; + + while (!in.eof()) { + if (m->control_pressed) { break; } + + in.read(buffer, 4096); + vector pieces = m->splitWhiteSpace(rest, buffer, in.gcount()); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + it = groupIndex.find(secondCol); + if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above + groupIndex[secondCol] = count; + count++; + } + out << firstCol << '\t' << groupIndex[secondCol] << endl; + namesOfGroups.insert(secondCol); + pairDone = false; + } + } + } + in.close(); + out.close(); - return 0; + for (it = groupIndex.begin(); it != groupIndex.end(); it++) { indexToGroups[it->second] = it->first; } + + return indexToGroups; } - catch(exception& e) { - m->errorOut(e, "CountSeqsCommand", "execute"); + m->errorOut(e, "CountSeqsCommand", "getGroupNames"); exit(1); } } //********************************************************************************************************************** + + + diff --git a/countseqscommand.h b/countseqscommand.h index 54982c1..555d6a7 100644 --- a/countseqscommand.h +++ b/countseqscommand.h @@ -34,8 +34,14 @@ public: private: string namefile, groupfile, outputDir, groups; - bool abort; - vector Groups; + bool abort, large; + vector Groups, outputNames; + + int processSmall(string); + int processLarge(string); + map processNameFile(string); + map getGroupNames(string, set&); + }; #endif diff --git a/getcurrentcommand.cpp b/getcurrentcommand.cpp index ca83231..c3b5f0a 100644 --- a/getcurrentcommand.cpp +++ b/getcurrentcommand.cpp @@ -140,6 +140,8 @@ int GetCurrentCommand::execute(){ m->setFlowFile(""); }else if (types[i] == "biom") { m->setBiomFile(""); + }else if (types[i] == "counttable") { + m->setCountTableFile(""); }else if (types[i] == "processors") { m->setProcessors("1"); }else if (types[i] == "all") { diff --git a/mothurout.cpp b/mothurout.cpp index 64f0bc8..14840c1 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -43,6 +43,7 @@ set MothurOut::getCurrentTypes() { types.insert("tree"); types.insert("flow"); types.insert("biom"); + types.insert("counttable"); types.insert("processors"); return types; @@ -78,6 +79,7 @@ void MothurOut::printCurrentFiles() { if (treefile != "") { mothurOut("tree=" + treefile); mothurOutEndLine(); } if (flowfile != "") { mothurOut("flow=" + flowfile); mothurOutEndLine(); } if (biomfile != "") { mothurOut("biom=" + biomfile); mothurOutEndLine(); } + if (counttablefile != "") { mothurOut("counttable=" + counttablefile); mothurOutEndLine(); } if (processors != "1") { mothurOut("processors=" + processors); mothurOutEndLine(); } } @@ -112,6 +114,7 @@ bool MothurOut::hasCurrentFiles() { if (treefile != "") { return true; } if (flowfile != "") { return true; } if (biomfile != "") { return true; } + if (counttablefile != "") { return true; } if (processors != "1") { return true; } return hasCurrent; @@ -147,6 +150,7 @@ void MothurOut::clearCurrentFiles() { taxonomyfile = ""; flowfile = ""; biomfile = ""; + counttablefile = ""; processors = "1"; } catch(exception& e) { @@ -1291,16 +1295,6 @@ vector MothurOut::setFilePosEachLine(string filename, int& n positions.push_back(0); while(!in.eof()){ - //unsigned long long lastpos = in.tellg(); - //input = getline(in); - //if (input.length() != 0) { - //unsigned long long pos = in.tellg(); - //if (pos != -1) { positions.push_back(pos - input.length() - 1); } - //else { positions.push_back(lastpos); } - //} - //gobble(in); //has to be here since windows line endings are 2 characters and mess up the positions - - //getline counting reads char d = in.get(); count++; while ((d != '\n') && (d != '\r') && (d != '\f') && (d != in.eof())) { @@ -1503,7 +1497,7 @@ vector MothurOut::splitWhiteSpace(string& rest, char buffer[], int size) for (int i = 0; i < size; i++) { if (!isspace(buffer[i])) { rest += buffer[i]; } else { - pieces.push_back(rest); rest = ""; + if (rest != "") { pieces.push_back(rest); rest = ""; } while (i < size) { //gobble white space if (isspace(buffer[i])) { i++; } else { rest = buffer[i]; break; } //cout << "next piece buffer = " << nextPiece << endl; @@ -1527,7 +1521,7 @@ vector MothurOut::splitWhiteSpace(string input){ for (int i = 0; i < input.length(); i++) { if (!isspace(input[i])) { rest += input[i]; } else { - pieces.push_back(rest); rest = ""; + if (rest != "") { pieces.push_back(rest); rest = ""; } while (i < input.length()) { //gobble white space if (isspace(input[i])) { i++; } else { rest = input[i]; break; } //cout << "next piece buffer = " << nextPiece << endl; diff --git a/mothurout.h b/mothurout.h index 968ff97..98565dc 100644 --- a/mothurout.h +++ b/mothurout.h @@ -174,6 +174,7 @@ class MothurOut { string getTaxonomyFile() { return taxonomyfile; } string getFlowFile() { return flowfile; } string getBiomFile() { return biomfile; } + string getCountTableFile() { return counttablefile; } string getProcessors() { return processors; } void setListFile(string f) { listfile = getFullPathName(f); } @@ -197,6 +198,7 @@ class MothurOut { void setTaxonomyFile(string f) { taxonomyfile = getFullPathName(f); } void setFlowFile(string f) { flowfile = getFullPathName(f); } void setBiomFile(string f) { biomfile = getFullPathName(f); } + void setCountTableFile(string f) { counttablefile = getFullPathName(f); } void setProcessors(string p) { processors = p; mothurOut("\nUsing " + toString(p) + " processors.\n"); } void printCurrentFiles(); @@ -232,6 +234,7 @@ class MothurOut { processors = "1"; flowfile = ""; biomfile = ""; + counttablefile = ""; gui = false; printedHeaders = false; commandInputsConvertError = false; @@ -246,7 +249,7 @@ class MothurOut { string releaseDate, version; string accnosfile, phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, designfile, taxonomyfile, biomfile; - string orderfile, treefile, sharedfile, ordergroupfile, relabundfile, fastafile, qualfile, sfffile, oligosfile, processors, flowfile; + string orderfile, treefile, sharedfile, ordergroupfile, relabundfile, fastafile, qualfile, sfffile, oligosfile, processors, flowfile, counttablefile; vector Groups; vector namesOfGroups; diff --git a/optionparser.cpp b/optionparser.cpp index 2d13cd0..e3850e5 100644 --- a/optionparser.cpp +++ b/optionparser.cpp @@ -124,6 +124,8 @@ map OptionParser::getParameters() { it->second = m->getTaxonomyFile(); }else if (it->first == "biom") { it->second = m->getBiomFile(); + }else if (it->first == "counttable") { + it->second = m->getCountTableFile(); }else { m->mothurOut("[ERROR]: mothur does not save a current file for " + it->first); m->mothurOutEndLine(); } diff --git a/rarefactsharedcommand.cpp b/rarefactsharedcommand.cpp index a312df0..6451041 100644 --- a/rarefactsharedcommand.cpp +++ b/rarefactsharedcommand.cpp @@ -11,6 +11,7 @@ #include "sharedsobs.h" #include "sharednseqs.h" #include "sharedutilities.h" +#include "subsample.h" //********************************************************************************************************************** vector RareFactSharedCommand::setParameters(){ @@ -21,6 +22,8 @@ vector RareFactSharedCommand::setParameters(){ CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter pcalc("calc", "Multiple", "sharednseqs-sharedobserved", "sharedobserved", "", "", "",true,false); parameters.push_back(pcalc); + CommandParameter psubsampleiters("subsampleiters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(psubsampleiters); + CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample); CommandParameter pjumble("jumble", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pjumble); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter psets("sets", "String", "", "", "", "", "",false,false); parameters.push_back(psets); @@ -50,6 +53,8 @@ string RareFactSharedCommand::getHelpString(){ helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n"; helpString += "Example rarefaction.shared(label=unique-0.01-0.03, iters=10000, groups=B-C, jumble=T, calc=sharedobserved).\n"; helpString += "The default values for iters is 1000, freq is 100, and calc is sharedobserved which calculates the shared rarefaction curve for the observed richness.\n"; + helpString += "The subsampleiters parameter allows you to choose the number of times you would like to run the subsample.\n"; + helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group.\n"; helpString += "The default value for groups is all the groups in your groupfile, and jumble is true.\n"; helpString += validCalculator.printCalc("sharedrarefaction"); helpString += "The label parameter is used to analyze specific labels in your input.\n"; @@ -219,6 +224,18 @@ RareFactSharedCommand::RareFactSharedCommand(string option) { temp = validParameter.validFile(parameters, "groupmode", false); if (temp == "not found") { temp = "T"; } groupMode = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "subsampleiters", false); if (temp == "not found") { temp = "1000"; } + m->mothurConvert(temp, iters); + + temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; } + if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; } + else { + if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later + else { subsample = false; } + } + + if (subsample == false) { iters = 1; } } @@ -301,7 +318,35 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ } } } - + + /******************************************************/ + if (subsample) { + if (subsampleSize == -1) { //user has not set size, set size = smallest samples size + subsampleSize = subset[0]->getNumSeqs(); + for (int i = 1; i < subset.size(); i++) { + int thisSize = subset[i]->getNumSeqs(); + + if (thisSize < subsampleSize) { subsampleSize = thisSize; } + } + }else { + newGroups.clear(); + vector temp; + for (int i = 0; i < subset.size(); i++) { + if (subset[i]->getNumSeqs() < subsampleSize) { + m->mothurOut(subset[i]->getGroup() + " contains " + toString(subset[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine(); + delete subset[i]; + }else { + newGroups.push_back(subset[i]->getGroup()); + temp.push_back(subset[i]); + } + } + subset = temp; + } + + if (subset.size() < 2) { m->mothurOut("You have not provided enough valid groups. I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; return 0; } + } + /******************************************************/ + ValidCalculators validCalculator; for (int i=0; igetSharedCurve(freq, nIters); delete rCurve; + if (subsample) { subsampleLookup(subset, fileNameRoot); } + processedLabels.insert(subset[0]->getLabel()); userLabels.erase(subset[0]->getLabel()); } @@ -372,6 +419,8 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ rCurve->getSharedCurve(freq, nIters); delete rCurve; + if (subsample) { subsampleLookup(subset, fileNameRoot); } + processedLabels.insert(subset[0]->getLabel()); userLabels.erase(subset[0]->getLabel()); @@ -444,6 +493,9 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ rCurve = new Rarefact(subset, rDisplays); rCurve->getSharedCurve(freq, nIters); delete rCurve; + + if (subsample) { subsampleLookup(subset, fileNameRoot); } + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } } @@ -458,6 +510,158 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ } } //********************************************************************************************************************** +int RareFactSharedCommand::subsampleLookup(vector& thisLookup, string fileNameRoot) { + try { + + map > filenames; + for (int thisIter = 0; thisIter < iters; thisIter++) { + + vector thisItersLookup = thisLookup; + + //we want the summary results for the whole dataset, then the subsampling + SubSample sample; + vector tempLabels; //dont need since we arent printing the sampled sharedRabunds + + //make copy of lookup so we don't get access violations + vector newLookup; + for (int k = 0; k < thisItersLookup.size(); k++) { + SharedRAbundVector* temp = new SharedRAbundVector(); + temp->setLabel(thisItersLookup[k]->getLabel()); + temp->setGroup(thisItersLookup[k]->getGroup()); + newLookup.push_back(temp); + } + + //for each bin + for (int k = 0; k < thisItersLookup[0]->getNumBins(); k++) { + if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; } + for (int j = 0; j < thisItersLookup.size(); j++) { newLookup[j]->push_back(thisItersLookup[j]->getAbundance(k), thisItersLookup[j]->getGroup()); } + } + + tempLabels = sample.getSample(newLookup, subsampleSize); + thisItersLookup = newLookup; + + + Rarefact* rCurve; + vector rDisplays; + + string thisfileNameRoot = fileNameRoot + toString(thisIter); + + ValidCalculators validCalculator; + for (int i=0; igetSharedCurve(freq, nIters); + delete rCurve; + + //clean up memory + for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; } + thisItersLookup.clear(); + for(int i=0;i > > results; //iter -> numSampled -> data + for (map >::iterator it = filenames.begin(); it != filenames.end(); it++) { + vector thisTypesFiles = it->second; + vector columnHeaders; + for (int i = 0; i < thisTypesFiles.size(); i++) { + ifstream in; + m->openInputFile(thisTypesFiles[i], in); + + string headers = m->getline(in); m->gobble(in); + columnHeaders = m->splitWhiteSpace(headers); + int numCols = columnHeaders.size(); + + vector > thisFilesLines; + while (!in.eof()) { + if (m->control_pressed) { break; } + vector data; data.resize(numCols, 0); + //read numSampled line + for (int j = 0; j < numCols; j++) { in >> data[j]; m->gobble(in); } + thisFilesLines.push_back(data); + } + in.close(); + results.push_back(thisFilesLines); + m->mothurRemove(thisTypesFiles[i]); + } + + if (!m->control_pressed) { + //process results + string outputFile = fileNameRoot + "ave-std." + thisLookup[0]->getLabel() + "." + getOutputFileNameTag(it->first); + ofstream out; + m->openOutputFile(outputFile, out); + outputNames.push_back(outputFile); outputTypes[it->first].push_back(outputFile); + + out << columnHeaders[0] << '\t' << "method\t"; + for (int i = 1; i < columnHeaders.size(); i++) { out << columnHeaders[i] << '\t'; } + out << endl; + + vector< vector > aveResults; aveResults.resize(results[0].size()); + for (int i = 0; i < aveResults.size(); i++) { aveResults[i].resize(results[0][i].size(), 0.0); } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator + for (int i = 0; i < aveResults.size(); i++) { //initialize sums to zero. + aveResults[i][0] = results[thisIter][i][0]; + for (int j = 1; j < aveResults[i].size(); j++) { + aveResults[i][j] += results[thisIter][i][j]; + } + } + } + + for (int i = 0; i < aveResults.size(); i++) { //finds average. + for (int j = 1; j < aveResults[i].size(); j++) { + aveResults[i][j] /= (float) iters; + } + } + + //standard deviation + vector< vector > stdResults; stdResults.resize(results[0].size()); + for (int i = 0; i < stdResults.size(); i++) { stdResults[i].resize(results[0][i].size(), 0.0); } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each + for (int i = 0; i < stdResults.size(); i++) { + stdResults[i][0] = aveResults[i][0]; + for (int j = 1; j < stdResults[i].size(); j++) { + stdResults[i][j] += ((results[thisIter][i][j] - aveResults[i][j]) * (results[thisIter][i][j] - aveResults[i][j])); + } + } + } + + for (int i = 0; i < stdResults.size(); i++) { //finds average. + out << aveResults[i][0] << '\t' << "ave\t"; + for (int j = 1; j < aveResults[i].size(); j++) { out << aveResults[i][j] << '\t'; } + out << endl; + out << stdResults[i][0] << '\t' << "std\t"; + for (int j = 1; j < stdResults[i].size(); j++) { + stdResults[i][j] /= (float) iters; + stdResults[i][j] = sqrt(stdResults[i][j]); + out << stdResults[i][j] << '\t'; + } + out << endl; + } + out.close(); + } + } + + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RareFactSharedCommand", "subsample"); + exit(1); + } +} +//********************************************************************************************************************** vector RareFactSharedCommand::createGroupFile(vector& outputNames) { try { diff --git a/rarefactsharedcommand.h b/rarefactsharedcommand.h index a5b4546..af73e13 100644 --- a/rarefactsharedcommand.h +++ b/rarefactsharedcommand.h @@ -37,18 +37,19 @@ public: private: vector lookup; - int nIters; + int nIters, subsampleSize, iters; string format; float freq; map file2Group; //index in outputNames[i] -> group - bool abort, allLines, jumble, groupMode; + bool abort, allLines, jumble, groupMode, subsample; set labels; //holds labels to be used string label, calc, groups, outputDir, sharedfile, designfile; vector Estimators, Groups, outputNames, Sets; int process(GroupMap&, string); vector createGroupFile(vector&); + int subsampleLookup(vector&, string); };