X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=rarefactsharedcommand.cpp;h=64b40aad41b9fb520892cab6969c98fa0a83a80d;hp=726ddd61b75c6185cca6b4ebac108b7dd1b4d44f;hb=df7e3ff9f68ef157b0328a2d353c3258c5d45d89;hpb=05c52893c6c2467381fe7e7b769d86b6209af2e1 diff --git a/rarefactsharedcommand.cpp b/rarefactsharedcommand.cpp index 726ddd6..64b40aa 100644 --- a/rarefactsharedcommand.cpp +++ b/rarefactsharedcommand.cpp @@ -11,21 +11,25 @@ #include "sharedsobs.h" #include "sharednseqs.h" #include "sharedutilities.h" +#include "subsample.h" //********************************************************************************************************************** vector RareFactSharedCommand::setParameters(){ try { - CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared); - CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pdesign); - CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); - CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq); - CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); - CommandParameter pcalc("calc", "Multiple", "sharednseqs-sharedobserved", "sharedobserved", "", "", "",true,false); parameters.push_back(pcalc); - CommandParameter pjumble("jumble", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pjumble); - CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); - CommandParameter psets("sets", "String", "", "", "", "", "",false,false); parameters.push_back(psets); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pshared); + CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pdesign); + CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); + CommandParameter pfreq("freq", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pfreq); + CommandParameter piters("iters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(piters); + CommandParameter pcalc("calc", "Multiple", "sharednseqs-sharedobserved", "sharedobserved", "", "", "","",true,false,true); parameters.push_back(pcalc); + CommandParameter psubsampleiters("subsampleiters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(psubsampleiters); + CommandParameter psubsample("subsample", "String", "", "", "", "", "","",false,false); parameters.push_back(psubsample); + CommandParameter pjumble("jumble", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pjumble); + CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups); + CommandParameter psets("sets", "String", "", "", "", "", "","",false,false); parameters.push_back(psets); + CommandParameter pgroupmode("groupmode", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pgroupmode); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -41,8 +45,7 @@ string RareFactSharedCommand::getHelpString(){ try { string helpString = ""; ValidCalculators validCalculator; - helpString += "The collect.shared command parameters are shared, label, freq, calc and groups. shared is required if there is no current sharedfile. \n"; - helpString += "The rarefaction.shared command parameters are shared, design, label, iters, groups, sets, jumble and calc. shared is required if there is no current sharedfile. \n"; + helpString += "The rarefaction.shared command parameters are shared, design, label, iters, groups, sets, jumble, groupmode and calc. shared is required if there is no current sharedfile. \n"; helpString += "The design parameter allows you to assign your groups to sets. If provided mothur will run rarefaction.shared on a per set basis. \n"; helpString += "The sets parameter allows you to specify which of the sets in your designfile you would like to analyze. The set names are separated by dashes. THe default is all sets in the designfile.\n"; helpString += "The rarefaction command should be in the following format: \n"; @@ -50,6 +53,8 @@ string RareFactSharedCommand::getHelpString(){ helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n"; helpString += "Example rarefaction.shared(label=unique-0.01-0.03, iters=10000, groups=B-C, jumble=T, calc=sharedobserved).\n"; helpString += "The default values for iters is 1000, freq is 100, and calc is sharedobserved which calculates the shared rarefaction curve for the observed richness.\n"; + helpString += "The subsampleiters parameter allows you to choose the number of times you would like to run the subsample.\n"; + helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group.\n"; helpString += "The default value for groups is all the groups in your groupfile, and jumble is true.\n"; helpString += validCalculator.printCalc("sharedrarefaction"); helpString += "The label parameter is used to analyze specific labels in your input.\n"; @@ -62,7 +67,22 @@ string RareFactSharedCommand::getHelpString(){ exit(1); } } - +//********************************************************************************************************************** +string RareFactSharedCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "sharedrarefaction") { pattern = "[filename],shared.rarefaction"; } + else if (type == "sharedr_nseqs") { pattern = "[filename],shared.r_nseqs"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "RareFactSharedCommand", "getOutputPattern"); + exit(1); + } +} //********************************************************************************************************************** RareFactSharedCommand::RareFactSharedCommand(){ try { @@ -196,6 +216,21 @@ RareFactSharedCommand::RareFactSharedCommand(string option) { if (m->isTrue(temp)) { jumble = true; } else { jumble = false; } m->jumble = jumble; + + temp = validParameter.validFile(parameters, "groupmode", false); if (temp == "not found") { temp = "T"; } + groupMode = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "subsampleiters", false); if (temp == "not found") { temp = "1000"; } + m->mothurConvert(temp, iters); + + temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; } + if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; } + else { + if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later + else { subsample = false; } + } + + if (subsample == false) { iters = 1; } } @@ -226,6 +261,8 @@ int RareFactSharedCommand::execute(){ for (int i = 0; i < Sets.size(); i++) { process(designMap, Sets[i]); } + + if (groupMode) { outputNames = createGroupFile(outputNames); } } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -276,18 +313,49 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ } } } - + + /******************************************************/ + if (subsample) { + if (subsampleSize == -1) { //user has not set size, set size = smallest samples size + subsampleSize = subset[0]->getNumSeqs(); + for (int i = 1; i < subset.size(); i++) { + int thisSize = subset[i]->getNumSeqs(); + + if (thisSize < subsampleSize) { subsampleSize = thisSize; } + } + }else { + newGroups.clear(); + vector temp; + for (int i = 0; i < subset.size(); i++) { + if (subset[i]->getNumSeqs() < subsampleSize) { + m->mothurOut(subset[i]->getGroup() + " contains " + toString(subset[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine(); + delete subset[i]; + }else { + newGroups.push_back(subset[i]->getGroup()); + temp.push_back(subset[i]); + } + } + subset = temp; + } + + if (subset.size() < 2) { m->mothurOut("You have not provided enough valid groups. I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; return 0; } + } + /******************************************************/ + + map variables; + variables["[filename]"] = fileNameRoot; ValidCalculators validCalculator; for (int i=0; igetSharedCurve(freq, nIters); delete rCurve; + if (subsample) { subsampleLookup(subset, fileNameRoot); } + processedLabels.insert(subset[0]->getLabel()); userLabels.erase(subset[0]->getLabel()); } @@ -346,6 +416,8 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ rCurve->getSharedCurve(freq, nIters); delete rCurve; + if (subsample) { subsampleLookup(subset, fileNameRoot); } + processedLabels.insert(subset[0]->getLabel()); userLabels.erase(subset[0]->getLabel()); @@ -418,6 +490,9 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ rCurve = new Rarefact(subset, rDisplays); rCurve->getSharedCurve(freq, nIters); delete rCurve; + + if (subsample) { subsampleLookup(subset, fileNameRoot); } + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } } @@ -432,3 +507,318 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ } } //********************************************************************************************************************** +int RareFactSharedCommand::subsampleLookup(vector& thisLookup, string fileNameRoot) { + try { + + map > filenames; + for (int thisIter = 0; thisIter < iters; thisIter++) { + + vector thisItersLookup = thisLookup; + + //we want the summary results for the whole dataset, then the subsampling + SubSample sample; + vector tempLabels; //dont need since we arent printing the sampled sharedRabunds + + //make copy of lookup so we don't get access violations + vector newLookup; + for (int k = 0; k < thisItersLookup.size(); k++) { + SharedRAbundVector* temp = new SharedRAbundVector(); + temp->setLabel(thisItersLookup[k]->getLabel()); + temp->setGroup(thisItersLookup[k]->getGroup()); + newLookup.push_back(temp); + } + + //for each bin + for (int k = 0; k < thisItersLookup[0]->getNumBins(); k++) { + if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; } + for (int j = 0; j < thisItersLookup.size(); j++) { newLookup[j]->push_back(thisItersLookup[j]->getAbundance(k), thisItersLookup[j]->getGroup()); } + } + + tempLabels = sample.getSample(newLookup, subsampleSize); + thisItersLookup = newLookup; + + + Rarefact* rCurve; + vector rDisplays; + + string thisfileNameRoot = fileNameRoot + toString(thisIter); + + map variables; + variables["[filename]"] = thisfileNameRoot; + + ValidCalculators validCalculator; + for (int i=0; igetSharedCurve(freq, nIters); + delete rCurve; + + //clean up memory + for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; } + thisItersLookup.clear(); + for(int i=0;i > > results; //iter -> numSampled -> data + for (map >::iterator it = filenames.begin(); it != filenames.end(); it++) { + vector thisTypesFiles = it->second; + vector columnHeaders; + for (int i = 0; i < thisTypesFiles.size(); i++) { + ifstream in; + m->openInputFile(thisTypesFiles[i], in); + + string headers = m->getline(in); m->gobble(in); + columnHeaders = m->splitWhiteSpace(headers); + int numCols = columnHeaders.size(); + + vector > thisFilesLines; + while (!in.eof()) { + if (m->control_pressed) { break; } + vector data; data.resize(numCols, 0); + //read numSampled line + for (int j = 0; j < numCols; j++) { in >> data[j]; m->gobble(in); } + thisFilesLines.push_back(data); + } + in.close(); + results.push_back(thisFilesLines); + m->mothurRemove(thisTypesFiles[i]); + } + + if (!m->control_pressed) { + //process results + map variables; variables["[filename]"] = fileNameRoot + "ave-std." + thisLookup[0]->getLabel() + "."; + + string outputFile = getOutputFileName(it->first,variables); + ofstream out; + m->openOutputFile(outputFile, out); + outputNames.push_back(outputFile); outputTypes[it->first].push_back(outputFile); + + out << columnHeaders[0] << '\t' << "method\t"; + for (int i = 1; i < columnHeaders.size(); i++) { out << columnHeaders[i] << '\t'; } + out << endl; + + vector< vector > aveResults; aveResults.resize(results[0].size()); + for (int i = 0; i < aveResults.size(); i++) { aveResults[i].resize(results[0][i].size(), 0.0); } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator + for (int i = 0; i < aveResults.size(); i++) { //initialize sums to zero. + aveResults[i][0] = results[thisIter][i][0]; + for (int j = 1; j < aveResults[i].size(); j++) { + aveResults[i][j] += results[thisIter][i][j]; + } + } + } + + for (int i = 0; i < aveResults.size(); i++) { //finds average. + for (int j = 1; j < aveResults[i].size(); j++) { + aveResults[i][j] /= (float) iters; + } + } + + //standard deviation + vector< vector > stdResults; stdResults.resize(results[0].size()); + for (int i = 0; i < stdResults.size(); i++) { stdResults[i].resize(results[0][i].size(), 0.0); } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each + for (int i = 0; i < stdResults.size(); i++) { + stdResults[i][0] = aveResults[i][0]; + for (int j = 1; j < stdResults[i].size(); j++) { + stdResults[i][j] += ((results[thisIter][i][j] - aveResults[i][j]) * (results[thisIter][i][j] - aveResults[i][j])); + } + } + } + + for (int i = 0; i < stdResults.size(); i++) { //finds average. + out << aveResults[i][0] << '\t' << "ave\t"; + for (int j = 1; j < aveResults[i].size(); j++) { out << aveResults[i][j] << '\t'; } + out << endl; + out << stdResults[i][0] << '\t' << "std\t"; + for (int j = 1; j < stdResults[i].size(); j++) { + stdResults[i][j] /= (float) iters; + stdResults[i][j] = sqrt(stdResults[i][j]); + out << stdResults[i][j] << '\t'; + } + out << endl; + } + out.close(); + } + } + + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RareFactSharedCommand", "subsample"); + exit(1); + } +} +//********************************************************************************************************************** +vector RareFactSharedCommand::createGroupFile(vector& outputNames) { + try { + + vector newFileNames; + + //find different types of files + map > typesFiles; + map > > fileLabels; //combofile name to labels. each label is a vector because it may be unique lci hci. + vector groupNames; + for (int i = 0; i < outputNames.size(); i++) { + + string extension = m->getExtension(outputNames[i]); + string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension; + m->mothurRemove(combineFileName); //remove old file + + ifstream in; + m->openInputFile(outputNames[i], in); + + string labels = m->getline(in); + + istringstream iss (labels,istringstream::in); + string newLabel = ""; vector theseLabels; + while(!iss.eof()) { iss >> newLabel; m->gobble(iss); theseLabels.push_back(newLabel); } + vector< vector > allLabels; + vector thisSet; thisSet.push_back(theseLabels[0]); allLabels.push_back(thisSet); thisSet.clear(); //makes "numSampled" its own grouping + for (int j = 1; j < theseLabels.size()-1; j++) { + if (theseLabels[j+1] == "lci") { + thisSet.push_back(theseLabels[j]); + thisSet.push_back(theseLabels[j+1]); + thisSet.push_back(theseLabels[j+2]); + j++; j++; + }else{ //no lci or hci for this calc. + thisSet.push_back(theseLabels[j]); + } + allLabels.push_back(thisSet); + thisSet.clear(); + } + fileLabels[combineFileName] = allLabels; + + map >::iterator itfind = typesFiles.find(extension); + if (itfind != typesFiles.end()) { + (itfind->second)[outputNames[i]] = file2Group[i]; + }else { + map temp; + temp[outputNames[i]] = file2Group[i]; + typesFiles[extension] = temp; + } + if (!(m->inUsersGroups(file2Group[i], groupNames))) { groupNames.push_back(file2Group[i]); } + } + + //for each type create a combo file + + for (map >::iterator it = typesFiles.begin(); it != typesFiles.end(); it++) { + + ofstream out; + string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + it->first; + m->openOutputFileAppend(combineFileName, out); + newFileNames.push_back(combineFileName); + map thisTypesFiles = it->second; //it->second maps filename to group + set numSampledSet; + + //open each type summary file + map > > > files; //maps file name to lines in file + int maxLines = 0; + for (map::iterator itFileNameGroup = thisTypesFiles.begin(); itFileNameGroup != thisTypesFiles.end(); itFileNameGroup++) { + + string thisfilename = itFileNameGroup->first; + string group = itFileNameGroup->second; + + ifstream temp; + m->openInputFile(thisfilename, temp); + + //read through first line - labels + m->getline(temp); m->gobble(temp); + + map > > thisFilesLines; + while (!temp.eof()){ + int numSampled = 0; + temp >> numSampled; m->gobble(temp); + + vector< vector > theseReads; + vector thisSet; thisSet.push_back(toString(numSampled)); theseReads.push_back(thisSet); thisSet.clear(); + for (int k = 1; k < fileLabels[combineFileName].size(); k++) { //output thing like 0.03-A lci-A hci-A + vector reads; + string next = ""; + for (int l = 0; l < fileLabels[combineFileName][k].size(); l++) { //output modified labels + temp >> next; m->gobble(temp); + reads.push_back(next); + } + theseReads.push_back(reads); + } + thisFilesLines[numSampled] = theseReads; + m->gobble(temp); + + numSampledSet.insert(numSampled); + } + + files[group] = thisFilesLines; + + //save longest file for below + if (maxLines < thisFilesLines.size()) { maxLines = thisFilesLines.size(); } + + temp.close(); + m->mothurRemove(thisfilename); + } + + //output new labels line + out << fileLabels[combineFileName][0][0] << '\t'; + for (int k = 1; k < fileLabels[combineFileName].size(); k++) { //output thing like 0.03-A lci-A hci-A + for (int n = 0; n < groupNames.size(); n++) { // for each group + for (int l = 0; l < fileLabels[combineFileName][k].size(); l++) { //output modified labels + out << fileLabels[combineFileName][k][l] << '-' << groupNames[n] << '\t'; + } + } + } + out << endl; + + //for each label + for (set::iterator itNumSampled = numSampledSet.begin(); itNumSampled != numSampledSet.end(); itNumSampled++) { + + out << (*itNumSampled) << '\t'; + + if (m->control_pressed) { break; } + + for (int k = 1; k < fileLabels[combineFileName].size(); k++) { //each chunk + //grab data for each group + for (map > > >::iterator itFileNameGroup = files.begin(); itFileNameGroup != files.end(); itFileNameGroup++) { + + string group = itFileNameGroup->first; + + map > >::iterator itLine = files[group].find(*itNumSampled); + if (itLine != files[group].end()) { + for (int l = 0; l < (itLine->second)[k].size(); l++) { + out << (itLine->second)[k][l] << '\t'; + + } + }else { + for (int l = 0; l < fileLabels[combineFileName][k].size(); l++) { + out << "NA" << '\t'; + } + } + } + } + out << endl; + } + out.close(); + } + + //return combine file name + return newFileNames; + + } + catch(exception& e) { + m->errorOut(e, "RareFactSharedCommand", "createGroupFile"); + exit(1); + } +} +//**********************************************************************************************************************