X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=summarycommand.cpp;h=43202e56cd2b86d7224a8b46b26526a511c6d55a;hp=83ea8c9b4b5ab077283454089b35d9de13908003;hb=df7e3ff9f68ef157b0328a2d353c3258c5d45d89;hpb=0caf3fbabaa3ece404f8ce77f4c883dc5b1bf1dc diff --git a/summarycommand.cpp b/summarycommand.cpp index 83ea8c9..43202e5 100644 --- a/summarycommand.cpp +++ b/summarycommand.cpp @@ -33,21 +33,24 @@ #include "boneh.h" #include "solow.h" #include "shen.h" +#include "subsample.h" //********************************************************************************************************************** vector SummaryCommand::setParameters(){ try { - CommandParameter plist("list", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(plist); - CommandParameter prabund("rabund", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(prabund); - CommandParameter psabund("sabund", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(psabund); - CommandParameter pshared("shared", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(pshared); - CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); - CommandParameter pcalc("calc", "Multiple", "sobs-chao-nseqs-coverage-ace-jack-shannon-shannoneven-npshannon-heip-smithwilson-simpson-simpsoneven-invsimpson-bootstrap-geometric-qstat-logseries-bergerparker-bstick-goodscoverage-efron-boneh-solow-shen", "sobs-chao-ace-jack-shannon-npshannon-simpson", "", "", "",true,false); parameters.push_back(pcalc); - CommandParameter pabund("abund", "Number", "", "10", "", "", "",false,false); parameters.push_back(pabund); - CommandParameter psize("size", "Number", "", "0", "", "", "",false,false); parameters.push_back(psize); - CommandParameter pgroupmode("groupmode", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pgroupmode); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter plist("list", "InputTypes", "", "", "LRSS", "LRSS", "none","summary",false,false,true); parameters.push_back(plist); + CommandParameter prabund("rabund", "InputTypes", "", "", "LRSS", "LRSS", "none","summary",false,false); parameters.push_back(prabund); + CommandParameter psabund("sabund", "InputTypes", "", "", "LRSS", "LRSS", "none","summary",false,false); parameters.push_back(psabund); + CommandParameter pshared("shared", "InputTypes", "", "", "LRSS", "LRSS", "none","summary",false,false,true); parameters.push_back(pshared); + CommandParameter psubsample("subsample", "String", "", "", "", "", "","",false,false); parameters.push_back(psubsample); + CommandParameter piters("iters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(piters); + CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); + CommandParameter pcalc("calc", "Multiple", "sobs-chao-nseqs-coverage-ace-jack-shannon-shannoneven-npshannon-heip-smithwilson-simpson-simpsoneven-invsimpson-bootstrap-geometric-qstat-logseries-bergerparker-bstick-goodscoverage-efron-boneh-solow-shen", "sobs-chao-ace-jack-shannon-npshannon-simpson", "", "", "","",true,false,true); parameters.push_back(pcalc); + CommandParameter pabund("abund", "Number", "", "10", "", "", "","",false,false); parameters.push_back(pabund); + CommandParameter psize("size", "Number", "", "0", "", "", "","",false,false); parameters.push_back(psize); + CommandParameter pgroupmode("groupmode", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pgroupmode); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -63,11 +66,13 @@ string SummaryCommand::getHelpString(){ try { string helpString = ""; ValidCalculators validCalculator; - helpString += "The summary.single command parameters are list, sabund, rabund, shared, label, calc, abund and groupmode. list, sabund, rabund or shared is required unless you have a valid current file.\n"; + helpString += "The summary.single command parameters are list, sabund, rabund, shared, subsample, iters, label, calc, abund and groupmode. list, sabund, rabund or shared is required unless you have a valid current file.\n"; helpString += "The summary.single command should be in the following format: \n"; helpString += "summary.single(label=yourLabel, calc=yourEstimators).\n"; helpString += "Example summary.single(label=unique-.01-.03, calc=sobs-chao-ace-jack-bootstrap-shannon-npshannon-simpson).\n"; helpString += validCalculator.printCalc("summary"); + helpString += "The subsample parameter allows you to enter the size of the sample or you can set subsample=T and mothur will use the size of your smallest group in the case of a shared file. With a list, sabund or rabund file you must provide a subsample size.\n"; + helpString += "The iters parameter allows you to choose the number of times you would like to run the subsample.\n"; helpString += "The default value calc is sobs-chao-ace-jack-shannon-npshannon-simpson\n"; helpString += "If you are running summary.single with a shared file and would like your summary results collated in one file, set groupmode=t. (Default=true).\n"; helpString += "The label parameter is used to analyze specific labels in your input.\n"; @@ -79,7 +84,21 @@ string SummaryCommand::getHelpString(){ exit(1); } } - +//********************************************************************************************************************** +string SummaryCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "summary") { pattern = "[filename],summary-[filename],[tag],summary"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "SummaryCommand", "getOutputPattern"); + exit(1); + } +} //********************************************************************************************************************** SummaryCommand::SummaryCommand(){ try { @@ -239,7 +258,22 @@ SummaryCommand::SummaryCommand(string option) { temp = validParameter.validFile(parameters, "groupmode", false); if (temp == "not found") { temp = "T"; } groupMode = m->isTrue(temp); - + temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; } + m->mothurConvert(temp, iters); + + temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; } + if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; } + else { + if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later + else { subsample = false; subsampleSize = -1; } + } + + if (subsample == false) { iters = 0; } + else { + //if you did not set a samplesize and are not using a sharedfile + if ((subsampleSize == -1) && (format != "sharedfile")) { m->mothurOut("[ERROR]: If you want to subsample with a list, rabund or sabund file, you must provide the sample size. You can do this by setting subsample=yourSampleSize.\n"); abort=true; } + } + } } catch(exception& e) { @@ -261,17 +295,23 @@ int SummaryCommand::execute(){ int numLines = 0; int numCols = 0; - + map groupIndex; + for (int p = 0; p < inputFileNames.size(); p++) { numLines = 0; numCols = 0; - string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])) + "summary"; - outputNames.push_back(fileNameRoot); outputTypes["summary"].push_back(fileNameRoot); - + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])); + string fileNameRoot = getOutputFileName("summary",variables); + variables["[tag]"] = "ave-std"; + string fileNameAve = getOutputFileName("summary",variables); + outputNames.push_back(fileNameRoot); outputTypes["summary"].push_back(fileNameRoot); + if (inputFileNames.size() > 1) { m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[p]); m->mothurOutEndLine(); m->mothurOutEndLine(); + groupIndex[fileNameRoot] = groups[p]; } sumCalculators.clear(); @@ -342,6 +382,17 @@ int SummaryCommand::execute(){ ofstream outputFileHandle; m->openOutputFile(fileNameRoot, outputFileHandle); outputFileHandle << "label"; + + ofstream outAve; + if (subsample) { + m->openOutputFile(fileNameAve, outAve); + outputNames.push_back(fileNameAve); outputTypes["summary"].push_back(fileNameAve); + outAve << "label\tmethod"; + outAve.setf(ios::fixed, ios::floatfield); outAve.setf(ios::showpoint); + if (inputFileNames.size() > 1) { + groupIndex[fileNameAve] = groups[p]; + } + } input = new InputData(inputFileNames[p], format); sabund = input->getSAbundVector(); @@ -350,24 +401,29 @@ int SummaryCommand::execute(){ for(int i=0;igetCols() == 1){ outputFileHandle << '\t' << sumCalculators[i]->getName(); + if (subsample) { outAve << '\t' << sumCalculators[i]->getName(); } numCols++; } else{ outputFileHandle << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; + if (subsample) { outAve << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; } numCols += 3; } } outputFileHandle << endl; + if (subsample) { outAve << endl; } //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. set processedLabels; set userLabels = labels; - if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;icontrol_pressed) { outputFileHandle.close(); outAve.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;icontrol_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;icontrol_pressed) { outputFileHandle.close(); outAve.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;igetLabel()) == 1){ @@ -375,16 +431,9 @@ int SummaryCommand::execute(){ processedLabels.insert(sabund->getLabel()); userLabels.erase(sabund->getLabel()); - outputFileHandle << sabund->getLabel(); - for(int i=0;i data = sumCalculators[i]->getValues(sabund); - - if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;iprint(outputFileHandle); - } - outputFileHandle << endl; + process(sabund, outputFileHandle, outAve); + + if (m->control_pressed) { outputFileHandle.close(); outAve.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;igetLabel()); userLabels.erase(sabund->getLabel()); - outputFileHandle << sabund->getLabel(); - for(int i=0;i data = sumCalculators[i]->getValues(sabund); - - if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;iprint(outputFileHandle); - } - outputFileHandle << endl; + process(sabund, outputFileHandle, outAve); + + if (m->control_pressed) { outputFileHandle.close(); outAve.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;igetSAbundVector(); } - if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;icontrol_pressed) { outputFileHandle.close(); outAve.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i::iterator it; @@ -441,21 +483,15 @@ int SummaryCommand::execute(){ sabund = input->getSAbundVector(lastLabel); m->mothurOut(sabund->getLabel()); m->mothurOutEndLine(); - outputFileHandle << sabund->getLabel(); - for(int i=0;i data = sumCalculators[i]->getValues(sabund); - - if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;iprint(outputFileHandle); - } - outputFileHandle << endl; + process(sabund, outputFileHandle, outAve); + + if (m->control_pressed) { outputFileHandle.close(); outAve.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;icontrol_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;icontrol_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } //create summary file containing all the groups data for each label - this function just combines the info from the files already created. - if ((sharedfile != "") && (groupMode)) { outputNames.push_back(createGroupSummaryFile(numLines, numCols, outputNames)); } + if ((sharedfile != "") && (groupMode)) { vector comboNames = createGroupSummaryFile(numLines, numCols, outputNames, groupIndex); for (int i = 0; i < comboNames.size(); i++) { outputNames.push_back(comboNames[i]); } } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -484,6 +520,109 @@ int SummaryCommand::execute(){ } } //********************************************************************************************************************** +int SummaryCommand::process(SAbundVector*& sabund, ofstream& outputFileHandle, ofstream& outAve) { + try { + + //calculator -> data -> values + vector< vector< vector > > results; results.resize(sumCalculators.size()); + + outputFileHandle << sabund->getLabel(); + + SubSample sample; + for (int thisIter = 0; thisIter < iters+1; thisIter++) { + + SAbundVector* thisIterSabund = sabund; + + //we want the summary results for the whole dataset, then the subsampling + if ((thisIter > 0) && subsample) { //subsample sabund and run it + //copy sabund since getSample destroys it + RAbundVector rabund = sabund->getRAbundVector(); + SAbundVector* newSabund = new SAbundVector(); + *newSabund = rabund.getSAbundVector(); + + sample.getSample(newSabund, subsampleSize); + thisIterSabund = newSabund; + } + + for(int i=0;i data = sumCalculators[i]->getValues(thisIterSabund); + + if (m->control_pressed) { return 0; } + + if (thisIter == 0) { + outputFileHandle << '\t'; + sumCalculators[i]->print(outputFileHandle); + }else { + //some of the calc have hci and lci need to make room for that + if (results[i].size() == 0) { results[i].resize(data.size()); } + //save results for ave and std. + for (int j = 0; j < data.size(); j++) { + if (m->control_pressed) { return 0; } + results[i][j].push_back(data[j]); + } + } + } + + //cleanup memory + if ((thisIter > 0) && subsample) { delete thisIterSabund; } + } + outputFileHandle << endl; + + if (subsample) { + outAve << sabund->getLabel() << '\t' << "ave\t"; + //find ave and std for this label and output + //will need to modify the createGroupSummary to combine results and not mess with the .summary file. + + //calcs -> values + vector< vector > calcAverages; calcAverages.resize(sumCalculators.size()); + for (int i = 0; i < calcAverages.size(); i++) { calcAverages[i].resize(results[i].size(), 0); } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator + for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero. + for (int j = 0; j < calcAverages[i].size(); j++) { + calcAverages[i][j] += results[i][j][thisIter]; + } + } + } + + for (int i = 0; i < calcAverages.size(); i++) { //finds average. + for (int j = 0; j < calcAverages[i].size(); j++) { + calcAverages[i][j] /= (float) iters; + outAve << calcAverages[i][j] << '\t'; + } + } + + //find standard deviation + vector< vector > stdDev; stdDev.resize(sumCalculators.size()); + for (int i = 0; i < stdDev.size(); i++) { stdDev[i].resize(results[i].size(), 0); } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each + for (int i = 0; i < stdDev.size(); i++) { + for (int j = 0; j < stdDev[i].size(); j++) { + stdDev[i][j] += ((results[i][j][thisIter] - calcAverages[i][j]) * (results[i][j][thisIter] - calcAverages[i][j])); + } + } + } + + outAve << endl << sabund->getLabel() << '\t' << "std\t"; + for (int i = 0; i < stdDev.size(); i++) { //finds average. + for (int j = 0; j < stdDev[i].size(); j++) { + stdDev[i][j] /= (float) iters; + stdDev[i][j] = sqrt(stdDev[i][j]); + outAve << stdDev[i][j] << '\t'; + } + } + outAve << endl; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SummaryCommand", "process"); + exit(1); + } +} +//********************************************************************************************************************** vector SummaryCommand::parseSharedFile(string filename) { try { vector filenames; @@ -496,12 +635,44 @@ vector SummaryCommand::parseSharedFile(string filename) { string sharedFileRoot = m->getRootName(filename); - //clears file before we start to write to it below + /******************************************************/ + if (subsample) { + if (subsampleSize == -1) { //user has not set size, set size = smallest samples size + subsampleSize = lookup[0]->getNumSeqs(); + for (int i = 1; i < lookup.size(); i++) { + int thisSize = lookup[i]->getNumSeqs(); + + if (thisSize < subsampleSize) { subsampleSize = thisSize; } + } + }else { + m->clearGroups(); + vector Groups; + vector temp; + for (int i = 0; i < lookup.size(); i++) { + if (lookup[i]->getNumSeqs() < subsampleSize) { + m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine(); + delete lookup[i]; + }else { + Groups.push_back(lookup[i]->getGroup()); + temp.push_back(lookup[i]); + } + } + lookup = temp; + m->setGroups(Groups); + } + + if (lookup.size() < 1) { m->mothurOut("You have not provided enough valid groups. I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; delete input; return filenames; } + } + + + /******************************************************/ + + //clears file before we start to write to it below for (int i=0; imothurRemove((sharedFileRoot + lookup[i]->getGroup() + ".rabund")); filenames.push_back((sharedFileRoot + lookup[i]->getGroup() + ".rabund")); } - + ofstream* temp; for (int i=0; i SummaryCommand::parseSharedFile(string filename) { } } //********************************************************************************************************************** -string SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector& outputNames) { +vector SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector& outputNames, map groupIndex) { try { - - ofstream out; - string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups.summary"; - - //open combined file - m->openOutputFile(combineFileName, out); - + //open each groups summary file - string newLabel = ""; - map > files; + vector newComboNames; + + map > > files; + map filesTypesLabels; + map filesTypesNumLines; for (int i=0; i thisFilesLines; - + ifstream temp; m->openInputFile(outputNames[i], temp); //read through first line - labels - string tempLabel; - if (i == 0) { //we want to save the labels to output below - for (int j = 0; j < numCols+1; j++) { - temp >> tempLabel; - - if (j == 1) { newLabel += "group\t" + tempLabel + '\t'; - }else{ newLabel += tempLabel + '\t'; } - } - }else{ for (int j = 0; j < numCols+1; j++) { temp >> tempLabel; } } + string labelsLine = m->getline(temp); + vector theseLabels = m->splitWhiteSpace(labelsLine); + + string newLabel = ""; + for (int j = 0; j < theseLabels.size(); j++) { + if (j == 1) { newLabel += "group\t" + theseLabels[j] + '\t'; + }else{ newLabel += theseLabels[j] + '\t'; } + } m->gobble(temp); + int stop = numLines; + if (theseLabels.size() != numCols+1) { stop = numLines*2; } //for each label - for (int k = 0; k < numLines; k++) { + for (int k = 0; k < stop; k++) { string thisLine = ""; string tempLabel; - for (int j = 0; j < numCols+1; j++) { + for (int j = 0; j < theseLabels.size(); j++) { temp >> tempLabel; //save for later - if (j == 1) { thisLine += groups[i] + "\t" + tempLabel + "\t"; } + if (j == 1) { thisLine += groupIndex[outputNames[i]] + "\t" + tempLabel + "\t"; } else{ thisLine += tempLabel + "\t"; } } @@ -588,31 +757,59 @@ string SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector< m->gobble(temp); } - - files[outputNames[i]] = thisFilesLines; + + string extension = m->getExtension(outputNames[i]); + if (theseLabels.size() != numCols+1) { extension = ".ave-std" + extension; } + string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension; + m->mothurRemove(combineFileName); //remove old file + filesTypesLabels[extension] = newLabel; + filesTypesNumLines[extension] = stop; + + map > >::iterator itFiles = files.find(extension); + if (itFiles != files.end()) { //add new files info to existing type + files[extension][outputNames[i]] = thisFilesLines; + }else { + map > thisFile; + thisFile[outputNames[i]] = thisFilesLines; + files[extension] = thisFile; + } temp.close(); m->mothurRemove(outputNames[i]); } - //output label line to new file - out << newLabel << endl; - - //for each label - for (int k = 0; k < numLines; k++) { - - //grab summary data for each group - for (int i=0; i > >::iterator itFiles = files.begin(); itFiles != files.end(); itFiles++) { + + if (m->control_pressed) { break; } + + string extension = itFiles->first; + map > thisType = itFiles->second; + string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension; + newComboNames.push_back(combineFileName); + //open combined file + ofstream out; + m->openOutputFile(combineFileName, out); + + //output label line to new file + out << filesTypesLabels[extension] << endl; + + //for each label + for (int k = 0; k < filesTypesNumLines[extension]; k++) { + + //grab summary data for each group + for (map >::iterator itType = thisType.begin(); itType != thisType.end(); itType++) { + out << (itType->second)[k]; + } + } + + outputNames.clear(); + + out.close(); + } //return combine file name - return combineFileName; + return newComboNames; } catch(exception& e) {