From: Sarah Westcott Date: Mon, 4 Jun 2012 16:27:23 +0000 (-0400) Subject: added check to make sure shhh.flows child processes finish properly. added subsamplin... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=36a6b02cf7f09d2bc34376b588944a9ca73429c5 added check to make sure shhh.flows child processes finish properly. added subsampling to summary.shared and summary.single. modified dist.shared to run original dataset as well as subsamples when subsample=t --- diff --git a/matrixoutputcommand.cpp b/matrixoutputcommand.cpp index 87929a4..05cd18a 100644 --- a/matrixoutputcommand.cpp +++ b/matrixoutputcommand.cpp @@ -462,11 +462,11 @@ int MatrixOutputCommand::process(vector thisLookup){ vector< vector< vector > > calcDistsTotals; //each iter, one for each calc, then each groupCombos dists. this will be used to make .dist files vector< vector > calcDists; calcDists.resize(matrixCalculators.size()); - for (int thisIter = 0; thisIter < iters; thisIter++) { + for (int thisIter = 0; thisIter < iters+1; thisIter++) { vector thisItersLookup = thisLookup; - if (subsample) { + if (subsample && (thisIter != 0)) { SubSample sample; vector tempLabels; //dont need since we arent printing the sampled sharedRabunds @@ -619,14 +619,40 @@ int MatrixOutputCommand::process(vector thisLookup){ #endif } - calcDistsTotals.push_back(calcDists); - - if (subsample) { - + if (subsample && (thisIter != 0)) { + calcDistsTotals.push_back(calcDists); //clean up memory for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; } thisItersLookup.clear(); for (int i = 0; i < calcDists.size(); i++) { calcDists[i].clear(); } + }else { //print results for whole dataset + for (int i = 0; i < calcDists.size(); i++) { + if (m->control_pressed) { break; } + + //initialize matrix + vector< vector > matrix; //square matrix to represent the distance + matrix.resize(thisLookup.size()); + for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); } + + for (int j = 0; j < calcDists[i].size(); j++) { + int row = calcDists[i][j].seq1; + int column = calcDists[i][j].seq2; + double dist = calcDists[i][j].dist; + + matrix[row][column] = dist; + matrix[column][row] = dist; + } + + string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist"; + outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName); + ofstream outDist; + m->openOutputFile(distFileName, outDist); + outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint); + + printSims(outDist, matrix); + + outDist.close(); + } } } @@ -729,35 +755,6 @@ int MatrixOutputCommand::process(vector thisLookup){ outSTD.close(); } - }else { - - for (int i = 0; i < calcDists.size(); i++) { - if (m->control_pressed) { break; } - - //initialize matrix - vector< vector > matrix; //square matrix to represent the distance - matrix.resize(thisLookup.size()); - for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); } - - for (int j = 0; j < calcDists[i].size(); j++) { - int row = calcDists[i][j].seq1; - int column = calcDists[i][j].seq2; - double dist = calcDists[i][j].dist; - - matrix[row][column] = dist; - matrix[column][row] = dist; - } - - string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist"; - outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName); - ofstream outDist; - m->openOutputFile(distFileName, outDist); - outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint); - - printSims(outDist, matrix); - - outDist.close(); - } } return 0; diff --git a/shhhercommand.cpp b/shhhercommand.cpp index be410d9..08bb017 100644 --- a/shhhercommand.cpp +++ b/shhhercommand.cpp @@ -1934,12 +1934,14 @@ int ShhherCommand::createProcesses(vector filenames){ //divide the groups between the processors vector lines; + vector numFilesToComplete; int numFilesPerProcessor = filenames.size() / processors; for (int i = 0; i < processors; i++) { int startIndex = i * numFilesPerProcessor; int endIndex = (i+1) * numFilesPerProcessor; if(i == (processors - 1)){ endIndex = filenames.size(); } lines.push_back(linePair(startIndex, endIndex)); + numFilesToComplete.push_back((endIndex-startIndex)); } #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) @@ -1953,6 +1955,14 @@ int ShhherCommand::createProcesses(vector filenames){ process++; }else if (pid == 0){ num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName + toString(getpid()) + ".temp", lines[process].start, lines[process].end); + + //pass numSeqs to parent + ofstream out; + string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp"; + m->openOutputFile(tempFile, out); + out << num << endl; + out.close(); + exit(0); }else { m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); @@ -2015,6 +2025,18 @@ int ShhherCommand::createProcesses(vector filenames){ #endif for (int i=0;iopenInputFile(tempFile, in); + if (!in.eof()) { + int tempNum = 0; + in >> tempNum; + if (tempNum != numFilesToComplete[i+1]) { + m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(numFilesToComplete[i+1]) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches. The flow files may be too large to process with multiple processors. \n"); + } + } + in.close(); m->mothurRemove(tempFile); + if (compositeFASTAFileName != "") { m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName); m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName); @@ -2083,6 +2105,8 @@ vector ShhherCommand::parseFlowFiles(string filename){ int ShhherCommand::driver(vector filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){ try { + int numCompleted = 0; + for(int i=start;icontrol_pressed) { break; } @@ -2281,12 +2305,13 @@ int ShhherCommand::driver(vector filenames, string thisCompositeFASTAFil } } + numCompleted++; m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n'); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - return 0; + return numCompleted; }catch(exception& e) { m->errorOut(e, "ShhherCommand", "driver"); diff --git a/subsample.cpp b/subsample.cpp index 457b7b9..f7da25f 100644 --- a/subsample.cpp +++ b/subsample.cpp @@ -322,8 +322,51 @@ int SubSample::eliminateZeroOTUS(vector& thislookup) { exit(1); } } +//********************************************************************************************************************** +int SubSample::getSample(SAbundVector*& sabund, int size) { + try { + + OrderVector* order = new OrderVector(); + *order = sabund->getOrderVector(NULL); + + int numBins = order->getNumBins(); + int thisSize = order->getNumSeqs(); + + if (thisSize > size) { + random_shuffle(order->begin(), order->end()); + + RAbundVector* rabund = new RAbundVector(numBins); + rabund->setLabel(order->getLabel()); - + for (int j = 0; j < size; j++) { + + if (m->control_pressed) { delete order; delete rabund; return 0; } + + int bin = order->get(j); + + int abund = rabund->get(bin); + rabund->set(bin, (abund+1)); + } + + delete sabund; + sabund = new SAbundVector(); + *sabund = rabund->getSAbundVector(); + delete rabund; + + }else if (thisSize < size) { m->mothurOut("[ERROR]: The size you requested is larger than the number of sequences in the sabund vector. You requested " + toString(size) + " and you only have " + toString(thisSize) + " seqs in your sabund vector.\n"); m->control_pressed = true; } + + if (m->control_pressed) { return 0; } + + delete order; + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SubSampleCommand", "getSample"); + exit(1); + } +} //********************************************************************************************************************** diff --git a/subsample.h b/subsample.h index feca37e..b00f1a7 100644 --- a/subsample.h +++ b/subsample.h @@ -27,6 +27,7 @@ class SubSample { //Tree* getSample(Tree*, TreeMap*, map, int); //creates new subsampled tree, destroys treemap so copy if needed. Tree* getSample(Tree*, TreeMap*, TreeMap*, int, map); //creates new subsampled tree. Uses first treemap to fill new treemap with sabsampled seqs. Sets groups of seqs not in subsample to "doNotIncludeMe". + int getSample(SAbundVector*&, int); //destroys sabundvector passed in, so copy it if you need it private: diff --git a/summarycommand.cpp b/summarycommand.cpp index 83ea8c9..85f0970 100644 --- a/summarycommand.cpp +++ b/summarycommand.cpp @@ -33,6 +33,7 @@ #include "boneh.h" #include "solow.h" #include "shen.h" +#include "subsample.h" //********************************************************************************************************************** vector SummaryCommand::setParameters(){ @@ -41,6 +42,8 @@ vector SummaryCommand::setParameters(){ CommandParameter prabund("rabund", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(prabund); CommandParameter psabund("sabund", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(psabund); CommandParameter pshared("shared", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(pshared); + CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample); + CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pcalc("calc", "Multiple", "sobs-chao-nseqs-coverage-ace-jack-shannon-shannoneven-npshannon-heip-smithwilson-simpson-simpsoneven-invsimpson-bootstrap-geometric-qstat-logseries-bergerparker-bstick-goodscoverage-efron-boneh-solow-shen", "sobs-chao-ace-jack-shannon-npshannon-simpson", "", "", "",true,false); parameters.push_back(pcalc); CommandParameter pabund("abund", "Number", "", "10", "", "", "",false,false); parameters.push_back(pabund); @@ -63,11 +66,13 @@ string SummaryCommand::getHelpString(){ try { string helpString = ""; ValidCalculators validCalculator; - helpString += "The summary.single command parameters are list, sabund, rabund, shared, label, calc, abund and groupmode. list, sabund, rabund or shared is required unless you have a valid current file.\n"; + helpString += "The summary.single command parameters are list, sabund, rabund, shared, subsample, iters, label, calc, abund and groupmode. list, sabund, rabund or shared is required unless you have a valid current file.\n"; helpString += "The summary.single command should be in the following format: \n"; helpString += "summary.single(label=yourLabel, calc=yourEstimators).\n"; helpString += "Example summary.single(label=unique-.01-.03, calc=sobs-chao-ace-jack-bootstrap-shannon-npshannon-simpson).\n"; helpString += validCalculator.printCalc("summary"); + helpString += "The subsample parameter allows you to enter the size of the sample or you can set subsample=T and mothur will use the size of your smallest group in the case of a shared file. With a list, sabund or rabund file you must provide a subsample size.\n"; + helpString += "The iters parameter allows you to choose the number of times you would like to run the subsample.\n"; helpString += "The default value calc is sobs-chao-ace-jack-shannon-npshannon-simpson\n"; helpString += "If you are running summary.single with a shared file and would like your summary results collated in one file, set groupmode=t. (Default=true).\n"; helpString += "The label parameter is used to analyze specific labels in your input.\n"; @@ -239,7 +244,22 @@ SummaryCommand::SummaryCommand(string option) { temp = validParameter.validFile(parameters, "groupmode", false); if (temp == "not found") { temp = "T"; } groupMode = m->isTrue(temp); - + temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; } + m->mothurConvert(temp, iters); + + temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; } + if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; } + else { + if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later + else { subsample = false; subsampleSize = -1; } + } + + if (subsample == false) { iters = 1; } + else { + //if you did not set a samplesize and are not using a sharedfile + if ((subsampleSize == -1) && (format != "sharedfile")) { m->mothurOut("[ERROR]: If you want to subsample with a list, rabund or sabund file, you must provide the sample size. You can do this by setting subsample=yourSampleSize.\n"); abort=true; } + } + } } catch(exception& e) { @@ -261,17 +281,23 @@ int SummaryCommand::execute(){ int numLines = 0; int numCols = 0; - + map groupIndex; + for (int p = 0; p < inputFileNames.size(); p++) { numLines = 0; numCols = 0; string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])) + "summary"; + string fileNameAve = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])) + "ave"; + string fileNameSTD = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])) + "std"; outputNames.push_back(fileNameRoot); outputTypes["summary"].push_back(fileNameRoot); + + if (inputFileNames.size() > 1) { m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[p]); m->mothurOutEndLine(); m->mothurOutEndLine(); + groupIndex[fileNameRoot] = groups[p]; } sumCalculators.clear(); @@ -342,6 +368,21 @@ int SummaryCommand::execute(){ ofstream outputFileHandle; m->openOutputFile(fileNameRoot, outputFileHandle); outputFileHandle << "label"; + + ofstream outAve, outSTD; + if (subsample) { + m->openOutputFile(fileNameAve, outAve); + m->openOutputFile(fileNameSTD, outSTD); + outputNames.push_back(fileNameAve); outputTypes["ave"].push_back(fileNameAve); + outputNames.push_back(fileNameSTD); outputTypes["std"].push_back(fileNameSTD); + outAve << "label"; outSTD << "label"; + outAve.setf(ios::fixed, ios::floatfield); outAve.setf(ios::showpoint); + outSTD.setf(ios::fixed, ios::floatfield); outSTD.setf(ios::showpoint); + if (inputFileNames.size() > 1) { + groupIndex[fileNameAve] = groups[p]; + groupIndex[fileNameSTD] = groups[p]; + } + } input = new InputData(inputFileNames[p], format); sabund = input->getSAbundVector(); @@ -350,24 +391,29 @@ int SummaryCommand::execute(){ for(int i=0;igetCols() == 1){ outputFileHandle << '\t' << sumCalculators[i]->getName(); + if (subsample) { outAve << '\t' << sumCalculators[i]->getName(); outSTD << '\t' << sumCalculators[i]->getName(); } numCols++; } else{ outputFileHandle << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; + if (subsample) { outAve << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; outSTD << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; } numCols += 3; } } outputFileHandle << endl; + if (subsample) { outSTD << endl; outAve << endl; } //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. set processedLabels; set userLabels = labels; - if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;icontrol_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;icontrol_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;icontrol_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;igetLabel()) == 1){ @@ -375,16 +421,9 @@ int SummaryCommand::execute(){ processedLabels.insert(sabund->getLabel()); userLabels.erase(sabund->getLabel()); - outputFileHandle << sabund->getLabel(); - for(int i=0;i data = sumCalculators[i]->getValues(sabund); - - if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;iprint(outputFileHandle); - } - outputFileHandle << endl; + process(sabund, outputFileHandle, outAve, outSTD); + + if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;igetLabel()); userLabels.erase(sabund->getLabel()); - outputFileHandle << sabund->getLabel(); - for(int i=0;i data = sumCalculators[i]->getValues(sabund); - - if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;iprint(outputFileHandle); - } - outputFileHandle << endl; + process(sabund, outputFileHandle, outAve, outSTD); + + if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;igetSAbundVector(); } - if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;icontrol_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i::iterator it; @@ -441,21 +473,15 @@ int SummaryCommand::execute(){ sabund = input->getSAbundVector(lastLabel); m->mothurOut(sabund->getLabel()); m->mothurOutEndLine(); - outputFileHandle << sabund->getLabel(); - for(int i=0;i data = sumCalculators[i]->getValues(sabund); - - if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;iprint(outputFileHandle); - } - outputFileHandle << endl; + process(sabund, outputFileHandle, outAve, outSTD); + + if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;icontrol_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;icontrol_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } //create summary file containing all the groups data for each label - this function just combines the info from the files already created. - if ((sharedfile != "") && (groupMode)) { outputNames.push_back(createGroupSummaryFile(numLines, numCols, outputNames)); } + if ((sharedfile != "") && (groupMode)) { vector comboNames = createGroupSummaryFile(numLines, numCols, outputNames, groupIndex); for (int i = 0; i < comboNames.size(); i++) { outputNames.push_back(comboNames[i]); } } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -484,6 +510,108 @@ int SummaryCommand::execute(){ } } //********************************************************************************************************************** +int SummaryCommand::process(SAbundVector*& sabund, ofstream& outputFileHandle, ofstream& outAve, ofstream& outStd) { + try { + + //calculator -> data -> values + vector< vector< vector > > results; results.resize(sumCalculators.size()); + + outputFileHandle << sabund->getLabel(); + + SubSample sample; + for (int thisIter = 0; thisIter < iters+1; thisIter++) { + + SAbundVector* thisIterSabund = sabund; + + //we want the summary results for the whole dataset, then the subsampling + if ((thisIter > 0) && subsample) { //subsample sabund and run it + //copy sabund since getSample destroys it + RAbundVector rabund = sabund->getRAbundVector(); + SAbundVector* newSabund = new SAbundVector(); + *newSabund = rabund.getSAbundVector(); + + sample.getSample(newSabund, subsampleSize); + thisIterSabund = newSabund; + } + + for(int i=0;i data = sumCalculators[i]->getValues(thisIterSabund); + + if (m->control_pressed) { return 0; } + + if (thisIter == 0) { + outputFileHandle << '\t'; + sumCalculators[i]->print(outputFileHandle); + }else { + //some of the calc have hci and lci need to make room for that + if (results[i].size() == 0) { results[i].resize(data.size()); } + //save results for ave and std. + for (int j = 0; j < data.size(); j++) { + if (m->control_pressed) { return 0; } + results[i][j].push_back(data[j]); + } + } + } + + //cleanup memory + if ((thisIter > 0) && subsample) { delete thisIterSabund; } + } + outputFileHandle << endl; + + if (subsample) { + outAve << sabund->getLabel() << '\t'; outStd << sabund->getLabel() << '\t'; + //find ave and std for this label and output + //will need to modify the createGroupSummary to combine results and not mess with the .summary file. + + //calcs -> values + vector< vector > calcAverages; calcAverages.resize(sumCalculators.size()); + for (int i = 0; i < calcAverages.size(); i++) { calcAverages[i].resize(results[i].size(), 0); } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator + for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero. + for (int j = 0; j < calcAverages[i].size(); j++) { + calcAverages[i][j] += results[i][j][thisIter]; + } + } + } + + for (int i = 0; i < calcAverages.size(); i++) { //finds average. + for (int j = 0; j < calcAverages[i].size(); j++) { + calcAverages[i][j] /= (float) iters; + outAve << calcAverages[i][j] << '\t'; + } + } + + //find standard deviation + vector< vector > stdDev; stdDev.resize(sumCalculators.size()); + for (int i = 0; i < stdDev.size(); i++) { stdDev[i].resize(results[i].size(), 0); } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each + for (int i = 0; i < stdDev.size(); i++) { + for (int j = 0; j < stdDev[i].size(); j++) { + stdDev[i][j] += ((results[i][j][thisIter] - calcAverages[i][j]) * (results[i][j][thisIter] - calcAverages[i][j])); + } + } + } + + for (int i = 0; i < stdDev.size(); i++) { //finds average. + for (int j = 0; j < stdDev[i].size(); j++) { + stdDev[i][j] /= (float) iters; + stdDev[i][j] = sqrt(stdDev[i][j]); + outStd << stdDev[i][j] << '\t'; + } + } + outAve << endl; outStd << endl; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SummaryCommand", "process"); + exit(1); + } +} +//********************************************************************************************************************** vector SummaryCommand::parseSharedFile(string filename) { try { vector filenames; @@ -496,12 +624,44 @@ vector SummaryCommand::parseSharedFile(string filename) { string sharedFileRoot = m->getRootName(filename); - //clears file before we start to write to it below + /******************************************************/ + if (subsample) { + if (subsampleSize == -1) { //user has not set size, set size = smallest samples size + subsampleSize = lookup[0]->getNumSeqs(); + for (int i = 1; i < lookup.size(); i++) { + int thisSize = lookup[i]->getNumSeqs(); + + if (thisSize < subsampleSize) { subsampleSize = thisSize; } + } + }else { + m->clearGroups(); + vector Groups; + vector temp; + for (int i = 0; i < lookup.size(); i++) { + if (lookup[i]->getNumSeqs() < subsampleSize) { + m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine(); + delete lookup[i]; + }else { + Groups.push_back(lookup[i]->getGroup()); + temp.push_back(lookup[i]); + } + } + lookup = temp; + m->setGroups(Groups); + } + + if (lookup.size() < 1) { m->mothurOut("You have not provided enough valid groups. I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; delete input; return filenames; } + } + + + /******************************************************/ + + //clears file before we start to write to it below for (int i=0; imothurRemove((sharedFileRoot + lookup[i]->getGroup() + ".rabund")); filenames.push_back((sharedFileRoot + lookup[i]->getGroup() + ".rabund")); } - + ofstream* temp; for (int i=0; i SummaryCommand::parseSharedFile(string filename) { } } //********************************************************************************************************************** -string SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector& outputNames) { +vector SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector& outputNames, map groupIndex) { try { - - ofstream out; - string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups.summary"; - - //open combined file - m->openOutputFile(combineFileName, out); - + //open each groups summary file + vector newComboNames; string newLabel = ""; - map > files; + map > > files; for (int i=0; igetExtension(outputNames[i]); + string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension; + m->mothurRemove(combineFileName); //remove old file + vector thisFilesLines; - + ifstream temp; m->openInputFile(outputNames[i], temp); @@ -578,7 +737,7 @@ string SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector< temp >> tempLabel; //save for later - if (j == 1) { thisLine += groups[i] + "\t" + tempLabel + "\t"; } + if (j == 1) { thisLine += groupIndex[outputNames[i]] + "\t" + tempLabel + "\t"; } else{ thisLine += tempLabel + "\t"; } } @@ -588,31 +747,52 @@ string SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector< m->gobble(temp); } - - files[outputNames[i]] = thisFilesLines; + + map > >::iterator itFiles = files.find(extension); + if (itFiles != files.end()) { //add new files info to existing type + files[extension][outputNames[i]] = thisFilesLines; + }else { + map > thisFile; + thisFile[outputNames[i]] = thisFilesLines; + files[extension] = thisFile; + } temp.close(); m->mothurRemove(outputNames[i]); } - //output label line to new file - out << newLabel << endl; - - //for each label - for (int k = 0; k < numLines; k++) { - - //grab summary data for each group - for (int i=0; i > >::iterator itFiles = files.begin(); itFiles != files.end(); itFiles++) { + + if (m->control_pressed) { break; } + + string extension = itFiles->first; + map > thisType = itFiles->second; + string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension; + newComboNames.push_back(combineFileName); + //open combined file + ofstream out; + m->openOutputFile(combineFileName, out); + + //output label line to new file + out << newLabel << endl; + + //for each label + for (int k = 0; k < numLines; k++) { + + //grab summary data for each group + for (map >::iterator itType = thisType.begin(); itType != thisType.end(); itType++) { + out << (itType->second)[k]; + } + } + + outputNames.clear(); + + out.close(); + } //return combine file name - return combineFileName; + return newComboNames; } catch(exception& e) { diff --git a/summarycommand.h b/summarycommand.h index 15b7f40..1d93a33 100644 --- a/summarycommand.h +++ b/summarycommand.h @@ -37,9 +37,9 @@ private: vector sumCalculators; InputData* input; SAbundVector* sabund; - int abund, size; + int abund, size, iters, subsampleSize; - bool abort, allLines, groupMode; + bool abort, allLines, groupMode, subsample; set labels; //holds labels to be used string label, calc, outputDir, sharedfile, listfile, rabundfile, sabundfile, format, inputfile; vector Estimators; @@ -47,7 +47,8 @@ private: vector groups; vector parseSharedFile(string); - string createGroupSummaryFile(int, int, vector&); + vector createGroupSummaryFile(int, int, vector&, map); + int process(SAbundVector*&, ofstream&, ofstream&, ofstream&); }; diff --git a/summarysharedcommand.cpp b/summarysharedcommand.cpp index 77961f1..ed02804 100644 --- a/summarysharedcommand.cpp +++ b/summarysharedcommand.cpp @@ -365,11 +365,38 @@ int SummarySharedCommand::execute(){ return 0; } /******************************************************/ - + if (subsample) { + if (subsampleSize == -1) { //user has not set size, set size = smallest samples size + subsampleSize = lookup[0]->getNumSeqs(); + for (int i = 1; i < lookup.size(); i++) { + int thisSize = lookup[i]->getNumSeqs(); + + if (thisSize < subsampleSize) { subsampleSize = thisSize; } + } + }else { + m->clearGroups(); + Groups.clear(); + vector temp; + for (int i = 0; i < lookup.size(); i++) { + if (lookup[i]->getNumSeqs() < subsampleSize) { + m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine(); + delete lookup[i]; + }else { + Groups.push_back(lookup[i]->getGroup()); + temp.push_back(lookup[i]); + } + } + lookup = temp; + m->setGroups(Groups); + } + + if (lookup.size() < 2) { m->mothurOut("You have not provided enough valid groups. I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; delete input; return 0; } + } + /******************************************************/ //comparison breakup to be used by different processes later - numGroups = m->getNumGroups(); + numGroups = lookup.size(); lines.resize(processors); for (int i = 0; i < processors; i++) { lines[i].start = int (sqrt(float(i)/float(processors)) * numGroups); @@ -527,11 +554,11 @@ int SummarySharedCommand::process(vector thisLookup, string vector< vector< vector > > calcDistsTotals; //each iter, one for each calc, then each groupCombos dists. this will be used to make .dist files vector< vector > calcDists; calcDists.resize(sumCalculators.size()); - for (int thisIter = 0; thisIter < iters; thisIter++) { + for (int thisIter = 0; thisIter < iters+1; thisIter++) { vector thisItersLookup = thisLookup; - if (subsample) { + if (subsample && (thisIter != 0)) { //we want the summary results for the whole dataset, then the subsampling SubSample sample; vector tempLabels; //dont need since we arent printing the sampled sharedRabunds @@ -710,14 +737,44 @@ int SummarySharedCommand::process(vector thisLookup, string #endif } - calcDistsTotals.push_back(calcDists); - if (subsample) { + if (subsample && (thisIter != 0)) { //we want the summary results for the whole dataset, then the subsampling + calcDistsTotals.push_back(calcDists); //clean up memory for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; } thisItersLookup.clear(); for (int i = 0; i < calcDists.size(); i++) { calcDists[i].clear(); } + }else { + if (createPhylip) { + for (int i = 0; i < calcDists.size(); i++) { + if (m->control_pressed) { break; } + + //initialize matrix + vector< vector > matrix; //square matrix to represent the distance + matrix.resize(thisLookup.size()); + for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); } + + for (int j = 0; j < calcDists[i].size(); j++) { + int row = calcDists[i][j].seq1; + int column = calcDists[i][j].seq2; + double dist = calcDists[i][j].dist; + + matrix[row][column] = dist; + matrix[column][row] = dist; + } + + string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist"; + outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName); + ofstream outDist; + m->openOutputFile(distFileName, outDist); + outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint); + + printSims(outDist, matrix); + + outDist.close(); + } + } } } @@ -820,38 +877,8 @@ int SummarySharedCommand::process(vector thisLookup, string outSTD.close(); } - }else { - if (createPhylip) { - for (int i = 0; i < calcDists.size(); i++) { - if (m->control_pressed) { break; } - - //initialize matrix - vector< vector > matrix; //square matrix to represent the distance - matrix.resize(thisLookup.size()); - for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); } - - for (int j = 0; j < calcDists[i].size(); j++) { - int row = calcDists[i][j].seq1; - int column = calcDists[i][j].seq2; - double dist = calcDists[i][j].dist; - - matrix[row][column] = dist; - matrix[column][row] = dist; - } - - string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist"; - outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName); - ofstream outDist; - m->openOutputFile(distFileName, outDist); - outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint); - - printSims(outDist, matrix); - - outDist.close(); - } - } } - + return 0; } catch(exception& e) {