From 53171f07cc0c0e560e2b4ba2946f690d59fc2dc4 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Tue, 3 Apr 2012 08:25:26 -0400 Subject: [PATCH] working on adding subsampling to dist.shared. fixed bug in phylotype command related to "unknown" addition. --- Mothur.xcodeproj/project.pbxproj | 6 + matrixoutputcommand.cpp | 455 +++++++++++++++++++++---------- matrixoutputcommand.h | 4 +- mothurout.cpp | 2 +- nseqs.h | 2 +- phylotree.cpp | 13 +- sharedace.cpp | 2 +- sharedanderbergs.cpp | 2 +- sharedbraycurtis.cpp | 2 +- sharedchao1.cpp | 2 +- sharedjclass.cpp | 2 +- sharedkulczynski.cpp | 2 +- sharedkulczynskicody.cpp | 2 +- sharedlennon.cpp | 2 +- sharedmorisitahorn.cpp | 4 +- sharedochiai.cpp | 2 +- sharedsobs.cpp | 2 +- sharedsobscollectsummary.cpp | 2 +- sharedsorclass.cpp | 2 +- sharedthetan.cpp | 4 +- sharedthetayc.cpp | 4 +- subsample.cpp | 192 +++++++++++++ subsample.h | 35 +++ subsamplecommand.cpp | 123 +-------- subsamplecommand.h | 1 - uvest.cpp | 2 +- 26 files changed, 588 insertions(+), 283 deletions(-) create mode 100644 subsample.cpp create mode 100644 subsample.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index f904a8b..2088e37 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -52,6 +52,7 @@ A778FE6B134CA6CA00C0BA33 /* getcommandinfocommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */; }; A77A221F139001B600B0BE70 /* deuniquetreecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */; }; A77EBD2F1523709100ED407C /* createdatabasecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77EBD2E1523709100ED407C /* createdatabasecommand.cpp */; }; + A7876A26152A017C00A0AE86 /* subsample.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7876A25152A017C00A0AE86 /* subsample.cpp */; }; A79234D713C74BF6002B08E2 /* mothurfisher.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A79234D613C74BF6002B08E2 /* mothurfisher.cpp */; }; A795840D13F13CD900F201D5 /* countgroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A795840C13F13CD900F201D5 /* countgroupscommand.cpp */; }; A799F5B91309A3E000AEEFA0 /* makefastqcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A799F5B81309A3E000AEEFA0 /* makefastqcommand.cpp */; }; @@ -474,6 +475,8 @@ A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deuniquetreecommand.cpp; sourceTree = ""; }; A77EBD2C1523707F00ED407C /* createdatabasecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = createdatabasecommand.h; sourceTree = ""; }; A77EBD2E1523709100ED407C /* createdatabasecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = createdatabasecommand.cpp; sourceTree = ""; }; + A7876A25152A017C00A0AE86 /* subsample.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = subsample.cpp; sourceTree = ""; }; + A7876A28152A018B00A0AE86 /* subsample.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = subsample.h; sourceTree = ""; }; A79234D513C74BF6002B08E2 /* mothurfisher.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mothurfisher.h; sourceTree = ""; }; A79234D613C74BF6002B08E2 /* mothurfisher.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mothurfisher.cpp; sourceTree = ""; }; A795840B13F13CD900F201D5 /* countgroupscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = countgroupscommand.h; sourceTree = ""; }; @@ -1191,6 +1194,8 @@ A7E9B82D12D37EC400DA6239 /* singlelinkage.cpp */, A7E9B83012D37EC400DA6239 /* slibshuff.cpp */, A7E9B83112D37EC400DA6239 /* slibshuff.h */, + A7876A28152A018B00A0AE86 /* subsample.h */, + A7876A25152A017C00A0AE86 /* subsample.cpp */, A7C3DC0E14FE469500FE1924 /* trialswap2.h */, A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */, A7FF19F0140FFDA500AD216D /* trimoligos.h */, @@ -2301,6 +2306,7 @@ A7C3DC0F14FE469500FE1924 /* trialSwap2.cpp in Sources */, A76CDD821510F143004C8458 /* prcseqscommand.cpp in Sources */, A77EBD2F1523709100ED407C /* createdatabasecommand.cpp in Sources */, + A7876A26152A017C00A0AE86 /* subsample.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/matrixoutputcommand.cpp b/matrixoutputcommand.cpp index 5bfec37..cada2f1 100644 --- a/matrixoutputcommand.cpp +++ b/matrixoutputcommand.cpp @@ -8,17 +8,20 @@ */ #include "matrixoutputcommand.h" +#include "subsample.h" //********************************************************************************************************************** vector MatrixOutputCommand::setParameters(){ try { CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); + CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter pcalc("calc", "Multiple", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-whittaker-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-hamming-structchi2-gower-memchi2-memchord-memeuclidean-mempearson", "jclass-thetayc", "", "", "",true,false); parameters.push_back(pcalc); CommandParameter poutput("output", "Multiple", "lt-square", "lt", "", "", "",false,false); parameters.push_back(poutput); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); vector myArray; @@ -35,9 +38,11 @@ string MatrixOutputCommand::getHelpString(){ try { string helpString = ""; ValidCalculators validCalculator; - helpString += "The dist.shared command parameters are shared, groups, calc, output, processors and label. shared is a required, unless you have a valid current file.\n"; + helpString += "The dist.shared command parameters are shared, groups, calc, output, processors, subsample, iters and label. shared is a required, unless you have a valid current file.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included used.\n"; helpString += "The group names are separated by dashes. The label parameter allows you to select what distance levels you would like distance matrices created for, and is also separated by dashes.\n"; + helpString += "The iters parameter allows you to choose the number of times you would like to run the subsample.\n"; + helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group.\n"; helpString += "The dist.shared command should be in the following format: dist.shared(groups=yourGroups, calc=yourCalcs, label=yourLabels).\n"; helpString += "The output parameter allows you to specify format of your distance matrix. Options are lt, and square. The default is lt.\n"; helpString += "Example dist.shared(groups=A-B-C, calc=jabund-sorabund).\n"; @@ -60,6 +65,7 @@ MatrixOutputCommand::MatrixOutputCommand(){ setParameters(); vector tempOutNames; outputTypes["phylip"] = tempOutNames; + outputTypes["subsample"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "MatrixOutputCommand", "MatrixOutputCommand"); @@ -94,6 +100,7 @@ MatrixOutputCommand::MatrixOutputCommand(string option) { //initialize outputTypes vector tempOutNames; outputTypes["phylip"] = tempOutNames; + outputTypes["subsample"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -158,7 +165,19 @@ MatrixOutputCommand::MatrixOutputCommand(string option) { //remove citation from list of calcs for (int i = 0; i < Estimators.size(); i++) { if (Estimators[i] == "citation") { Estimators.erase(Estimators.begin()+i); break; } } } - + + temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; } + m->mothurConvert(temp, iters); + + temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; } + if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; } + else { + if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later + else { subsample = false; } + } + + if (subsample == false) { iters = 1; } + if (abort == false) { ValidCalculators validCalculator; @@ -288,6 +307,32 @@ int MatrixOutputCommand::execute(){ lines[i].start = int (sqrt(float(i)/float(processors)) * numGroups); lines[i].end = int (sqrt(float(i+1)/float(processors)) * numGroups); } + + if (subsample) { + if (subsampleSize == -1) { //user has not set size, set size = smallest samples size + subsampleSize = lookup[0]->getNumSeqs(); + for (int i = 1; i < lookup.size(); i++) { + int thisSize = lookup[i]->getNumSeqs(); + + if (thisSize < subsampleSize) { subsampleSize = thisSize; } + } + }else { + m->clearGroups(); + Groups.clear(); + vector temp; + for (int i = 0; i < lookup.size(); i++) { + if (lookup[i]->getNumSeqs() < subsampleSize) { + m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine(); + delete lookup[i]; + }else { + Groups.push_back(lookup[i]->getGroup()); + temp.push_back(lookup[i]); + } + } + lookup = temp; + m->setGroups(Groups); + } + } if (m->control_pressed) { delete input; for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } m->clearGroups(); return 0; } @@ -416,167 +461,285 @@ int MatrixOutputCommand::process(vector thisLookup){ try { EstOutput data; vector subset; - vector< vector > calcDists; calcDists.resize(matrixCalculators.size()); //one for each calc, this will be used to make .dist files - + vector< vector< vector > > calcDistsTotals; //each iter, one for each calc, then each groupCombos dists. this will be used to make .dist files + + vector< vector > calcDists; calcDists.resize(matrixCalculators.size()); - if(processors == 1){ - driver(thisLookup, 0, numGroups, calcDists); - }else{ - int process = 1; - vector processIDS; + for (int thisIter = 0; thisIter < iters; thisIter++) { - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - driver(thisLookup, lines[process].start, lines[process].end, calcDists); - - string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(getpid()) + ".dist"; - ofstream outtemp; - m->openOutputFile(tempdistFileName, outtemp); - - for (int i = 0; i < calcDists.size(); i++) { - outtemp << calcDists[i].size() << endl; - - for (int j = 0; j < calcDists[i].size(); j++) { - outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl; - } - } - outtemp.close(); - - exit(0); - }else { - m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); - for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } - exit(0); - } - } - - //parent do your part - driver(thisLookup, lines[0].start, lines[0].end, calcDists); - - //force parent to wait until all the processes are done - for (int i = 0; i < processIDS.size(); i++) { - int temp = processIDS[i]; - wait(&temp); - } - - for (int i = 0; i < processIDS.size(); i++) { - string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(processIDS[i]) + ".dist"; - ifstream intemp; - m->openInputFile(tempdistFileName, intemp); - - for (int k = 0; k < calcDists.size(); k++) { - int size = 0; - intemp >> size; m->gobble(intemp); - - for (int j = 0; j < size; j++) { - int seq1 = 0; - int seq2 = 0; - float dist = 1.0; - - intemp >> seq1 >> seq2 >> dist; m->gobble(intemp); - - seqDist tempDist(seq1, seq2, dist); - calcDists[k].push_back(tempDist); - } - } - intemp.close(); - m->mothurRemove(tempdistFileName); - } - #else - ////////////////////////////////////////////////////////////////////////////////////////////////////// - //Windows version shared memory, so be careful when passing variables through the distSharedData struct. - //Above fork() will clone, so memory is separate, but that's not the case with windows, - //Taking advantage of shared memory to pass results vectors. - ////////////////////////////////////////////////////////////////////////////////////////////////////// - - vector pDataArray; - DWORD dwThreadIdArray[processors-1]; - HANDLE hThreadArray[processors-1]; + vector thisItersLookup = thisLookup; - //Create processor worker threads. - for( int i=1; i tempLabels; //dont need since we arent printing the sampled sharedRabunds + thisItersLookup = sample.getSamplePreserve(thisLookup, tempLabels, subsampleSize); + } + cout << thisIter << endl; + if(processors == 1){ + driver(thisItersLookup, 0, numGroups, calcDists); + }else{ + int process = 1; + vector processIDS; + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); + process++; + }else if (pid == 0){ + + driver(thisItersLookup, lines[process].start, lines[process].end, calcDists); + + string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(getpid()) + ".dist"; + ofstream outtemp; + m->openOutputFile(tempdistFileName, outtemp); + + for (int i = 0; i < calcDists.size(); i++) { + outtemp << calcDists[i].size() << endl; + + for (int j = 0; j < calcDists[i].size(); j++) { + outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl; + } + } + outtemp.close(); + + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } + } - //make copy of lookup so we don't get access violations - vector newLookup; - for (int k = 0; k < thisLookup.size(); k++) { - SharedRAbundVector* temp = new SharedRAbundVector(); - temp->setLabel(thisLookup[k]->getLabel()); - temp->setGroup(thisLookup[k]->getGroup()); - newLookup.push_back(temp); + //parent do your part + driver(thisItersLookup, lines[0].start, lines[0].end, calcDists); + + //force parent to wait until all the processes are done + for (int i = 0; i < processIDS.size(); i++) { + int temp = processIDS[i]; + wait(&temp); } - //for each bin - for (int k = 0; k < thisLookup[0]->getNumBins(); k++) { - if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; } - for (int j = 0; j < thisLookup.size(); j++) { newLookup[j]->push_back(thisLookup[j]->getAbundance(k), thisLookup[j]->getGroup()); } + for (int i = 0; i < processIDS.size(); i++) { + string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(processIDS[i]) + ".dist"; + ifstream intemp; + m->openInputFile(tempdistFileName, intemp); + + for (int k = 0; k < calcDists.size(); k++) { + int size = 0; + intemp >> size; m->gobble(intemp); + + for (int j = 0; j < size; j++) { + int seq1 = 0; + int seq2 = 0; + float dist = 1.0; + + intemp >> seq1 >> seq2 >> dist; m->gobble(intemp); + + seqDist tempDist(seq1, seq2, dist); + calcDists[k].push_back(tempDist); + } + } + intemp.close(); + m->mothurRemove(tempdistFileName); } + #else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the distSharedData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + //Taking advantage of shared memory to pass results vectors. + ////////////////////////////////////////////////////////////////////////////////////////////////////// - // Allocate memory for thread data. - distSharedData* tempSum = new distSharedData(m, lines[i].start, lines[i].end, Estimators, newLookup); - pDataArray.push_back(tempSum); - processIDS.push_back(i); + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; - hThreadArray[i-1] = CreateThread(NULL, 0, MyDistSharedThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]); + //Create processor worker threads. + for( int i=1; i newLookup; + for (int k = 0; k < thisItersLookup.size(); k++) { + SharedRAbundVector* temp = new SharedRAbundVector(); + temp->setLabel(thisItersLookup[k]->getLabel()); + temp->setGroup(thisItersLookup[k]->getGroup()); + newLookup.push_back(temp); + } + + //for each bin + for (int k = 0; k < thisItersLookup[0]->getNumBins(); k++) { + if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; } + for (int j = 0; j < thisItersLookup.size(); j++) { newLookup[j]->push_back(thisItersLookup[j]->getAbundance(k), thisItersLookup[j]->getGroup()); } + } + + // Allocate memory for thread data. + distSharedData* tempSum = new distSharedData(m, lines[i].start, lines[i].end, Estimators, newLookup); + pDataArray.push_back(tempSum); + processIDS.push_back(i); + + hThreadArray[i-1] = CreateThread(NULL, 0, MyDistSharedThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]); + } + + //parent do your part + driver(thisItersLookup, lines[0].start, lines[0].end, calcDists); + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + for (int j = 0; j < pDataArray[i]->thisLookup.size(); j++) { delete pDataArray[i]->thisLookup[j]; } + + for (int k = 0; k < calcDists.size(); k++) { + int size = pDataArray[i]->calcDists[k].size(); + for (int j = 0; j < size; j++) { calcDists[k].push_back(pDataArray[i]->calcDists[k][j]); } + } + + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + + #endif } - //parent do your part - driver(thisLookup, lines[0].start, lines[0].end, calcDists); - - //Wait until all threads have terminated. - WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + calcDistsTotals.push_back(calcDists); - //Close all thread handles and free memory allocations. - for(int i=0; i < pDataArray.size(); i++){ - for (int j = 0; j < pDataArray[i]->thisLookup.size(); j++) { delete pDataArray[i]->thisLookup[j]; } + if (subsample) { + //clean up memory + // for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; } + // thisItersLookup.clear(); + } + } + + if (iters != 1) { + //we need to find the average distance and standard deviation for each groups distance + + vector< vector > calcAverages; calcAverages.resize(matrixCalculators.size()); + for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero. + calcAverages[i].resize(calcDists[i].size()); - for (int k = 0; k < calcDists.size(); k++) { - int size = pDataArray[i]->calcDists[k].size(); - for (int j = 0; j < size; j++) { calcDists[k].push_back(pDataArray[i]->calcDists[k][j]); } + for (int j = 0; j < calcAverages[i].size(); j++) { + calcAverages[i][j].seq1 = calcDists[i][j].seq1; + calcAverages[i][j].seq2 = calcDists[i][j].seq2; + calcAverages[i][j].dist = 0.0; + } + } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator + for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero. + for (int j = 0; j < calcAverages[i].size(); j++) { + calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist; + } + } + } + + for (int i = 0; i < calcAverages.size(); i++) { //finds average. + for (int j = 0; j < calcAverages[i].size(); j++) { + calcAverages[i][j].dist /= (float) iters; } + } + + //find standard deviation + vector< vector > stdDev; stdDev.resize(matrixCalculators.size()); + for (int i = 0; i < stdDev.size(); i++) { //initialize sums to zero. + stdDev[i].resize(calcDists[i].size()); - CloseHandle(hThreadArray[i]); - delete pDataArray[i]; + for (int j = 0; j < stdDev[i].size(); j++) { + stdDev[i][j].seq1 = calcDists[i][j].seq1; + stdDev[i][j].seq2 = calcDists[i][j].seq2; + stdDev[i][j].dist = 0.0; + } + } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each + for (int i = 0; i < stdDev.size(); i++) { + for (int j = 0; j < stdDev[i].size(); j++) { + stdDev[i][j].dist += ((calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist) * (calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist)); + } + } } - #endif - } + for (int i = 0; i < stdDev.size(); i++) { //finds average. + for (int j = 0; j < stdDev[i].size(); j++) { + stdDev[i][j].dist /= (float) iters; + stdDev[i][j].dist = sqrt(stdDev[i][j].dist); + } + } + + //print results + for (int i = 0; i < calcDists.size(); i++) { + vector< vector > matrix; //square matrix to represent the distance + matrix.resize(thisLookup.size()); + for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); } + + vector< vector > stdmatrix; //square matrix to represent the stdDev + stdmatrix.resize(thisLookup.size()); + for (int k = 0; k < thisLookup.size(); k++) { stdmatrix[k].resize(thisLookup.size(), 0.0); } + + + for (int j = 0; j < calcAverages[i].size(); j++) { + int row = calcAverages[i][j].seq1; + int column = calcAverages[i][j].seq2; + float dist = calcAverages[i][j].dist; + float stdDist = stdDev[i][j].dist; + + matrix[row][column] = dist; + matrix[column][row] = dist; + stdmatrix[row][column] = stdDist; + stdmatrix[column][row] = stdDist; + } + + string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".results"; + outputNames.push_back(distFileName); outputTypes["subsample"].push_back(distFileName); + ofstream outDist; + m->openOutputFile(distFileName, outDist); + outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint); + + outDist << "Group1\tGroup2\tAverageDist\tStdDev\n"; + for (int m = 0; m < matrix.size(); m++) { + for (int n = 0; n < m; n++) { + outDist << lookup[m]->getGroup() << '\t' << lookup[n]->getGroup() << '\t'; + outDist << matrix[m][n] << '\t' << stdmatrix[m][n] << endl; + } + } + outDist.close(); + } + + //output averages as distance matrix + calcDists = calcAverages; + } + + for (int i = 0; i < calcDists.size(); i++) { + if (m->control_pressed) { break; } + + //initialize matrix + vector< vector > matrix; //square matrix to represent the distance + matrix.resize(thisLookup.size()); + for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); } + + for (int j = 0; j < calcDists[i].size(); j++) { + int row = calcDists[i][j].seq1; + int column = calcDists[i][j].seq2; + float dist = calcDists[i][j].dist; + + matrix[row][column] = dist; + matrix[column][row] = dist; + } + + string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist"; + outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName); + ofstream outDist; + m->openOutputFile(distFileName, outDist); + outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint); + + printSims(outDist, matrix); + + outDist.close(); + } - - - for (int i = 0; i < calcDists.size(); i++) { - if (m->control_pressed) { break; } - - //initialize matrix - vector< vector > matrix; //square matrix to represent the distance - matrix.resize(thisLookup.size()); - for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); } - - for (int j = 0; j < calcDists[i].size(); j++) { - int row = calcDists[i][j].seq1; - int column = calcDists[i][j].seq2; - float dist = calcDists[i][j].dist; - - matrix[row][column] = dist; - matrix[column][row] = dist; - } - - string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist"; - outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName); - ofstream outDist; - m->openOutputFile(distFileName, outDist); - outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint); - - printSims(outDist, matrix); - - outDist.close(); - } return 0; } diff --git a/matrixoutputcommand.h b/matrixoutputcommand.h index dc77bdf..f915dfc 100644 --- a/matrixoutputcommand.h +++ b/matrixoutputcommand.h @@ -96,10 +96,10 @@ private: InputData* input; vector lookup; string exportFileName, output, sharedfile; - int numGroups, processors; + int numGroups, processors, iters, subsampleSize; ofstream out; - bool abort, allLines; + bool abort, allLines, subsample; set labels; //holds labels to be used string outputFile, calc, groups, label, outputDir; vector Estimators, Groups, outputNames; //holds estimators to be used diff --git a/mothurout.cpp b/mothurout.cpp index 4df5f96..98f5ce0 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -621,7 +621,7 @@ string MothurOut::hasPath(string longName){ string MothurOut::getExtension(string longName){ try { - string extension = longName; + string extension = ""; if(longName.find_last_of('.') != longName.npos){ int pos = longName.find_last_of('.'); diff --git a/nseqs.h b/nseqs.h index e82684b..c0f9549 100644 --- a/nseqs.h +++ b/nseqs.h @@ -31,7 +31,7 @@ public: int numGroups = shared.size(); data.clear(); data.resize(numGroups,0); - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //get bin values and set sharedByAll bool sharedByAll = true; for (int j = 0; j < numGroups; j++) { diff --git a/phylotree.cpp b/phylotree.cpp index dba1e3b..a9ef6cb 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -128,8 +128,6 @@ PhyloTree::PhyloTree(string tfile){ maxLevel = 0; calcTotals = true; string name, tax; - addSeqToTree("unknown", "unknown;"); - #ifdef USE_MPI int pid, num, processors; @@ -193,7 +191,16 @@ PhyloTree::PhyloTree(string tfile){ #endif assignHeirarchyIDs(0); - + + + string unknownTax = "unknown;"; + //added last taxon until you get desired level + for (int i = 1; i < maxLevel; i++) { + unknownTax += "unclassfied;"; + } + + addSeqToTree("unknown", unknownTax); + //create file for summary if needed setUp(tfile); } diff --git a/sharedace.cpp b/sharedace.cpp index c8ba4fa..2a06380 100644 --- a/sharedace.cpp +++ b/sharedace.cpp @@ -32,7 +32,7 @@ EstOutput SharedAce::getValues(vector shared) { S12 = number of shared OTUs in A and B This estimator was changed to reflect Caldwell's changes, eliminating the nrare / nrare - 1 */ - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //store in temps to avoid multiple repetitive function calls tempA = shared[0]->getAbundance(i); tempB = shared[1]->getAbundance(i); diff --git a/sharedanderbergs.cpp b/sharedanderbergs.cpp index 10dfbd5..cbb9d30 100644 --- a/sharedanderbergs.cpp +++ b/sharedanderbergs.cpp @@ -21,7 +21,7 @@ EstOutput Anderberg::getValues(vector shared) { data.resize(1,0); - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //store in temps to avoid multiple repetitive function calls tempA = shared[0]->getAbundance(i); tempB = shared[1]->getAbundance(i); diff --git a/sharedbraycurtis.cpp b/sharedbraycurtis.cpp index 0182e13..3711ce7 100644 --- a/sharedbraycurtis.cpp +++ b/sharedbraycurtis.cpp @@ -24,7 +24,7 @@ EstOutput BrayCurtis::getValues(vector shared) { sumSharedAB = the sum of the minimum otus int all shared otus in AB. */ - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //store in temps to avoid multiple repetitive function calls tempA = shared[0]->getAbundance(i); tempB = shared[1]->getAbundance(i); diff --git a/sharedchao1.cpp b/sharedchao1.cpp index d8cf60b..8d47ad2 100644 --- a/sharedchao1.cpp +++ b/sharedchao1.cpp @@ -29,7 +29,7 @@ EstOutput SharedChao1::getValues(vector shared){ //create and initialize trees to 0. initialTree(numGroups); - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //get bin values and calc shared bool sharedByAll = true; temp.clear(); diff --git a/sharedjclass.cpp b/sharedjclass.cpp index ac3e94e..ed21335 100644 --- a/sharedjclass.cpp +++ b/sharedjclass.cpp @@ -21,7 +21,7 @@ EstOutput Jclass::getValues(vector shared) { data.resize(1,0); - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //store in temps to avoid multiple repetitive function calls tempA = shared[0]->getAbundance(i); tempB = shared[1]->getAbundance(i); diff --git a/sharedkulczynski.cpp b/sharedkulczynski.cpp index 6aad5df..5c91ddb 100644 --- a/sharedkulczynski.cpp +++ b/sharedkulczynski.cpp @@ -21,7 +21,7 @@ EstOutput Kulczynski::getValues(vector shared) { data.resize(1,0); - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //store in temps to avoid multiple repetitive function calls tempA = shared[0]->getAbundance(i); tempB = shared[1]->getAbundance(i); diff --git a/sharedkulczynskicody.cpp b/sharedkulczynskicody.cpp index de90252..8c8b7f7 100644 --- a/sharedkulczynskicody.cpp +++ b/sharedkulczynskicody.cpp @@ -21,7 +21,7 @@ EstOutput KulczynskiCody::getValues(vector shared) { data.resize(1,0); - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //store in temps to avoid multiple repetitive function calls tempA = shared[0]->getAbundance(i); tempB = shared[1]->getAbundance(i); diff --git a/sharedlennon.cpp b/sharedlennon.cpp index f3ed5ec..5219275 100644 --- a/sharedlennon.cpp +++ b/sharedlennon.cpp @@ -21,7 +21,7 @@ EstOutput Lennon::getValues(vector shared) { data.resize(1,0); - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //store in temps to avoid multiple repetitive function calls tempA = shared[0]->getAbundance(i); tempB = shared[1]->getAbundance(i); diff --git a/sharedmorisitahorn.cpp b/sharedmorisitahorn.cpp index bd4923f..16a7590 100644 --- a/sharedmorisitahorn.cpp +++ b/sharedmorisitahorn.cpp @@ -20,14 +20,14 @@ EstOutput MorHorn::getValues(vector shared) { morhorn = 0.0; sumSharedA = 0.0; sumSharedB = 0.0; a = 0.0; b = 0.0; d = 0.0; //get the total values we need to calculate the theta denominator sums - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //store in temps to avoid multiple repetitive function calls Atotal += shared[0]->getAbundance(i); Btotal += shared[1]->getAbundance(i); } //calculate the denominator sums - for (int j = 0; j < shared[0]->size(); j++) { + for (int j = 0; j < shared[0]->getNumBins(); j++) { //store in temps to avoid multiple repetitive function calls tempA = shared[0]->getAbundance(j); tempB = shared[1]->getAbundance(j); diff --git a/sharedochiai.cpp b/sharedochiai.cpp index 004e535..b49fa4a 100644 --- a/sharedochiai.cpp +++ b/sharedochiai.cpp @@ -21,7 +21,7 @@ EstOutput Ochiai::getValues(vector shared) { data.resize(1,0); - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //store in temps to avoid multiple repetitive function calls tempA = shared[0]->getAbundance(i); tempB = shared[1]->getAbundance(i); diff --git a/sharedsobs.cpp b/sharedsobs.cpp index 36cacd6..f9b5edd 100644 --- a/sharedsobs.cpp +++ b/sharedsobs.cpp @@ -19,7 +19,7 @@ EstOutput SharedSobs::getValues(vector shared){ double observed = 0; //loop through the species in each group - for (int k = 0; k < shared[0]->size(); k++) { + for (int k = 0; k < shared[0]->getNumBins(); k++) { //if you have found a new species if (shared[0]->getAbundance(k) != 0) { observed++; } else if ((shared[0]->getAbundance(k) == 0) && (shared[1]->getAbundance(k) != 0)) { observed++; } diff --git a/sharedsobscollectsummary.cpp b/sharedsobscollectsummary.cpp index e2e169c..fffed02 100644 --- a/sharedsobscollectsummary.cpp +++ b/sharedsobscollectsummary.cpp @@ -19,7 +19,7 @@ EstOutput SharedSobsCS::getValues(vector shared){ double observed = 0; int numGroups = shared.size(); - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //get bin values and set sharedByAll bool sharedByAll = true; for (int j = 0; j < numGroups; j++) { diff --git a/sharedsorclass.cpp b/sharedsorclass.cpp index 32728f5..85609da 100644 --- a/sharedsorclass.cpp +++ b/sharedsorclass.cpp @@ -21,7 +21,7 @@ EstOutput SorClass::getValues(vector shared) { data.resize(1,0); - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //store in temps to avoid multiple repetitive function calls tempA = shared[0]->getAbundance(i); tempB = shared[1]->getAbundance(i); diff --git a/sharedthetan.cpp b/sharedthetan.cpp index 7c42a18..644adee 100644 --- a/sharedthetan.cpp +++ b/sharedthetan.cpp @@ -20,14 +20,14 @@ EstOutput ThetaN::getValues(vector shared) { numerator = 0.0; denominator = 0.0; thetaN = 0.0; sumSharedA = 0.0; sumSharedB = 0.0; a = 0.0; b = 0.0; d = 0.0; //get the total values we need to calculate the theta denominator sums - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //store in temps to avoid multiple repetitive function calls Atotal += shared[0]->getAbundance(i); Btotal += shared[1]->getAbundance(i); } //calculate the theta denominator sums - for (int j = 0; j < shared[0]->size(); j++) { + for (int j = 0; j < shared[0]->getNumBins(); j++) { //store in temps to avoid multiple repetitive function calls tempA = shared[0]->getAbundance(j); tempB = shared[1]->getAbundance(j); diff --git a/sharedthetayc.cpp b/sharedthetayc.cpp index 315a61f..6c0f6c7 100644 --- a/sharedthetayc.cpp +++ b/sharedthetayc.cpp @@ -29,14 +29,14 @@ EstOutput ThetaYC::getValues(vector shared) { double sumPsqQ = 0; //get the total values we need to calculate the theta denominator sums - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //store in temps to avoid multiple repetitive function calls Atotal += (double)shared[0]->getAbundance(i); Btotal += (double)shared[1]->getAbundance(i); } //calculate the theta denominator sums - for (int j = 0; j < shared[0]->size(); j++) { + for (int j = 0; j < shared[0]->getNumBins(); j++) { //store in temps to avoid multiple repetitive function calls pi = shared[0]->getAbundance(j) / Atotal; qi = shared[1]->getAbundance(j) / Btotal; diff --git a/subsample.cpp b/subsample.cpp new file mode 100644 index 0000000..d5b4e3e --- /dev/null +++ b/subsample.cpp @@ -0,0 +1,192 @@ +// +// subsample.cpp +// Mothur +// +// Created by Sarah Westcott on 4/2/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "subsample.h" + +//********************************************************************************************************************** +vector SubSample::getSamplePreserve(vector& thislookup, vector& newLabels, int size) { + try { + + vector newlookup; newlookup.resize(thislookup.size(), NULL); + + //save mothurOut's binLabels to restore for next label + vector saveBinLabels = m->currentBinLabels; + + int numBins = thislookup[0]->getNumBins(); + for (int i = 0; i < thislookup.size(); i++) { + int thisSize = thislookup[i]->getNumSeqs(); + + if (thisSize != size) { + + string thisgroup = thislookup[i]->getGroup(); + + OrderVector order; + for(int p=0;pgetAbundance(p);j++){ + order.push_back(p); + } + } + random_shuffle(order.begin(), order.end()); + + SharedRAbundVector* temp = new SharedRAbundVector(numBins); + temp->setLabel(thislookup[i]->getLabel()); + temp->setGroup(thislookup[i]->getGroup()); + + newlookup[i] = temp; + + for (int j = 0; j < size; j++) { + + if (m->control_pressed) { return newlookup; } + + int bin = order.get(j); + + int abund = newlookup[i]->getAbundance(bin); + newlookup[i]->set(bin, (abund+1), thisgroup); + } + } + } + + //subsampling may have created some otus with no sequences in them + eliminateZeroOTUS(newlookup); + + if (m->control_pressed) { return newlookup; } + + //save mothurOut's binLabels to restore for next label + newLabels = m->currentBinLabels; + m->currentBinLabels = saveBinLabels; + + return newlookup; + + } + catch(exception& e) { + m->errorOut(e, "SubSample", "getSamplePreserve"); + exit(1); + } +} +//********************************************************************************************************************** +vector SubSample::getSample(vector& thislookup, int size) { + try { + + //save mothurOut's binLabels to restore for next label + vector saveBinLabels = m->currentBinLabels; + + int numBins = thislookup[0]->getNumBins(); + for (int i = 0; i < thislookup.size(); i++) { + int thisSize = thislookup[i]->getNumSeqs(); + + if (thisSize != size) { + + string thisgroup = thislookup[i]->getGroup(); + + OrderVector order; + for(int p=0;pgetAbundance(p);j++){ + order.push_back(p); + } + } + random_shuffle(order.begin(), order.end()); + + SharedRAbundVector* temp = new SharedRAbundVector(numBins); + temp->setLabel(thislookup[i]->getLabel()); + temp->setGroup(thislookup[i]->getGroup()); + + delete thislookup[i]; + thislookup[i] = temp; + + + for (int j = 0; j < size; j++) { + + if (m->control_pressed) { return m->currentBinLabels; } + + int bin = order.get(j); + + int abund = thislookup[i]->getAbundance(bin); + thislookup[i]->set(bin, (abund+1), thisgroup); + } + } + } + + //subsampling may have created some otus with no sequences in them + eliminateZeroOTUS(thislookup); + + if (m->control_pressed) { return m->currentBinLabels; } + + //save mothurOut's binLabels to restore for next label + vector subsampleBinLabels = m->currentBinLabels; + m->currentBinLabels = saveBinLabels; + + return subsampleBinLabels; + + } + catch(exception& e) { + m->errorOut(e, "SubSample", "getSample"); + exit(1); + } +} +//********************************************************************************************************************** +int SubSample::eliminateZeroOTUS(vector& thislookup) { + try { + + vector newLookup; + for (int i = 0; i < thislookup.size(); i++) { + SharedRAbundVector* temp = new SharedRAbundVector(); + temp->setLabel(thislookup[i]->getLabel()); + temp->setGroup(thislookup[i]->getGroup()); + newLookup.push_back(temp); + } + + //for each bin + vector newBinLabels; + string snumBins = toString(thislookup[0]->getNumBins()); + for (int i = 0; i < thislookup[0]->getNumBins(); i++) { + if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; } + + //look at each sharedRabund and make sure they are not all zero + bool allZero = true; + for (int j = 0; j < thislookup.size(); j++) { + if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; } + } + + //if they are not all zero add this bin + if (!allZero) { + for (int j = 0; j < thislookup.size(); j++) { + newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup()); + } + //if there is a bin label use it otherwise make one + string binLabel = "Otu"; + string sbinNumber = toString(i+1); + if (sbinNumber.length() < snumBins.length()) { + int diff = snumBins.length() - sbinNumber.length(); + for (int h = 0; h < diff; h++) { binLabel += "0"; } + } + binLabel += sbinNumber; + if (i < m->currentBinLabels.size()) { binLabel = m->currentBinLabels[i]; } + + newBinLabels.push_back(binLabel); + } + } + + for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; } + thislookup.clear(); + + thislookup = newLookup; + m->currentBinLabels = newBinLabels; + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SubSample", "eliminateZeroOTUS"); + exit(1); + } +} + + +//********************************************************************************************************************** + + diff --git a/subsample.h b/subsample.h new file mode 100644 index 0000000..9156e09 --- /dev/null +++ b/subsample.h @@ -0,0 +1,35 @@ +#ifndef Mothur_subsample_h +#define Mothur_subsample_h + +// +// subsample.h +// Mothur +// +// Created by Sarah Westcott on 4/2/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "mothurout.h" +#include "sharedrabundvector.h" + +//subsampling overwrites the sharedRabunds. If you need to reuse the original use the getSamplePreserve function. + +class SubSample { + + public: + + SubSample() { m = MothurOut::getInstance(); } + ~SubSample() {} + + vector getSample(vector&, int); //returns the bin labels for the subsample, mothurOuts binlabels are preserved so you can run this multiple times. + + vector getSamplePreserve(vector&, vector&, int); + + private: + + MothurOut* m; + int eliminateZeroOTUS(vector&); + +}; + +#endif diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index d4e2c75..717b1d3 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -10,6 +10,7 @@ #include "subsamplecommand.h" #include "sharedutilities.h" #include "deconvolutecommand.h" +#include "subsample.h" //********************************************************************************************************************** vector SubSampleCommand::setParameters(){ @@ -801,68 +802,28 @@ int SubSampleCommand::processShared(vector& thislookup) { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); } string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + thislookup[0]->getLabel() + ".subsample" + m->getExtension(sharedfile); - - - ofstream out; + + SubSample sample; + vector subsampledLabels = sample.getSample(thislookup, size); + + if (m->control_pressed) { return 0; } + + ofstream out; m->openOutputFile(outputFileName, out); outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName); - int numBins = thislookup[0]->getNumBins(); - for (int i = 0; i < thislookup.size(); i++) { - int thisSize = thislookup[i]->getNumSeqs(); - - if (thisSize != size) { - - string thisgroup = thislookup[i]->getGroup(); - - OrderVector* order = new OrderVector(); - for(int p=0;pgetAbundance(p);j++){ - order->push_back(p); - } - } - random_shuffle(order->begin(), order->end()); - - SharedRAbundVector* temp = new SharedRAbundVector(numBins); - temp->setLabel(thislookup[i]->getLabel()); - temp->setGroup(thislookup[i]->getGroup()); - - delete thislookup[i]; - thislookup[i] = temp; - - - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { delete order; out.close(); return 0; } - - //get random number to sample from order between 0 and thisSize-1. - //don't need this because of the random shuffle above - //int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - int bin = order->get(j); - - int abund = thislookup[i]->getAbundance(bin); - thislookup[i]->set(bin, (abund+1), thisgroup); - } - delete order; - } - } - - //subsampling may have created some otus with no sequences in them - eliminateZeroOTUS(thislookup); - - if (m->control_pressed) { out.close(); return 0; } - + m->currentBinLabels = subsampledLabels; + thislookup[0]->printHeaders(out); for (int i = 0; i < thislookup.size(); i++) { out << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << '\t'; thislookup[i]->print(out); } - out.close(); - - //save mothurOut's binLabels to restore for next label + + + //save mothurOut's binLabels to restore for next label m->currentBinLabels = saveBinLabels; return 0; @@ -1523,64 +1484,6 @@ int SubSampleCommand::processSabund(SAbundVector*& sabund, ofstream& out) { } } //********************************************************************************************************************** -int SubSampleCommand::eliminateZeroOTUS(vector& thislookup) { - try { - - vector newLookup; - for (int i = 0; i < thislookup.size(); i++) { - SharedRAbundVector* temp = new SharedRAbundVector(); - temp->setLabel(thislookup[i]->getLabel()); - temp->setGroup(thislookup[i]->getGroup()); - newLookup.push_back(temp); - } - - //for each bin - vector newBinLabels; - string snumBins = toString(thislookup[0]->getNumBins()); - for (int i = 0; i < thislookup[0]->getNumBins(); i++) { - if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; } - - //look at each sharedRabund and make sure they are not all zero - bool allZero = true; - for (int j = 0; j < thislookup.size(); j++) { - if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; } - } - - //if they are not all zero add this bin - if (!allZero) { - for (int j = 0; j < thislookup.size(); j++) { - newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup()); - } - //if there is a bin label use it otherwise make one - string binLabel = "Otu"; - string sbinNumber = toString(i+1); - if (sbinNumber.length() < snumBins.length()) { - int diff = snumBins.length() - sbinNumber.length(); - for (int h = 0; h < diff; h++) { binLabel += "0"; } - } - binLabel += sbinNumber; - if (i < m->currentBinLabels.size()) { binLabel = m->currentBinLabels[i]; } - - newBinLabels.push_back(binLabel); - } - } - - for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; } - thislookup.clear(); - - thislookup = newLookup; - m->currentBinLabels = newBinLabels; - - return 0; - - } - catch(exception& e) { - m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS"); - exit(1); - } -} - -//********************************************************************************************************************** diff --git a/subsamplecommand.h b/subsamplecommand.h index 4be3570..7235a7b 100644 --- a/subsamplecommand.h +++ b/subsamplecommand.h @@ -45,7 +45,6 @@ private: vector names; map > nameMap; - int eliminateZeroOTUS(vector&); int getSubSampleShared(); int getSubSampleList(); int getSubSampleRabund(); diff --git a/uvest.cpp b/uvest.cpp index 8aa1666..f0ca81e 100644 --- a/uvest.cpp +++ b/uvest.cpp @@ -29,7 +29,7 @@ EstOutput UVEst::getUVest(vector shared) { sumSharedA1 = the sum of all shared otus in A where B = 1 sumSharedB1 = the sum of all shared otus in B where A = 1 */ - for (int i = 0; i < shared[0]->size(); i++) { + for (int i = 0; i < shared[0]->getNumBins(); i++) { //store in temps to avoid multiple repetitive function calls tempA = shared[0]->getAbundance(i); tempB = shared[1]->getAbundance(i); -- 2.39.2