git.donarmstrong.com Git - mothur.git/commitdiff
working on adding subsampling to dist.shared. fixed bug in phylotype command related...
author: Sarah Westcott <mothur.westcott@gmail.com>
Tue, 3 Apr 2012 12:25:26 +0000 (08:25 -0400)
committer: Sarah Westcott <mothur.westcott@gmail.com>
Tue, 3 Apr 2012 12:25:26 +0000 (08:25 -0400)
26 files changed:
Mothur.xcodeproj/project.pbxproj
matrixoutputcommand.cpp
matrixoutputcommand.h
mothurout.cpp
nseqs.h
phylotree.cpp
sharedace.cpp
sharedanderbergs.cpp
sharedbraycurtis.cpp
sharedchao1.cpp
sharedjclass.cpp
sharedkulczynski.cpp
sharedkulczynskicody.cpp
sharedlennon.cpp
sharedmorisitahorn.cpp
sharedochiai.cpp
sharedsobs.cpp
sharedsobscollectsummary.cpp
sharedsorclass.cpp
sharedthetan.cpp
sharedthetayc.cpp
subsample.cpp [new file with mode: 0644]
subsample.h [new file with mode: 0644]
subsamplecommand.cpp
subsamplecommand.h
uvest.cpp

index f904a8b33650f30c1432f5866481b22a749ee2a9..2088e3791a7a7602e0a8015c06606afad8dc5d52 100644 (file)
@@ -52,6 +52,7 @@
                A778FE6B134CA6CA00C0BA33 /* getcommandinfocommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */; };
                A77A221F139001B600B0BE70 /* deuniquetreecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */; };
                A77EBD2F1523709100ED407C /* createdatabasecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77EBD2E1523709100ED407C /* createdatabasecommand.cpp */; };
+               A7876A26152A017C00A0AE86 /* subsample.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7876A25152A017C00A0AE86 /* subsample.cpp */; };
                A79234D713C74BF6002B08E2 /* mothurfisher.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A79234D613C74BF6002B08E2 /* mothurfisher.cpp */; };
                A795840D13F13CD900F201D5 /* countgroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A795840C13F13CD900F201D5 /* countgroupscommand.cpp */; };
                A799F5B91309A3E000AEEFA0 /* makefastqcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A799F5B81309A3E000AEEFA0 /* makefastqcommand.cpp */; };
                A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deuniquetreecommand.cpp; sourceTree = "<group>"; };
                A77EBD2C1523707F00ED407C /* createdatabasecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = createdatabasecommand.h; sourceTree = "<group>"; };
                A77EBD2E1523709100ED407C /* createdatabasecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = createdatabasecommand.cpp; sourceTree = "<group>"; };
+               A7876A25152A017C00A0AE86 /* subsample.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = subsample.cpp; sourceTree = "<group>"; };
+               A7876A28152A018B00A0AE86 /* subsample.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = subsample.h; sourceTree = "<group>"; };
                A79234D513C74BF6002B08E2 /* mothurfisher.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mothurfisher.h; sourceTree = "<group>"; };
                A79234D613C74BF6002B08E2 /* mothurfisher.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mothurfisher.cpp; sourceTree = "<group>"; };
                A795840B13F13CD900F201D5 /* countgroupscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = countgroupscommand.h; sourceTree = "<group>"; };
                                A7E9B82D12D37EC400DA6239 /* singlelinkage.cpp */,
                                A7E9B83012D37EC400DA6239 /* slibshuff.cpp */,
                                A7E9B83112D37EC400DA6239 /* slibshuff.h */,
+                               A7876A28152A018B00A0AE86 /* subsample.h */,
+                               A7876A25152A017C00A0AE86 /* subsample.cpp */,
                                A7C3DC0E14FE469500FE1924 /* trialswap2.h */,
                                A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */,
                                A7FF19F0140FFDA500AD216D /* trimoligos.h */,
                                A7C3DC0F14FE469500FE1924 /* trialSwap2.cpp in Sources */,
                                A76CDD821510F143004C8458 /* prcseqscommand.cpp in Sources */,
                                A77EBD2F1523709100ED407C /* createdatabasecommand.cpp in Sources */,
+                               A7876A26152A017C00A0AE86 /* subsample.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
index 5bfec377de51d966de6d759161a99b43b7dccfdc..cada2f16035f6bfea109aa4112908b9bb8c128f3 100644 (file)
@@ -8,17 +8,20 @@
  */
 
 #include "matrixoutputcommand.h"
+#include "subsample.h"
 
 //**********************************************************************************************************************
 vector<string> MatrixOutputCommand::setParameters(){   
        try {
                CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared);
                CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
+        CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample);
                CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
                CommandParameter pcalc("calc", "Multiple", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-whittaker-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-hamming-structchi2-gower-memchi2-memchord-memeuclidean-mempearson", "jclass-thetayc", "", "", "",true,false); parameters.push_back(pcalc);
                CommandParameter poutput("output", "Multiple", "lt-square", "lt", "", "", "",false,false); parameters.push_back(poutput);
                CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
-               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+        CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
+        CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
                
                vector<string> myArray;
@@ -35,9 +38,11 @@ string MatrixOutputCommand::getHelpString(){
        try {
                string helpString = "";
                ValidCalculators validCalculator;
-               helpString += "The dist.shared command parameters are shared, groups, calc, output, processors and label.  shared is a required, unless you have a valid current file.\n";
+               helpString += "The dist.shared command parameters are shared, groups, calc, output, processors, subsample, iters and label.  shared is a required, unless you have a valid current file.\n";
                helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included used.\n";
                helpString += "The group names are separated by dashes. The label parameter allows you to select what distance levels you would like distance matrices created for, and is also separated by dashes.\n";
+        helpString += "The iters parameter allows you to choose the number of times you would like to run the subsample.\n";
+        helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group.\n";
                helpString += "The dist.shared command should be in the following format: dist.shared(groups=yourGroups, calc=yourCalcs, label=yourLabels).\n";
                helpString += "The output parameter allows you to specify format of your distance matrix. Options are lt, and square. The default is lt.\n";
                helpString += "Example dist.shared(groups=A-B-C, calc=jabund-sorabund).\n";
@@ -60,6 +65,7 @@ MatrixOutputCommand::MatrixOutputCommand(){
                setParameters();
                vector<string> tempOutNames;
                outputTypes["phylip"] = tempOutNames;
+        outputTypes["subsample"] = tempOutNames;
        }
        catch(exception& e) {
                m->errorOut(e, "MatrixOutputCommand", "MatrixOutputCommand");
@@ -94,6 +100,7 @@ MatrixOutputCommand::MatrixOutputCommand(string option)  {
                        //initialize outputTypes
                        vector<string> tempOutNames;
                        outputTypes["phylip"] = tempOutNames;
+            outputTypes["subsample"] = tempOutNames;
                        
                        //if the user changes the input directory command factory will send this info to us in the output parameter 
                        string inputDir = validParameter.validFile(parameters, "inputdir", false);              
@@ -158,7 +165,19 @@ MatrixOutputCommand::MatrixOutputCommand(string option)  {
                                //remove citation from list of calcs
                                for (int i = 0; i < Estimators.size(); i++) { if (Estimators[i] == "citation") {  Estimators.erase(Estimators.begin()+i); break; } }
                        }
-
+            
+            temp = validParameter.validFile(parameters, "iters", false);                       if (temp == "not found") { temp = "1000"; }
+                       m->mothurConvert(temp, iters); 
+            
+            temp = validParameter.validFile(parameters, "subsample", false);           if (temp == "not found") { temp = "F"; }
+                       if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
+            else {  
+                if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; }  //we will set it to smallest group later 
+                else { subsample = false; }
+            }
+            
+            if (subsample == false) { iters = 1; }
+            
                        if (abort == false) {
                        
                                ValidCalculators validCalculator;
@@ -288,6 +307,32 @@ int MatrixOutputCommand::execute(){
                        lines[i].start = int (sqrt(float(i)/float(processors)) * numGroups);
                        lines[i].end = int (sqrt(float(i+1)/float(processors)) * numGroups);
                }       
+        
+        if (subsample) { 
+            if (subsampleSize == -1) { //user has not set size, set size = smallest samples size
+                subsampleSize = lookup[0]->getNumSeqs();
+                for (int i = 1; i < lookup.size(); i++) {
+                    int thisSize = lookup[i]->getNumSeqs();
+                    
+                    if (thisSize < subsampleSize) {    subsampleSize = thisSize;       }
+                }
+            }else {
+                m->clearGroups();
+                Groups.clear();
+                vector<SharedRAbundVector*> temp;
+                for (int i = 0; i < lookup.size(); i++) {
+                    if (lookup[i]->getNumSeqs() < subsampleSize) { 
+                        m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine();
+                        delete lookup[i];
+                    }else { 
+                        Groups.push_back(lookup[i]->getGroup()); 
+                        temp.push_back(lookup[i]);
+                    }
+                } 
+                lookup = temp;
+                m->setGroups(Groups);
+            }
+        }
                
                if (m->control_pressed) { delete input; for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  } m->clearGroups(); return 0;  }
                                
@@ -416,167 +461,285 @@ int MatrixOutputCommand::process(vector<SharedRAbundVector*> thisLookup){
        try {
                EstOutput data;
                vector<SharedRAbundVector*> subset;
-               vector< vector<seqDist> > calcDists; calcDists.resize(matrixCalculators.size()); //one for each calc, this will be used to make .dist files
-               
+               vector< vector< vector<seqDist> > > calcDistsTotals;  //each iter, one for each calc, then each groupCombos dists. this will be used to make .dist files
+
+        vector< vector<seqDist>  > calcDists; calcDists.resize(matrixCalculators.size());              
        
-               if(processors == 1){
-                       driver(thisLookup, 0, numGroups, calcDists);
-               }else{
-                       int process = 1;
-                       vector<int> processIDS;
+        for (int thisIter = 0; thisIter < iters; thisIter++) {
             
-                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
-                       //loop through and create all the processes you want
-                       while (process != processors) {
-                               int pid = fork();
-                               
-                               if (pid > 0) {
-                                       processIDS.push_back(pid); 
-                                       process++;
-                               }else if (pid == 0){
-                                       driver(thisLookup, lines[process].start, lines[process].end, calcDists);   
-                                       
-                                       string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(getpid()) + ".dist";
-                                       ofstream outtemp;
-                                       m->openOutputFile(tempdistFileName, outtemp);
-                                               
-                                       for (int i = 0; i < calcDists.size(); i++) {
-                                               outtemp << calcDists[i].size() << endl;
-                                                       
-                                               for (int j = 0; j < calcDists[i].size(); j++) {
-                                                       outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl;
-                                               }
-                                       }
-                                       outtemp.close();
-                                                                       
-                                       exit(0);
-                               }else { 
-                                       m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
-                                       for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
-                                       exit(0);
-                               }
-                       }
-                       
-                       //parent do your part
-                       driver(thisLookup, lines[0].start, lines[0].end, calcDists);   
-                                               
-                       //force parent to wait until all the processes are done
-                       for (int i = 0; i < processIDS.size(); i++) {
-                               int temp = processIDS[i];
-                               wait(&temp);
-                       }
-                       
-                       for (int i = 0; i < processIDS.size(); i++) {
-                               string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(processIDS[i]) +  ".dist";
-                               ifstream intemp;
-                               m->openInputFile(tempdistFileName, intemp);
-                                       
-                               for (int k = 0; k < calcDists.size(); k++) {
-                                       int size = 0;
-                                       intemp >> size; m->gobble(intemp);
-                                               
-                                       for (int j = 0; j < size; j++) {
-                                               int seq1 = 0;
-                                               int seq2 = 0;
-                                               float dist = 1.0;
-                                                       
-                                               intemp >> seq1 >> seq2 >> dist;   m->gobble(intemp);
-                                                       
-                                               seqDist tempDist(seq1, seq2, dist);
-                                               calcDists[k].push_back(tempDist);
-                                       }
-                               }
-                               intemp.close();
-                               m->mothurRemove(tempdistFileName);
-                       }
-            #else
-            //////////////////////////////////////////////////////////////////////////////////////////////////////
-            //Windows version shared memory, so be careful when passing variables through the distSharedData struct. 
-            //Above fork() will clone, so memory is separate, but that's not the case with windows, 
-            //Taking advantage of shared memory to pass results vectors.
-            //////////////////////////////////////////////////////////////////////////////////////////////////////
-            
-            vector<distSharedData*> pDataArray; 
-            DWORD   dwThreadIdArray[processors-1];
-            HANDLE  hThreadArray[processors-1]; 
+            vector<SharedRAbundVector*> thisItersLookup = thisLookup;
             
-            //Create processor worker threads.
-            for( int i=1; i<processors; i++ ){
+            if (subsample) {
+                SubSample sample;
+                vector<string> tempLabels; //dont need since we arent printing the sampled sharedRabunds
+                thisItersLookup = sample.getSamplePreserve(thisLookup, tempLabels, subsampleSize);
+            }
+            cout << thisIter << endl;
+            if(processors == 1){
+                driver(thisItersLookup, 0, numGroups, calcDists);
+            }else{
+                int process = 1;
+                vector<int> processIDS;
+                
+                #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+                //loop through and create all the processes you want
+                while (process != processors) {
+                    int pid = fork();
+                    
+                    if (pid > 0) {
+                        processIDS.push_back(pid); 
+                        process++;
+                    }else if (pid == 0){
+                        
+                        driver(thisItersLookup, lines[process].start, lines[process].end, calcDists);   
+                        
+                        string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(getpid()) + ".dist";
+                        ofstream outtemp;
+                        m->openOutputFile(tempdistFileName, outtemp);
+                            
+                        for (int i = 0; i < calcDists.size(); i++) {
+                            outtemp << calcDists[i].size() << endl;
+                                
+                            for (int j = 0; j < calcDists[i].size(); j++) {
+                                outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl;
+                            }
+                        }
+                        outtemp.close();
+                                        
+                        exit(0);
+                    }else { 
+                        m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
+                        for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+                        exit(0);
+                    }
+                }
                 
-                //make copy of lookup so we don't get access violations
-                vector<SharedRAbundVector*> newLookup;
-                for (int k = 0; k < thisLookup.size(); k++) {
-                    SharedRAbundVector* temp = new SharedRAbundVector();
-                    temp->setLabel(thisLookup[k]->getLabel());
-                    temp->setGroup(thisLookup[k]->getGroup());
-                    newLookup.push_back(temp);
+                //parent do your part
+                driver(thisItersLookup, lines[0].start, lines[0].end, calcDists);   
+                            
+                //force parent to wait until all the processes are done
+                for (int i = 0; i < processIDS.size(); i++) {
+                    int temp = processIDS[i];
+                    wait(&temp);
                 }
                 
-                //for each bin
-                for (int k = 0; k < thisLookup[0]->getNumBins(); k++) {
-                    if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
-                    for (int j = 0; j < thisLookup.size(); j++) { newLookup[j]->push_back(thisLookup[j]->getAbundance(k), thisLookup[j]->getGroup()); }
+                for (int i = 0; i < processIDS.size(); i++) {
+                    string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(processIDS[i]) +  ".dist";
+                    ifstream intemp;
+                    m->openInputFile(tempdistFileName, intemp);
+                        
+                    for (int k = 0; k < calcDists.size(); k++) {
+                        int size = 0;
+                        intemp >> size; m->gobble(intemp);
+                            
+                        for (int j = 0; j < size; j++) {
+                            int seq1 = 0;
+                            int seq2 = 0;
+                            float dist = 1.0;
+                                
+                            intemp >> seq1 >> seq2 >> dist;   m->gobble(intemp);
+                                
+                            seqDist tempDist(seq1, seq2, dist);
+                            calcDists[k].push_back(tempDist);
+                        }
+                    }
+                    intemp.close();
+                    m->mothurRemove(tempdistFileName);
                 }
+                #else
+                //////////////////////////////////////////////////////////////////////////////////////////////////////
+                //Windows version shared memory, so be careful when passing variables through the distSharedData struct. 
+                //Above fork() will clone, so memory is separate, but that's not the case with windows, 
+                //Taking advantage of shared memory to pass results vectors.
+                //////////////////////////////////////////////////////////////////////////////////////////////////////
                 
-                // Allocate memory for thread data.
-                distSharedData* tempSum = new distSharedData(m, lines[i].start, lines[i].end, Estimators, newLookup);
-                pDataArray.push_back(tempSum);
-                processIDS.push_back(i);
+                vector<distSharedData*> pDataArray; 
+                DWORD   dwThreadIdArray[processors-1];
+                HANDLE  hThreadArray[processors-1]; 
                 
-                hThreadArray[i-1] = CreateThread(NULL, 0, MyDistSharedThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);   
+                //Create processor worker threads.
+                for( int i=1; i<processors; i++ ){
+                    
+                    //make copy of lookup so we don't get access violations
+                    vector<SharedRAbundVector*> newLookup;
+                    for (int k = 0; k < thisItersLookup.size(); k++) {
+                        SharedRAbundVector* temp = new SharedRAbundVector();
+                        temp->setLabel(thisItersLookup[k]->getLabel());
+                        temp->setGroup(thisItersLookup[k]->getGroup());
+                        newLookup.push_back(temp);
+                    }
+                    
+                    //for each bin
+                    for (int k = 0; k < thisItersLookup[0]->getNumBins(); k++) {
+                        if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
+                        for (int j = 0; j < thisItersLookup.size(); j++) { newLookup[j]->push_back(thisItersLookup[j]->getAbundance(k), thisItersLookup[j]->getGroup()); }
+                    }
+                    
+                    // Allocate memory for thread data.
+                    distSharedData* tempSum = new distSharedData(m, lines[i].start, lines[i].end, Estimators, newLookup);
+                    pDataArray.push_back(tempSum);
+                    processIDS.push_back(i);
+                    
+                    hThreadArray[i-1] = CreateThread(NULL, 0, MyDistSharedThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);   
+                }
+                
+                //parent do your part
+                driver(thisItersLookup, lines[0].start, lines[0].end, calcDists);   
+                           
+                //Wait until all threads have terminated.
+                WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+                
+                //Close all thread handles and free memory allocations.
+                for(int i=0; i < pDataArray.size(); i++){
+                    for (int j = 0; j < pDataArray[i]->thisLookup.size(); j++) {  delete pDataArray[i]->thisLookup[j];  } 
+                    
+                    for (int k = 0; k < calcDists.size(); k++) {
+                        int size = pDataArray[i]->calcDists[k].size();
+                        for (int j = 0; j < size; j++) {    calcDists[k].push_back(pDataArray[i]->calcDists[k][j]);    }
+                    }
+                    
+                    CloseHandle(hThreadArray[i]);
+                    delete pDataArray[i];
+                }
+
+                #endif
             }
             
-            //parent do your part
-            driver(thisLookup, lines[0].start, lines[0].end, calcDists);   
-                       
-            //Wait until all threads have terminated.
-            WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+            calcDistsTotals.push_back(calcDists);
             
-            //Close all thread handles and free memory allocations.
-            for(int i=0; i < pDataArray.size(); i++){
-                for (int j = 0; j < pDataArray[i]->thisLookup.size(); j++) {  delete pDataArray[i]->thisLookup[j];  } 
+            if (subsample) {  
+                //clean up memory
+               // for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; }
+               // thisItersLookup.clear();
+            }
+               }
+               
+        if (iters != 1) {
+            //we need to find the average distance and standard deviation for each groups distance
+            
+            vector< vector<seqDist>  > calcAverages; calcAverages.resize(matrixCalculators.size()); 
+            for (int i = 0; i < calcAverages.size(); i++) {  //initialize sums to zero.
+                calcAverages[i].resize(calcDists[i].size());
                 
-                for (int k = 0; k < calcDists.size(); k++) {
-                    int size = pDataArray[i]->calcDists[k].size();
-                    for (int j = 0; j < size; j++) {    calcDists[k].push_back(pDataArray[i]->calcDists[k][j]);    }
+                for (int j = 0; j < calcAverages[i].size(); j++) {
+                    calcAverages[i][j].seq1 = calcDists[i][j].seq1;
+                    calcAverages[i][j].seq2 = calcDists[i][j].seq2;
+                    calcAverages[i][j].dist = 0.0;
+                }
+            }
+            
+            for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator
+                for (int i = 0; i < calcAverages.size(); i++) {  //initialize sums to zero.
+                    for (int j = 0; j < calcAverages[i].size(); j++) {
+                        calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist;
+                    }
+                }
+            }
+            
+            for (int i = 0; i < calcAverages.size(); i++) {  //finds average.
+                for (int j = 0; j < calcAverages[i].size(); j++) {
+                    calcAverages[i][j].dist /= (float) iters;
                 }
+            }
+            
+            //find standard deviation
+            vector< vector<seqDist>  > stdDev; stdDev.resize(matrixCalculators.size());
+            for (int i = 0; i < stdDev.size(); i++) {  //initialize sums to zero.
+                stdDev[i].resize(calcDists[i].size());
                 
-                CloseHandle(hThreadArray[i]);
-                delete pDataArray[i];
+                for (int j = 0; j < stdDev[i].size(); j++) {
+                    stdDev[i][j].seq1 = calcDists[i][j].seq1;
+                    stdDev[i][j].seq2 = calcDists[i][j].seq2;
+                    stdDev[i][j].dist = 0.0;
+                }
+            }
+            
+            for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
+                for (int i = 0; i < stdDev.size(); i++) {  
+                    for (int j = 0; j < stdDev[i].size(); j++) {
+                        stdDev[i][j].dist += ((calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist) * (calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist));
+                    }
+                }
             }
 
-            #endif
-               }
+            for (int i = 0; i < stdDev.size(); i++) {  //finds average.
+                for (int j = 0; j < stdDev[i].size(); j++) {
+                    stdDev[i][j].dist /= (float) iters;
+                    stdDev[i][j].dist = sqrt(stdDev[i][j].dist);
+                }
+            }
+            
+            //print results
+            for (int i = 0; i < calcDists.size(); i++) {
+                vector< vector<float> > matrix; //square matrix to represent the distance
+                matrix.resize(thisLookup.size());
+                for (int k = 0; k < thisLookup.size(); k++) {  matrix[k].resize(thisLookup.size(), 0.0); }
+                
+                vector< vector<float> > stdmatrix; //square matrix to represent the stdDev
+                stdmatrix.resize(thisLookup.size());
+                for (int k = 0; k < thisLookup.size(); k++) {  stdmatrix[k].resize(thisLookup.size(), 0.0); }
+
+            
+                for (int j = 0; j < calcAverages[i].size(); j++) {
+                    int row = calcAverages[i][j].seq1;
+                    int column = calcAverages[i][j].seq2;
+                    float dist = calcAverages[i][j].dist;
+                    float stdDist = stdDev[i][j].dist;
+                    
+                    matrix[row][column] = dist;
+                    matrix[column][row] = dist;
+                    stdmatrix[row][column] = stdDist;
+                    stdmatrix[column][row] = stdDist;
+                }
+            
+                string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + ".results";
+                outputNames.push_back(distFileName); outputTypes["subsample"].push_back(distFileName);
+                ofstream outDist;
+                m->openOutputFile(distFileName, outDist);
+                outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
+                
+                outDist << "Group1\tGroup2\tAverageDist\tStdDev\n";
+                for (int m = 0; m < matrix.size(); m++)        {
+                    for (int n = 0; n < m; n++)        {
+                        outDist << lookup[m]->getGroup() << '\t' <<  lookup[n]->getGroup() << '\t';
+                        outDist << matrix[m][n] << '\t' << stdmatrix[m][n] << endl; 
+                    }
+                }
+                outDist.close();
+            }
+            
+            //output averages as distance matrix
+            calcDists = calcAverages;
+        }
+        
+        for (int i = 0; i < calcDists.size(); i++) {
+            if (m->control_pressed) { break; }
+            
+            //initialize matrix
+            vector< vector<float> > matrix; //square matrix to represent the distance
+            matrix.resize(thisLookup.size());
+            for (int k = 0; k < thisLookup.size(); k++) {  matrix[k].resize(thisLookup.size(), 0.0); }
+            
+            for (int j = 0; j < calcDists[i].size(); j++) {
+                int row = calcDists[i][j].seq1;
+                int column = calcDists[i][j].seq2;
+                float dist = calcDists[i][j].dist;
+                
+                matrix[row][column] = dist;
+                matrix[column][row] = dist;
+            }
+            
+            string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + "." + output + ".dist";
+            outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
+            ofstream outDist;
+            m->openOutputFile(distFileName, outDist);
+            outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
+            
+            printSims(outDist, matrix);
+            
+            outDist.close();
+        }
 
-               
-               
-               for (int i = 0; i < calcDists.size(); i++) {
-                       if (m->control_pressed) { break; }
-                               
-                       //initialize matrix
-                       vector< vector<float> > matrix; //square matrix to represent the distance
-                       matrix.resize(thisLookup.size());
-                       for (int k = 0; k < thisLookup.size(); k++) {  matrix[k].resize(thisLookup.size(), 0.0); }
-                               
-                       for (int j = 0; j < calcDists[i].size(); j++) {
-                               int row = calcDists[i][j].seq1;
-                               int column = calcDists[i][j].seq2;
-                               float dist = calcDists[i][j].dist;
-                                       
-                               matrix[row][column] = dist;
-                               matrix[column][row] = dist;
-                       }
-                       
-                       string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + "." + output + ".dist";
-                       outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
-                       ofstream outDist;
-                       m->openOutputFile(distFileName, outDist);
-                       outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
-                       
-                       printSims(outDist, matrix);
-                       
-                       outDist.close();
-               }
                
                return 0;
        }
index dc77bdf067ff194dce8bd2e5fcb8f078a9450dcc..f915dfcda8d7ab88040fcb1e8e2f4bbabf833590 100644 (file)
@@ -96,10 +96,10 @@ private:
        InputData* input;
        vector<SharedRAbundVector*> lookup;
        string exportFileName, output, sharedfile;
-       int numGroups, processors;
+       int numGroups, processors, iters, subsampleSize;
        ofstream out;
 
-       bool abort, allLines;
+       bool abort, allLines, subsample;
        set<string> labels; //holds labels to be used
        string outputFile, calc, groups, label, outputDir;
        vector<string>  Estimators, Groups, outputNames; //holds estimators to be used
index 4df5f96eb6086e8d2e8e96e3f448948540a1c8b4..98f5ce09608855690a9fc7442814841a064db379 100644 (file)
@@ -621,7 +621,7 @@ string MothurOut::hasPath(string longName){
 
 string MothurOut::getExtension(string longName){
        try {
-               string extension = longName;
+               string extension = "";
                
                if(longName.find_last_of('.') != longName.npos){
                        int pos = longName.find_last_of('.');
diff --git a/nseqs.h b/nseqs.h
index e82684b645041a10095c681061d4b16fa6929a9f..c0f9549c87584bbf99e503388ff737c9f5343e8e 100644 (file)
--- a/nseqs.h
+++ b/nseqs.h
@@ -31,7 +31,7 @@ public:
                int numGroups = shared.size();
                data.clear(); data.resize(numGroups,0);
 
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //get bin values and set sharedByAll 
                        bool sharedByAll = true;
                        for (int j = 0; j < numGroups; j++) {
index dba1e3b56951f8ae1c484ace22187a9b65053d35..a9ef6cb0d85f44bde915606eb452db993ec40d8d 100644 (file)
@@ -128,8 +128,6 @@ PhyloTree::PhyloTree(string tfile){
                maxLevel = 0;
                calcTotals = true;
                string name, tax;
-               addSeqToTree("unknown", "unknown;");
-
                
                #ifdef USE_MPI
                        int pid, num, processors;
@@ -193,7 +191,16 @@ PhyloTree::PhyloTree(string tfile){
                #endif
        
                assignHeirarchyIDs(0);
-       
+        
+        
+        string unknownTax = "unknown;";
+        //added last taxon until you get desired level
+               for (int i = 1; i < maxLevel; i++) {
+                       unknownTax += "unclassfied;";
+               }
+        
+        addSeqToTree("unknown", unknownTax);
+        
                //create file for summary if needed
                setUp(tfile);
        }
index c8ba4fa2347735dc27640321c286384960ddb134..2a06380b6896ae6c908f38222abdcc566bcae077 100644 (file)
@@ -32,7 +32,7 @@ EstOutput SharedAce::getValues(vector<SharedRAbundVector*> shared) {
                S12 = number of shared OTUs in A and B
                This estimator was changed to reflect Caldwell's changes, eliminating the nrare / nrare - 1 */
 
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //store in temps to avoid multiple repetitive function calls
                        tempA = shared[0]->getAbundance(i);
                        tempB = shared[1]->getAbundance(i);
index 10dfbd544bedeafaa3a82c282174480e9b55d005..cbb9d3003ed84631e5251f9baf6738a165c83c7e 100644 (file)
@@ -21,7 +21,7 @@ EstOutput Anderberg::getValues(vector<SharedRAbundVector*> shared) {
 
                data.resize(1,0);
                
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //store in temps to avoid multiple repetitive function calls
                        tempA = shared[0]->getAbundance(i);
                        tempB = shared[1]->getAbundance(i);
index 0182e13bd34ea9164383b5f83c221ae705898677..3711ce77cbb312e5dc2ee2aba14e1856bb942d12 100644 (file)
@@ -24,7 +24,7 @@ EstOutput BrayCurtis::getValues(vector<SharedRAbundVector*> shared) {
                sumSharedAB = the sum of the minimum otus int all shared otus in AB.
                */
                
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //store in temps to avoid multiple repetitive function calls
                        tempA = shared[0]->getAbundance(i);
                        tempB = shared[1]->getAbundance(i);
index d8cf60bf0cfe67097bcdadc8025ffc5829af6354..8d47ad2fca1d307bd150a985224b1099e492626a 100644 (file)
@@ -29,7 +29,7 @@ EstOutput SharedChao1::getValues(vector<SharedRAbundVector*> shared){
                //create and initialize trees to 0.
                initialTree(numGroups);
                
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //get bin values and calc shared 
                        bool sharedByAll = true;
                        temp.clear();
index ac3e94eaffaa8ac339935a97267c56b0d4e832db..ed21335eddc1ab2360d74b761ea9eff842c6791a 100644 (file)
@@ -21,7 +21,7 @@ EstOutput Jclass::getValues(vector<SharedRAbundVector*> shared) {
 
                data.resize(1,0);
                
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //store in temps to avoid multiple repetitive function calls
                        tempA = shared[0]->getAbundance(i);
                        tempB = shared[1]->getAbundance(i);
index 6aad5df26f2a6912b950df9e8d8211e730ef0fe1..5c91ddb65c654c5788c748596e87ddda94ae25f1 100644 (file)
@@ -21,7 +21,7 @@ EstOutput Kulczynski::getValues(vector<SharedRAbundVector*> shared) {
 
                data.resize(1,0);
                
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //store in temps to avoid multiple repetitive function calls
                        tempA = shared[0]->getAbundance(i);
                        tempB = shared[1]->getAbundance(i);
index de90252769a3925c96a463a62c63a1349c3ef710..8c8b7f7f4e181760b5bc9c7adb8117ab535b5c3d 100644 (file)
@@ -21,7 +21,7 @@ EstOutput KulczynskiCody::getValues(vector<SharedRAbundVector*> shared) {
 
                data.resize(1,0);
                
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //store in temps to avoid multiple repetitive function calls
                        tempA = shared[0]->getAbundance(i);
                        tempB = shared[1]->getAbundance(i);
index f3ed5ece3850f6528ecb56cc99a17765b7766ff7..52192756e00cbb5487e1baa9988c1ba0c709ed5a 100644 (file)
@@ -21,7 +21,7 @@ EstOutput Lennon::getValues(vector<SharedRAbundVector*> shared) {
 
                data.resize(1,0);
                
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //store in temps to avoid multiple repetitive function calls
                        tempA = shared[0]->getAbundance(i);
                        tempB = shared[1]->getAbundance(i);
index bd4923f32ddf4337ce6f8570fd57fcd199ccaf6f..16a759089f079e426d302745e34575f55640f30c 100644 (file)
@@ -20,14 +20,14 @@ EstOutput MorHorn::getValues(vector<SharedRAbundVector*> shared) {
                morhorn = 0.0; sumSharedA = 0.0; sumSharedB = 0.0; a = 0.0; b = 0.0; d = 0.0;
                
                //get the total values we need to calculate the theta denominator sums
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //store in temps to avoid multiple repetitive function calls
                        Atotal += shared[0]->getAbundance(i);
                        Btotal += shared[1]->getAbundance(i);
                }
                
                //calculate the denominator sums
-               for (int j = 0; j < shared[0]->size(); j++) {
+               for (int j = 0; j < shared[0]->getNumBins(); j++) {
                        //store in temps to avoid multiple repetitive function calls
                        tempA = shared[0]->getAbundance(j);
                        tempB = shared[1]->getAbundance(j);
index 004e535bdb37d4247ec9f9d8aa0e4ad96e4c4ccf..b49fa4a3cc5730b0a97b0eefe30f952a7671f932 100644 (file)
@@ -21,7 +21,7 @@ EstOutput Ochiai::getValues(vector<SharedRAbundVector*> shared) {
 
                data.resize(1,0);
                
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //store in temps to avoid multiple repetitive function calls
                        tempA = shared[0]->getAbundance(i);
                        tempB = shared[1]->getAbundance(i);
index 36cacd6935727ba1763b436e25f334ab2c1862bc..f9b5eddcd919acdd409c9b0792294034dbcb7391 100644 (file)
@@ -19,7 +19,7 @@ EstOutput SharedSobs::getValues(vector<SharedRAbundVector*> shared){
                double observed = 0;
 
                //loop through the species in each group
-               for (int k = 0; k < shared[0]->size(); k++) {
+               for (int k = 0; k < shared[0]->getNumBins(); k++) {
                        //if you have found a new species
                        if (shared[0]->getAbundance(k) != 0) { observed++; } 
                        else if ((shared[0]->getAbundance(k) == 0) && (shared[1]->getAbundance(k) != 0)) { observed++; }
index e2e169c58788dfcd7e9aa313b7105a446bef7867..fffed0290b1136da46e42d31c99542da19ec9a0a 100644 (file)
@@ -19,7 +19,7 @@ EstOutput SharedSobsCS::getValues(vector<SharedRAbundVector*> shared){
                double observed = 0;
                int numGroups = shared.size();
 
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //get bin values and set sharedByAll 
                        bool sharedByAll = true;
                        for (int j = 0; j < numGroups; j++) {
index 32728f575a52f94c9d7b8b287fd14a7147ad53e9..85609dad69311e08474a2115755d74761052ab13 100644 (file)
@@ -21,7 +21,7 @@ EstOutput SorClass::getValues(vector<SharedRAbundVector*> shared) {
 
                data.resize(1,0);
                
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //store in temps to avoid multiple repetitive function calls
                        tempA = shared[0]->getAbundance(i);
                        tempB = shared[1]->getAbundance(i);
index 7c42a18d662f9b8d976a39515a47c4e79d509e22..644adee677820f92a7c8cf87c41d65908a1e957b 100644 (file)
@@ -20,14 +20,14 @@ EstOutput ThetaN::getValues(vector<SharedRAbundVector*> shared) {
                numerator = 0.0; denominator = 0.0; thetaN = 0.0; sumSharedA = 0.0; sumSharedB = 0.0; a = 0.0; b = 0.0; d = 0.0;
                
                //get the total values we need to calculate the theta denominator sums
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //store in temps to avoid multiple repetitive function calls
                        Atotal += shared[0]->getAbundance(i);
                        Btotal += shared[1]->getAbundance(i);
                }
                
                //calculate the theta denominator sums
-               for (int j = 0; j < shared[0]->size(); j++) {
+               for (int j = 0; j < shared[0]->getNumBins(); j++) {
                        //store in temps to avoid multiple repetitive function calls
                        tempA = shared[0]->getAbundance(j);
                        tempB = shared[1]->getAbundance(j);
index 315a61f1310f8381e7fab74546f059d1823f11f7..6c0f6c7f91bf46bf314c7e16cfcf6997bed4f388 100644 (file)
@@ -29,14 +29,14 @@ EstOutput ThetaYC::getValues(vector<SharedRAbundVector*> shared) {
                double sumPsqQ = 0;
                
                //get the total values we need to calculate the theta denominator sums
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //store in temps to avoid multiple repetitive function calls
                        Atotal += (double)shared[0]->getAbundance(i);
                        Btotal += (double)shared[1]->getAbundance(i);
                }
                
                //calculate the theta denominator sums
-               for (int j = 0; j < shared[0]->size(); j++) {
+               for (int j = 0; j < shared[0]->getNumBins(); j++) {
                        //store in temps to avoid multiple repetitive function calls
                        pi = shared[0]->getAbundance(j) / Atotal;
                        qi = shared[1]->getAbundance(j) / Btotal;
diff --git a/subsample.cpp b/subsample.cpp
new file mode 100644 (file)
index 0000000..d5b4e3e
--- /dev/null
@@ -0,0 +1,192 @@
+//
+//  subsample.cpp
+//  Mothur
+//
+//  Created by Sarah Westcott on 4/2/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "subsample.h"
+
+//**********************************************************************************************************************
+vector<SharedRAbundVector*> SubSample::getSamplePreserve(vector<SharedRAbundVector*>& thislookup, vector<string>& newLabels, int size) {
+       try {
+               
+        vector<SharedRAbundVector*> newlookup; newlookup.resize(thislookup.size(), NULL); 
+        
+               //save mothurOut's binLabels to restore for next label
+               vector<string> saveBinLabels = m->currentBinLabels;
+               
+               int numBins = thislookup[0]->getNumBins();
+               for (int i = 0; i < thislookup.size(); i++) {           
+                       int thisSize = thislookup[i]->getNumSeqs();
+                       
+                       if (thisSize != size) {
+                               
+                               string thisgroup = thislookup[i]->getGroup();
+                               
+                               OrderVector order;
+                               for(int p=0;p<numBins;p++){
+                                       for(int j=0;j<thislookup[i]->getAbundance(p);j++){
+                                               order.push_back(p);
+                                       }
+                               }
+                               random_shuffle(order.begin(), order.end());
+                               
+                               SharedRAbundVector* temp = new SharedRAbundVector(numBins);
+                               temp->setLabel(thislookup[i]->getLabel());
+                               temp->setGroup(thislookup[i]->getGroup());
+                               
+                               newlookup[i] = temp;
+                               
+                               for (int j = 0; j < size; j++) {
+                                       
+                                       if (m->control_pressed) {  return newlookup; }
+                                       
+                                       int bin = order.get(j);
+                                       
+                                       int abund = newlookup[i]->getAbundance(bin);
+                                       newlookup[i]->set(bin, (abund+1), thisgroup);
+                               }       
+                       }
+               }
+               
+               //subsampling may have created some otus with no sequences in them
+               eliminateZeroOTUS(newlookup);
+               
+               if (m->control_pressed) { return newlookup; }
+               
+               //save mothurOut's binLabels to restore for next label
+        newLabels = m->currentBinLabels;
+               m->currentBinLabels = saveBinLabels;
+               
+               return newlookup;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SubSample", "getSamplePreserve");
+               exit(1);
+       }
+}      
+//**********************************************************************************************************************
+vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int size) {
+       try {
+               
+               //save mothurOut's binLabels to restore for next label
+               vector<string> saveBinLabels = m->currentBinLabels;
+               
+               int numBins = thislookup[0]->getNumBins();
+               for (int i = 0; i < thislookup.size(); i++) {           
+                       int thisSize = thislookup[i]->getNumSeqs();
+                       
+                       if (thisSize != size) {
+                               
+                               string thisgroup = thislookup[i]->getGroup();
+                               
+                               OrderVector order;
+                               for(int p=0;p<numBins;p++){
+                                       for(int j=0;j<thislookup[i]->getAbundance(p);j++){
+                                               order.push_back(p);
+                                       }
+                               }
+                               random_shuffle(order.begin(), order.end());
+                               
+                               SharedRAbundVector* temp = new SharedRAbundVector(numBins);
+                               temp->setLabel(thislookup[i]->getLabel());
+                               temp->setGroup(thislookup[i]->getGroup());
+                               
+                               delete thislookup[i];
+                               thislookup[i] = temp;
+                               
+                               
+                               for (int j = 0; j < size; j++) {
+                                       
+                                       if (m->control_pressed) {  return m->currentBinLabels; }
+                                       
+                                       int bin = order.get(j);
+                                       
+                                       int abund = thislookup[i]->getAbundance(bin);
+                                       thislookup[i]->set(bin, (abund+1), thisgroup);
+                               }       
+                       }
+               }
+               
+               //subsampling may have created some otus with no sequences in them
+               eliminateZeroOTUS(thislookup);
+               
+               if (m->control_pressed) { return m->currentBinLabels; }
+               
+               //save mothurOut's binLabels to restore for next label
+        vector<string> subsampleBinLabels = m->currentBinLabels;
+               m->currentBinLabels = saveBinLabels;
+               
+               return subsampleBinLabels;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SubSample", "getSample");
+               exit(1);
+       }
+}      
+//**********************************************************************************************************************
+int SubSample::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
+       try {
+               
+               vector<SharedRAbundVector*> newLookup;
+               for (int i = 0; i < thislookup.size(); i++) {
+                       SharedRAbundVector* temp = new SharedRAbundVector();
+                       temp->setLabel(thislookup[i]->getLabel());
+                       temp->setGroup(thislookup[i]->getGroup());
+                       newLookup.push_back(temp);
+               }
+               
+               //for each bin
+               vector<string> newBinLabels;
+               string snumBins = toString(thislookup[0]->getNumBins());
+               for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
+                       if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
+                       
+                       //look at each sharedRabund and make sure they are not all zero
+                       bool allZero = true;
+                       for (int j = 0; j < thislookup.size(); j++) {
+                               if (thislookup[j]->getAbundance(i) != 0) { allZero = false;  break;  }
+                       }
+                       
+                       //if they are not all zero add this bin
+                       if (!allZero) {
+                               for (int j = 0; j < thislookup.size(); j++) {
+                                       newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
+                               }
+                               //if there is a bin label use it otherwise make one
+                               string binLabel = "Otu";
+                               string sbinNumber = toString(i+1);
+                               if (sbinNumber.length() < snumBins.length()) { 
+                                       int diff = snumBins.length() - sbinNumber.length();
+                                       for (int h = 0; h < diff; h++) { binLabel += "0"; }
+                               }
+                               binLabel += sbinNumber; 
+                               if (i < m->currentBinLabels.size()) {  binLabel = m->currentBinLabels[i]; }
+                               
+                               newBinLabels.push_back(binLabel);
+                       }
+               }
+               
+               for (int j = 0; j < thislookup.size(); j++) {  delete thislookup[j];  }
+               thislookup.clear();
+               
+               thislookup = newLookup;
+               m->currentBinLabels = newBinLabels;
+               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SubSample", "eliminateZeroOTUS");
+               exit(1);
+       }
+}
+
+
+//**********************************************************************************************************************
+
+
diff --git a/subsample.h b/subsample.h
new file mode 100644 (file)
index 0000000..9156e09
--- /dev/null
@@ -0,0 +1,35 @@
+#ifndef Mothur_subsample_h
+#define Mothur_subsample_h
+
+//
+//  subsample.h
+//  Mothur
+//
+//  Created by Sarah Westcott on 4/2/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "mothurout.h"
+#include "sharedrabundvector.h"
+
+//subsampling overwrites the sharedRabunds.  If you need to reuse the original use the getSamplePreserve function.
+
+class SubSample {
+       
+    public:
+    
+        SubSample() { m = MothurOut::getInstance(); }
+        ~SubSample() {}
+    
+        vector<string> getSample(vector<SharedRAbundVector*>&, int); //returns the bin labels for the subsample, mothurOuts binlabels are preserved so you can run this multiple times.
+    
+        vector<SharedRAbundVector*> getSamplePreserve(vector<SharedRAbundVector*>&, vector<string>&, int);
+    
+    private:
+    
+        MothurOut* m;
+        int eliminateZeroOTUS(vector<SharedRAbundVector*>&);
+
+};
+
+#endif
index d4e2c752096318748cafa69615a24627f8270aa8..717b1d3231c20368a2d23e9b86d52a6707f03b12 100644 (file)
@@ -10,6 +10,7 @@
 #include "subsamplecommand.h"
 #include "sharedutilities.h"
 #include "deconvolutecommand.h"
+#include "subsample.h"
 
 //**********************************************************************************************************************
 vector<string> SubSampleCommand::setParameters(){      
@@ -801,68 +802,28 @@ int SubSampleCommand::processShared(vector<SharedRAbundVector*>& thislookup) {
                string thisOutputDir = outputDir;
                if (outputDir == "") {  thisOutputDir += m->hasPath(sharedfile);  }
                string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + thislookup[0]->getLabel() + ".subsample" + m->getExtension(sharedfile);
-               
-               
-               ofstream out;
+        
+        SubSample sample;
+        vector<string> subsampledLabels = sample.getSample(thislookup, size);
+        
+        if (m->control_pressed) {  return 0; }
+        
+        ofstream out;
                m->openOutputFile(outputFileName, out);
                outputTypes["shared"].push_back(outputFileName);  outputNames.push_back(outputFileName);
                
-               int numBins = thislookup[0]->getNumBins();
-               for (int i = 0; i < thislookup.size(); i++) {           
-                       int thisSize = thislookup[i]->getNumSeqs();
-                       
-                       if (thisSize != size) {
-                               
-                               string thisgroup = thislookup[i]->getGroup();
-                               
-                               OrderVector* order = new OrderVector();
-                               for(int p=0;p<numBins;p++){
-                                       for(int j=0;j<thislookup[i]->getAbundance(p);j++){
-                                               order->push_back(p);
-                                       }
-                               }
-                               random_shuffle(order->begin(), order->end());
-                               
-                               SharedRAbundVector* temp = new SharedRAbundVector(numBins);
-                               temp->setLabel(thislookup[i]->getLabel());
-                               temp->setGroup(thislookup[i]->getGroup());
-                               
-                               delete thislookup[i];
-                               thislookup[i] = temp;
-                               
-                               
-                               for (int j = 0; j < size; j++) {
-                                       
-                                       if (m->control_pressed) { delete order; out.close(); return 0; }
-                                       
-                                       //get random number to sample from order between 0 and thisSize-1.
-                                       //don't need this because of the random shuffle above
-                                       //int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
-                                       
-                                       int bin = order->get(j);
-                                       
-                                       int abund = thislookup[i]->getAbundance(bin);
-                                       thislookup[i]->set(bin, (abund+1), thisgroup);
-                               }       
-                               delete order;
-                       }
-               }
-               
-               //subsampling may have created some otus with no sequences in them
-               eliminateZeroOTUS(thislookup);
-               
-               if (m->control_pressed) { out.close(); return 0; }
-               
+        m->currentBinLabels = subsampledLabels;
+        
                thislookup[0]->printHeaders(out);
                
                for (int i = 0; i < thislookup.size(); i++) {
                        out << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << '\t';
                        thislookup[i]->print(out);
                }
-               
                out.close();
-               
-               //save mothurOut's binLabels to restore for next label
+        
+        
+        //save mothurOut's binLabels to restore for next label
                m->currentBinLabels = saveBinLabels;
                
                return 0;
@@ -1523,64 +1484,6 @@ int SubSampleCommand::processSabund(SAbundVector*& sabund, ofstream& out) {
        }
 }                      
 //**********************************************************************************************************************
-int SubSampleCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
-       try {
-               
-               vector<SharedRAbundVector*> newLookup;
-               for (int i = 0; i < thislookup.size(); i++) {
-                       SharedRAbundVector* temp = new SharedRAbundVector();
-                       temp->setLabel(thislookup[i]->getLabel());
-                       temp->setGroup(thislookup[i]->getGroup());
-                       newLookup.push_back(temp);
-               }
-               
-               //for each bin
-               vector<string> newBinLabels;
-               string snumBins = toString(thislookup[0]->getNumBins());
-               for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
-                       if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
-                       
-                       //look at each sharedRabund and make sure they are not all zero
-                       bool allZero = true;
-                       for (int j = 0; j < thislookup.size(); j++) {
-                               if (thislookup[j]->getAbundance(i) != 0) { allZero = false;  break;  }
-                       }
-                       
-                       //if they are not all zero add this bin
-                       if (!allZero) {
-                               for (int j = 0; j < thislookup.size(); j++) {
-                                       newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
-                               }
-                               //if there is a bin label use it otherwise make one
-                               string binLabel = "Otu";
-                               string sbinNumber = toString(i+1);
-                               if (sbinNumber.length() < snumBins.length()) { 
-                                       int diff = snumBins.length() - sbinNumber.length();
-                                       for (int h = 0; h < diff; h++) { binLabel += "0"; }
-                               }
-                               binLabel += sbinNumber; 
-                               if (i < m->currentBinLabels.size()) {  binLabel = m->currentBinLabels[i]; }
-                               
-                               newBinLabels.push_back(binLabel);
-                       }
-               }
-               
-               for (int j = 0; j < thislookup.size(); j++) {  delete thislookup[j];  }
-               thislookup.clear();
-               
-               thislookup = newLookup;
-               m->currentBinLabels = newBinLabels;
-               
-               return 0;
-               
-       }
-       catch(exception& e) {
-               m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS");
-               exit(1);
-       }
-}
-
-//**********************************************************************************************************************
 
 
 
index 4be357059793f3c06d574ac227b4cc8049383479..7235a7b68ec569412bb97d1b98f076ff3b439bdb 100644 (file)
@@ -45,7 +45,6 @@ private:
        vector<string> names;
        map<string, vector<string> > nameMap;
        
-       int eliminateZeroOTUS(vector<SharedRAbundVector*>&);
        int getSubSampleShared();
        int getSubSampleList();
        int getSubSampleRabund();
index 8aa166610a3b7d3a38bfd882db6206d6982af4de..f0ca81e99748534f3eff5b28d401e04ec8b116b6 100644 (file)
--- a/uvest.cpp
+++ b/uvest.cpp
@@ -29,7 +29,7 @@ EstOutput UVEst::getUVest(vector<SharedRAbundVector*> shared) {
                sumSharedA1 = the sum of all shared otus in A where B = 1
                sumSharedB1 = the sum of all shared otus in B where A = 1 */
                
-               for (int i = 0; i < shared[0]->size(); i++) {
+               for (int i = 0; i < shared[0]->getNumBins(); i++) {
                        //store in temps to avoid multiple repetitive function calls
                        tempA = shared[0]->getAbundance(i);
                        tempB = shared[1]->getAbundance(i);