git.donarmstrong.com Git - mothur.git/commitdiff
fixed bug with dist.shared subsampling. added mode parameter to dist.shared so...
author Sarah Westcott <mothur.westcott@gmail.com>
Mon, 11 Jun 2012 16:13:55 +0000 (12:13 -0400)
committer Sarah Westcott <mothur.westcott@gmail.com>
Mon, 11 Jun 2012 16:13:55 +0000 (12:13 -0400)
chimerauchimecommand.cpp
matrixoutputcommand.cpp
matrixoutputcommand.h
sequenceparser.cpp
shhhercommand.cpp
subsample.cpp
summarysharedcommand.cpp

index be0421a73097bc47aa98934ba4ae59a320e02010..b1d996b0f8d971c0fca0124b29031f2c88e5fecb 100644 (file)
@@ -994,7 +994,8 @@ int ChimeraUchimeCommand::driverGroups(SequenceParser& parser, string outputFNam
                        if (m->control_pressed) { return 0; }
                        
                        //remove file made for uchime
-                       m->mothurRemove(filename);
+                       if (!m->debug) {  m->mothurRemove(filename);  }
+            else { m->mothurOut("[DEBUG]: saving file: " + filename + ".\n"); }
                        
                        //append files
                        m->appendFiles((outputFName+groups[i]), outputFName); m->mothurRemove((outputFName+groups[i]));
index 05cd18a720e567e3e7c1852b436b68b1a38814f0..e1c8a984d7539dcc36379bc18df72e7f6002aeac 100644 (file)
@@ -19,6 +19,7 @@ vector<string> MatrixOutputCommand::setParameters(){
                CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
                CommandParameter pcalc("calc", "Multiple", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-whittaker-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-hamming-structchi2-gower-memchi2-memchord-memeuclidean-mempearson", "jclass-thetayc", "", "", "",true,false); parameters.push_back(pcalc);
                CommandParameter poutput("output", "Multiple", "lt-square", "lt", "", "", "",false,false); parameters.push_back(poutput);
+        CommandParameter pmode("mode", "Multiple", "average-median", "average", "", "", "",false,false); parameters.push_back(pmode);
                CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
         CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
@@ -38,13 +39,14 @@ string MatrixOutputCommand::getHelpString(){
        try {
                string helpString = "";
                ValidCalculators validCalculator;
-               helpString += "The dist.shared command parameters are shared, groups, calc, output, processors, subsample, iters and label.  shared is a required, unless you have a valid current file.\n";
+               helpString += "The dist.shared command parameters are shared, groups, calc, output, processors, subsample, iters, mode, and label.  shared is a required, unless you have a valid current file.\n";
                helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included used.\n";
                helpString += "The group names are separated by dashes. The label parameter allows you to select what distance levels you would like distance matrices created for, and is also separated by dashes.\n";
         helpString += "The iters parameter allows you to choose the number of times you would like to run the subsample.\n";
         helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group.\n";
                helpString += "The dist.shared command should be in the following format: dist.shared(groups=yourGroups, calc=yourCalcs, label=yourLabels).\n";
                helpString += "The output parameter allows you to specify format of your distance matrix. Options are lt, and square. The default is lt.\n";
+        helpString += "The mode parameter allows you to specify if you want the average or the median values reported when subsampling. Options are average, and median. The default is average.\n";
                helpString += "Example dist.shared(groups=A-B-C, calc=jabund-sorabund).\n";
                helpString += "The default value for groups is all the groups in your groupfile.\n";
                helpString += "The default value for calc is jclass and thetayc.\n";
@@ -140,6 +142,9 @@ MatrixOutputCommand::MatrixOutputCommand(string option)  {
                        
                        output = validParameter.validFile(parameters, "output", false);         if(output == "not found"){      output = "lt"; }
                        if ((output != "lt") && (output != "square")) { m->mothurOut(output + " is not a valid output form. Options are lt and square. I will use lt."); m->mothurOutEndLine(); output = "lt"; }
+            
+            mode = validParameter.validFile(parameters, "mode", false);                if(mode == "not found"){        mode = "average"; }
+                       if ((mode != "average") && (mode != "median")) { m->mothurOut(mode + " is not a valid mode. Options are average and medina. I will use average."); m->mothurOutEndLine(); output = "average"; }
                        
                        groups = validParameter.validFile(parameters, "groups", false);                 
                        if (groups == "not found") { groups = ""; }
@@ -620,11 +625,16 @@ int MatrixOutputCommand::process(vector<SharedRAbundVector*> thisLookup){
             }
             
             if (subsample && (thisIter != 0)) {  
+                if((thisIter) % 100 == 0){     m->mothurOut(toString(thisIter)); m->mothurOutEndLine();                }
                 calcDistsTotals.push_back(calcDists);
+                for (int i = 0; i < calcDists.size(); i++) {
+                    for (int j = 0; j < calcDists[i].size(); j++) {
+                        if (m->debug) {  m->mothurOut("[DEBUG]: Results: iter = " + toString(thisIter) + ", " + thisLookup[calcDists[i][j].seq1]->getGroup() + " - " + thisLookup[calcDists[i][j].seq2]->getGroup() + " distance = " + toString(calcDists[i][j].dist) + ".\n");  }
+                    } 
+                }
                 //clean up memory
                 for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; }
                 thisItersLookup.clear();
-                for (int i = 0; i < calcDists.size(); i++) {  calcDists[i].clear(); }
             }else { //print results for whole dataset
                 for (int i = 0; i < calcDists.size(); i++) {
                     if (m->control_pressed) { break; }
@@ -654,6 +664,7 @@ int MatrixOutputCommand::process(vector<SharedRAbundVector*> thisLookup){
                     outDist.close();
                 }
             }
+            for (int i = 0; i < calcDists.size(); i++) {  calcDists[i].clear(); }
                }
                
         if (iters != 1) {
@@ -664,34 +675,46 @@ int MatrixOutputCommand::process(vector<SharedRAbundVector*> thisLookup){
                 calcAverages[i].resize(calcDistsTotals[0][i].size());
                 
                 for (int j = 0; j < calcAverages[i].size(); j++) {
-                    calcAverages[i][j].seq1 = calcDists[i][j].seq1;
-                    calcAverages[i][j].seq2 = calcDists[i][j].seq2;
+                    calcAverages[i][j].seq1 = calcDistsTotals[0][i][j].seq1;
+                    calcAverages[i][j].seq2 = calcDistsTotals[0][i][j].seq2;
                     calcAverages[i][j].dist = 0.0;
                 }
             }
-            
-            for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator
-                for (int i = 0; i < calcAverages.size(); i++) {  //initialize sums to zero.
+            if (mode == "average") {
+                for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator
+                    for (int i = 0; i < calcAverages.size(); i++) {  //initialize sums to zero.
+                        for (int j = 0; j < calcAverages[i].size(); j++) {
+                            calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist;
+                            if (m->debug) {  m->mothurOut("[DEBUG]: Totaling for average calc: iter = " + toString(thisIter) + ", " + thisLookup[calcDistsTotals[thisIter][i][j].seq1]->getGroup() + " - " + thisLookup[calcDistsTotals[thisIter][i][j].seq2]->getGroup() + " distance = " + toString(calcDistsTotals[thisIter][i][j].dist) + ". New total = " + toString(calcAverages[i][j].dist) + ".\n");  }
+                        }
+                    }
+                }
+                
+                for (int i = 0; i < calcAverages.size(); i++) {  //finds average.
                     for (int j = 0; j < calcAverages[i].size(); j++) {
-                        calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist;
+                        calcAverages[i][j].dist /= (float) iters;
                     }
                 }
-            }
-            
-            for (int i = 0; i < calcAverages.size(); i++) {  //finds average.
-                for (int j = 0; j < calcAverages[i].size(); j++) {
-                    calcAverages[i][j].dist /= (float) iters;
+            }else { //find median
+                for (int i = 0; i < calcAverages.size(); i++) { //for each calc
+                    for (int j = 0; j < calcAverages[i].size(); j++) {  //for each comparison
+                        vector<double> dists;
+                        for (int thisIter = 0; thisIter < iters; thisIter++) { //for each subsample
+                            dists.push_back(calcDistsTotals[thisIter][i][j].dist);
+                        }
+                        sort(dists.begin(), dists.end());
+                        calcAverages[i][j].dist = dists[(iters/2)];
+                    }
                 }
             }
-            
             //find standard deviation
             vector< vector<seqDist>  > stdDev; stdDev.resize(matrixCalculators.size());
             for (int i = 0; i < stdDev.size(); i++) {  //initialize sums to zero.
                 stdDev[i].resize(calcDistsTotals[0][i].size());
                 
                 for (int j = 0; j < stdDev[i].size(); j++) {
-                    stdDev[i][j].seq1 = calcDists[i][j].seq1;
-                    stdDev[i][j].seq2 = calcDists[i][j].seq2;
+                    stdDev[i][j].seq1 = calcDistsTotals[0][i][j].seq1;
+                    stdDev[i][j].seq2 = calcDistsTotals[0][i][j].seq2;
                     stdDev[i][j].dist = 0.0;
                 }
             }
@@ -768,6 +791,7 @@ int MatrixOutputCommand::process(vector<SharedRAbundVector*> thisLookup){
 int MatrixOutputCommand::driver(vector<SharedRAbundVector*> thisLookup, int start, int end, vector< vector<seqDist> >& calcDists) { 
        try {
                vector<SharedRAbundVector*> subset;
+        
                for (int k = start; k < end; k++) { // pass cdd each set of groups to compare
                        
                        for (int l = 0; l < k; l++) {
index 8af539ba01ae59ed07027d63f2bd9d1859b4c217..594fe30952a19921dd3e78b330661bc490442fdc 100644 (file)
@@ -101,7 +101,7 @@ private:
 
        bool abort, allLines, subsample;
        set<string> labels; //holds labels to be used
-       string outputFile, calc, groups, label, outputDir;
+       string outputFile, calc, groups, label, outputDir, mode;
        vector<string>  Estimators, Groups, outputNames; //holds estimators to be used
        int process(vector<SharedRAbundVector*>, string, string);
        int driver(vector<SharedRAbundVector*>, int, int, vector< vector<seqDist> >&);
index fd94b246a43217cc9cbfa131bfe4cd672955e001..3eb508dd737e15d4f06cb04c202608b5d5e1fe7c 100644 (file)
@@ -37,13 +37,16 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi
                m->openInputFile(fastaFile, in);
                
                map<string, string> seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file
+        int fastaCount = 0;
                while (!in.eof()) {
                        
                        if (m->control_pressed) { break; }
                        
                        Sequence seq(in); m->gobble(in);
+            fastaCount++;
+            if (m->debug) { if((fastaCount) % 1000 == 0){      m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n.");   } }
                        
-                       if (seq.getName() != "") {
+        if (seq.getName() != "") {
                                
                                 string group = groupMap->getGroup(seq.getName());
                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
@@ -133,7 +136,7 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi
                inName.close();
                
                if (error == 1) { m->control_pressed = true; }
-               
+                       
                if (countName != (groupMap->getNumSeqs())) {
                        vector<string> groupseqsnames = groupMap->getNamesSeqs();
                        
@@ -253,6 +256,7 @@ vector<Sequence> SequenceParser::getSeqs(string g){
                        m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
                }else {
                        seqForThisGroup = it->second;
+            if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences.");  }
                }
                
                return seqForThisGroup; 
@@ -346,6 +350,7 @@ map<string, string> SequenceParser::getNameMap(string g){
                        m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
                }else {
                        nameMapForThisGroup = it->second;
+            if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " name file has " + toString(nameMapForThisGroup.size()) + " unique sequences.");  }
                }
                
                return nameMapForThisGroup; 
index 08bb017a4d025113c7317167e3eb6217fa57638d..8ae76d9836c31b0d3a37ac41d58a6aa54db6e859 100644 (file)
@@ -938,6 +938,8 @@ void ShhherCommand::initPyroCluster(){
     try{
         if (numOTUs < processors) { processors = 1; }
         
+        if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
+        
         dist.assign(numSeqs * numOTUs, 0);
         change.assign(numOTUs, 1);
         centroids.assign(numOTUs, -1);
@@ -947,6 +949,8 @@ void ShhherCommand::initPyroCluster(){
         nSeqsBreaks.assign(processors+1, 0);
         nOTUsBreaks.assign(processors+1, 0);
         
+        if (m->debug) { m->mothurOut("[DEBUG]: made it through the memory allocation.\n"); }
+        
         nSeqsBreaks[0] = 0;
         for(int i=0;i<processors;i++){
             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
@@ -2137,6 +2141,7 @@ int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFil
                 vector<int> uniqueLengths;
                 int numFlowCells;
                 
+                if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
                 
                 if (m->control_pressed) { break; }
@@ -2193,6 +2198,8 @@ int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFil
                 vector<int> nSeqsBreaks;
                 vector<int> nOTUsBreaks;
                 
+                if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
+                
                 dist.assign(numSeqs * numOTUs, 0);
                 change.assign(numOTUs, 1);
                 centroids.assign(numOTUs, -1);
@@ -2206,6 +2213,8 @@ int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFil
                 nSeqsBreaks[1] = numSeqs;
                 nOTUsBreaks[1] = numOTUs;
                 
+                if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
+                
                 if (m->control_pressed) { break; }
                 
                 double maxDelta = 0;
@@ -2336,17 +2345,21 @@ int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVecto
                thisNameMap.clear();
                
                flowFile >> numFlowCells;
+        if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
                int index = 0;//pcluster
                while(!flowFile.eof()){
                        
                        if (m->control_pressed) { break; }
                        
                        flowFile >> seqName >> currentNumFlowCells;
+            
                        thisLengths.push_back(currentNumFlowCells);
            
                        thisSeqNameVector.push_back(seqName);
                        thisNameMap[seqName] = index++;//pcluster
-
+            
+            if (m->debug) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
+            
                        for(int i=0;i<numFlowCells;i++){
                                flowFile >> intensity;
                                if(intensity > 9.99)    {       intensity = 9.99;       }
@@ -2634,6 +2647,8 @@ int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuDat
                
                listFile >> label >> numOTUs;
         
+        if (m->debug) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
+        
                otuData.assign(numSeqs, 0);
                cumNumSeqs.assign(numOTUs, 0);
                nSeqsPerOTU.assign(numOTUs, 0);
@@ -2648,6 +2663,7 @@ int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuDat
                for(int i=0;i<numOTUs;i++){
                        
                        if (m->control_pressed) { break; }
+            if (m->debug) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
             
                        listFile >> singleOTU;
                        
index f7da25fed8d72fce904d425c713fe43d92f2637c..261297df67cfc1c5a8933a72f8bba307defbef88 100644 (file)
@@ -250,7 +250,7 @@ vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int
                
                //subsampling may have created some otus with no sequences in them
                eliminateZeroOTUS(thislookup);
-               
+        
                if (m->control_pressed) { return m->currentBinLabels; }
                
                //save mothurOut's binLabels to restore for next label
index ed028045f9ff87713fc7d8a283805858eb2f4b01..6792f19f219ad99f0bb13d0eec3b84ae20e67c88 100644 (file)
@@ -744,7 +744,6 @@ int SummarySharedCommand::process(vector<SharedRAbundVector*> thisLookup, string
                 //clean up memory
                 for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; }
                 thisItersLookup.clear();
-                for (int i = 0; i < calcDists.size(); i++) {  calcDists[i].clear(); }
             }else {
                 if (createPhylip) {
                     for (int i = 0; i < calcDists.size(); i++) {
@@ -776,6 +775,7 @@ int SummarySharedCommand::process(vector<SharedRAbundVector*> thisLookup, string
                     }
                 }
             }
+            for (int i = 0; i < calcDists.size(); i++) {  calcDists[i].clear(); }
                }
 
         if (iters != 1) {