]> git.donarmstrong.com Git - mothur.git/commitdiff
added check to make sure shhh.flows child processes finish properly. added subsamplin...
authorSarah Westcott <mothur.westcott@gmail.com>
Mon, 4 Jun 2012 16:27:23 +0000 (12:27 -0400)
committerSarah Westcott <mothur.westcott@gmail.com>
Mon, 4 Jun 2012 16:27:23 +0000 (12:27 -0400)
matrixoutputcommand.cpp
shhhercommand.cpp
subsample.cpp
subsample.h
summarycommand.cpp
summarycommand.h
summarysharedcommand.cpp

index 87929a4d9f8d8c91998e6e9d9760a22c67c60df0..05cd18a720e567e3e7c1852b436b68b1a38814f0 100644 (file)
@@ -462,11 +462,11 @@ int MatrixOutputCommand::process(vector<SharedRAbundVector*> thisLookup){
                vector< vector< vector<seqDist> > > calcDistsTotals;  //each iter, one for each calc, then each groupCombos dists. this will be used to make .dist files
         vector< vector<seqDist>  > calcDists; calcDists.resize(matrixCalculators.size());              
 
-        for (int thisIter = 0; thisIter < iters; thisIter++) {
+        for (int thisIter = 0; thisIter < iters+1; thisIter++) {
             
             vector<SharedRAbundVector*> thisItersLookup = thisLookup;
             
-            if (subsample) {
+            if (subsample && (thisIter != 0)) {
                 SubSample sample;
                 vector<string> tempLabels; //dont need since we arent printing the sampled sharedRabunds
                 
@@ -619,14 +619,40 @@ int MatrixOutputCommand::process(vector<SharedRAbundVector*> thisLookup){
                 #endif
             }
             
-            calcDistsTotals.push_back(calcDists);
-            
-            if (subsample) {  
-                
+            if (subsample && (thisIter != 0)) {  
+                calcDistsTotals.push_back(calcDists);
                 //clean up memory
                 for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; }
                 thisItersLookup.clear();
                 for (int i = 0; i < calcDists.size(); i++) {  calcDists[i].clear(); }
+            }else { //print results for whole dataset
+                for (int i = 0; i < calcDists.size(); i++) {
+                    if (m->control_pressed) { break; }
+                    
+                    //initialize matrix
+                    vector< vector<double> > matrix; //square matrix to represent the distance
+                    matrix.resize(thisLookup.size());
+                    for (int k = 0; k < thisLookup.size(); k++) {  matrix[k].resize(thisLookup.size(), 0.0); }
+                    
+                    for (int j = 0; j < calcDists[i].size(); j++) {
+                        int row = calcDists[i][j].seq1;
+                        int column = calcDists[i][j].seq2;
+                        double dist = calcDists[i][j].dist;
+                        
+                        matrix[row][column] = dist;
+                        matrix[column][row] = dist;
+                    }
+                    
+                    string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + "." + output + ".dist";
+                    outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
+                    ofstream outDist;
+                    m->openOutputFile(distFileName, outDist);
+                    outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
+                    
+                    printSims(outDist, matrix);
+                    
+                    outDist.close();
+                }
             }
                }
                
@@ -729,35 +755,6 @@ int MatrixOutputCommand::process(vector<SharedRAbundVector*> thisLookup){
                 outSTD.close();
 
             }
-        }else {
-        
-            for (int i = 0; i < calcDists.size(); i++) {
-                if (m->control_pressed) { break; }
-                
-                //initialize matrix
-                vector< vector<double> > matrix; //square matrix to represent the distance
-                matrix.resize(thisLookup.size());
-                for (int k = 0; k < thisLookup.size(); k++) {  matrix[k].resize(thisLookup.size(), 0.0); }
-                
-                for (int j = 0; j < calcDists[i].size(); j++) {
-                    int row = calcDists[i][j].seq1;
-                    int column = calcDists[i][j].seq2;
-                    double dist = calcDists[i][j].dist;
-                    
-                    matrix[row][column] = dist;
-                    matrix[column][row] = dist;
-                }
-                
-                string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + "." + output + ".dist";
-                outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
-                ofstream outDist;
-                m->openOutputFile(distFileName, outDist);
-                outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
-                
-                printSims(outDist, matrix);
-                
-                outDist.close();
-            }
         }
                
                return 0;
index be410d9d9f4ecec2fc885a239c9ba57b5f409e89..08bb017a4d025113c7317167e3eb6217fa57638d 100644 (file)
@@ -1934,12 +1934,14 @@ int ShhherCommand::createProcesses(vector<string> filenames){
                
                //divide the groups between the processors
                vector<linePair> lines;
+        vector<int> numFilesToComplete;
                int numFilesPerProcessor = filenames.size() / processors;
                for (int i = 0; i < processors; i++) {
                        int startIndex =  i * numFilesPerProcessor;
                        int endIndex = (i+1) * numFilesPerProcessor;
                        if(i == (processors - 1)){      endIndex = filenames.size();    }
                        lines.push_back(linePair(startIndex, endIndex));
+            numFilesToComplete.push_back((endIndex-startIndex));
                }
                
         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)         
@@ -1953,6 +1955,14 @@ int ShhherCommand::createProcesses(vector<string> filenames){
                                process++;
                        }else if (pid == 0){
                                num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
+                
+                //pass numSeqs to parent
+                               ofstream out;
+                               string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
+                               m->openOutputFile(tempFile, out);
+                               out << num << endl;
+                               out.close();
+                
                                exit(0);
                        }else { 
                                m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
@@ -2015,6 +2025,18 @@ int ShhherCommand::createProcesses(vector<string> filenames){
         #endif
         
         for (int i=0;i<processIDS.size();i++) { 
+            ifstream in;
+                       string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
+                       m->openInputFile(tempFile, in);
+                       if (!in.eof()) { 
+                int tempNum = 0; 
+                in >> tempNum; 
+                if (tempNum != numFilesToComplete[i+1]) {
+                    m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(numFilesToComplete[i+1]) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
+                }
+            }
+                       in.close(); m->mothurRemove(tempFile);
+            
             if (compositeFASTAFileName != "") {
                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
@@ -2083,6 +2105,8 @@ vector<string> ShhherCommand::parseFlowFiles(string filename){
 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
     try {
         
+        int numCompleted = 0;
+        
         for(int i=start;i<end;i++){
                        
                        if (m->control_pressed) { break; }
@@ -2281,12 +2305,13 @@ int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFil
                 }
                        }
             
+            numCompleted++;
                        m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
                }
                
         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
         
-        return 0;
+        return numCompleted;
         
     }catch(exception& e) {
             m->errorOut(e, "ShhherCommand", "driver");
index 457b7b9d5f6a1273b57e898b3adec90dff5dd9f0..f7da25fed8d72fce904d425c713fe43d92f2637c 100644 (file)
@@ -322,8 +322,51 @@ int SubSample::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
                exit(1);
        }
 }
+//**********************************************************************************************************************
+int SubSample::getSample(SAbundVector*& sabund, int size) {
+       try {
+               
+        OrderVector* order = new OrderVector();
+        *order = sabund->getOrderVector(NULL);
+        
+               int numBins = order->getNumBins();
+               int thisSize = order->getNumSeqs();
+        
+               if (thisSize > size) {
+                       random_shuffle(order->begin(), order->end());
+                       
+            RAbundVector* rabund = new RAbundVector(numBins);
+                       rabund->setLabel(order->getLabel());
 
-
+                       for (int j = 0; j < size; j++) {
+                
+                               if (m->control_pressed) { delete order; delete rabund; return 0; }
+                               
+                               int bin = order->get(j);
+                               
+                               int abund = rabund->get(bin);
+                               rabund->set(bin, (abund+1));
+                       }
+                       
+            delete sabund;
+            sabund = new SAbundVector();
+            *sabund = rabund->getSAbundVector();
+            delete rabund;
+            
+               }else if (thisSize < size) { m->mothurOut("[ERROR]: The size you requested is larger than the number of sequences in the sabund vector. You requested " + toString(size) + " and you only have " + toString(thisSize) + " seqs in your sabund vector.\n"); m->control_pressed = true; }
+               
+               if (m->control_pressed) { return 0; }
+        
+               delete order;
+               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SubSampleCommand", "getSample");
+               exit(1);
+       }
+}                      
 //**********************************************************************************************************************
 
 
index feca37eddbfbf42cd4d7674adea5a18ceed3acee..b00f1a7141b49fb99a9683a58b96153c5986a6b7 100644 (file)
@@ -27,6 +27,7 @@ class SubSample {
         
         //Tree* getSample(Tree*, TreeMap*, map<string, string>, int); //creates new subsampled tree, destroys treemap so copy if needed.
         Tree* getSample(Tree*, TreeMap*, TreeMap*, int, map<string, string>); //creates new subsampled tree. Uses first treemap to fill new treemap with sabsampled seqs. Sets groups of seqs not in subsample to "doNotIncludeMe".
+        int getSample(SAbundVector*&, int); //destroys sabundvector passed in, so copy it if you need it
     
     private:
     
index 83ea8c9b4b5ab077283454089b35d9de13908003..85f0970f25930f563616c5fba9bc27fecda9cf98 100644 (file)
@@ -33,6 +33,7 @@
 #include "boneh.h"
 #include "solow.h"
 #include "shen.h"
+#include "subsample.h"
 
 //**********************************************************************************************************************
 vector<string> SummaryCommand::setParameters(){        
@@ -41,6 +42,8 @@ vector<string> SummaryCommand::setParameters(){
                CommandParameter prabund("rabund", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(prabund);
                CommandParameter psabund("sabund", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(psabund);
                CommandParameter pshared("shared", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(pshared);
+        CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample);
+        CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
                CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
                CommandParameter pcalc("calc", "Multiple", "sobs-chao-nseqs-coverage-ace-jack-shannon-shannoneven-npshannon-heip-smithwilson-simpson-simpsoneven-invsimpson-bootstrap-geometric-qstat-logseries-bergerparker-bstick-goodscoverage-efron-boneh-solow-shen", "sobs-chao-ace-jack-shannon-npshannon-simpson", "", "", "",true,false); parameters.push_back(pcalc);
                CommandParameter pabund("abund", "Number", "", "10", "", "", "",false,false); parameters.push_back(pabund);
@@ -63,11 +66,13 @@ string SummaryCommand::getHelpString(){
        try {
                string helpString = "";
                ValidCalculators validCalculator;
-               helpString += "The summary.single command parameters are list, sabund, rabund, shared, label, calc, abund and groupmode.  list, sabund, rabund or shared is required unless you have a valid current file.\n";
+               helpString += "The summary.single command parameters are list, sabund, rabund, shared, subsample, iters, label, calc, abund and groupmode.  list, sabund, rabund or shared is required unless you have a valid current file.\n";
                helpString += "The summary.single command should be in the following format: \n";
                helpString += "summary.single(label=yourLabel, calc=yourEstimators).\n";
                helpString += "Example summary.single(label=unique-.01-.03, calc=sobs-chao-ace-jack-bootstrap-shannon-npshannon-simpson).\n";
                helpString += validCalculator.printCalc("summary");
+        helpString += "The subsample parameter allows you to enter the size of the sample or you can set subsample=T and mothur will use the size of your smallest group in the case of a shared file. With a list, sabund or rabund file you must provide a subsample size.\n";
+        helpString += "The iters parameter allows you to choose the number of times you would like to run the subsample.\n";
                helpString += "The default value calc is sobs-chao-ace-jack-shannon-npshannon-simpson\n";
                helpString += "If you are running summary.single with a shared file and would like your summary results collated in one file, set groupmode=t. (Default=true).\n";
                helpString += "The label parameter is used to analyze specific labels in your input.\n";
@@ -239,7 +244,22 @@ SummaryCommand::SummaryCommand(string option)  {
                        temp = validParameter.validFile(parameters, "groupmode", false);                if (temp == "not found") { temp = "T"; }
                        groupMode = m->isTrue(temp);
                        
-       
+            temp = validParameter.validFile(parameters, "iters", false);                       if (temp == "not found") { temp = "1000"; }
+                       m->mothurConvert(temp, iters);
+            
+            temp = validParameter.validFile(parameters, "subsample", false);           if (temp == "not found") { temp = "F"; }
+                       if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
+            else {  
+                if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; }  //we will set it to smallest group later 
+                else { subsample = false; subsampleSize = -1; }
+            }
+            
+            if (subsample == false) { iters = 1; }
+            else {
+                //if you did not set a samplesize and are not using a sharedfile
+                if ((subsampleSize == -1) && (format != "sharedfile"))  { m->mothurOut("[ERROR]: If you want to subsample with a list, rabund or sabund file, you must provide the sample size.  You can do this by setting subsample=yourSampleSize.\n");  abort=true; }
+            }
+
                }
        }
        catch(exception& e) {
@@ -261,17 +281,23 @@ int SummaryCommand::execute(){
                
                int numLines = 0;
                int numCols = 0;
-               
+               map<string, string> groupIndex;
+        
                for (int p = 0; p < inputFileNames.size(); p++) {
                        
                        numLines = 0;
                        numCols = 0;
                        
                        string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])) + "summary";
+            string fileNameAve = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])) + "ave";
+            string fileNameSTD = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])) + "std";
                        outputNames.push_back(fileNameRoot); outputTypes["summary"].push_back(fileNameRoot);
+            
+            
                        
                        if (inputFileNames.size() > 1) {
                                m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[p]); m->mothurOutEndLine(); m->mothurOutEndLine();
+                groupIndex[fileNameRoot] = groups[p];
                        }
                        
                        sumCalculators.clear();
@@ -342,6 +368,21 @@ int SummaryCommand::execute(){
                        ofstream outputFileHandle;
                        m->openOutputFile(fileNameRoot, outputFileHandle);
                        outputFileHandle << "label";
+            
+            ofstream outAve, outSTD;
+            if (subsample) {
+                m->openOutputFile(fileNameAve, outAve);
+                m->openOutputFile(fileNameSTD, outSTD);
+                outputNames.push_back(fileNameAve); outputTypes["ave"].push_back(fileNameAve);
+                outputNames.push_back(fileNameSTD); outputTypes["std"].push_back(fileNameSTD);
+                outAve << "label"; outSTD << "label";
+                outAve.setf(ios::fixed, ios::floatfield); outAve.setf(ios::showpoint);
+                outSTD.setf(ios::fixed, ios::floatfield); outSTD.setf(ios::showpoint);
+                if (inputFileNames.size() > 1) {
+                    groupIndex[fileNameAve] = groups[p];
+                    groupIndex[fileNameSTD] = groups[p];
+                }
+            }
                
                        input = new InputData(inputFileNames[p], format);
                        sabund = input->getSAbundVector();
@@ -350,24 +391,29 @@ int SummaryCommand::execute(){
                        for(int i=0;i<sumCalculators.size();i++){
                                if(sumCalculators[i]->getCols() == 1){
                                        outputFileHandle << '\t' << sumCalculators[i]->getName();
+                    if (subsample) { outAve << '\t' << sumCalculators[i]->getName(); outSTD << '\t' << sumCalculators[i]->getName(); }
                                        numCols++;
                                }
                                else{
                                        outputFileHandle << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci";
+                    if (subsample) { outAve << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; outSTD << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; }
                                        numCols += 3;
                                }
                        }
                        outputFileHandle << endl;
+            if (subsample) { outSTD << endl; outAve << endl; }
                        
                        //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
                        set<string> processedLabels;
                        set<string> userLabels = labels;
                        
-                       if (m->control_pressed) {  outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
+            
+            
+                       if (m->control_pressed) {  outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
                        
                        while((sabund != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
                                
-                               if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
+                               if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
                                
                                if(allLines == 1 || labels.count(sabund->getLabel()) == 1){                     
                                        
@@ -375,16 +421,9 @@ int SummaryCommand::execute(){
                                        processedLabels.insert(sabund->getLabel());
                                        userLabels.erase(sabund->getLabel());
                                        
-                                       outputFileHandle << sabund->getLabel();
-                                       for(int i=0;i<sumCalculators.size();i++){
-                                               vector<double> data = sumCalculators[i]->getValues(sabund);
-                                               
-                                               if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
-
-                                               outputFileHandle << '\t';
-                                               sumCalculators[i]->print(outputFileHandle);
-                                       }
-                                       outputFileHandle << endl;
+                    process(sabund, outputFileHandle, outAve, outSTD);
+                    
+                    if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
                                        numLines++;
                                }
                                
@@ -398,16 +437,9 @@ int SummaryCommand::execute(){
                                        processedLabels.insert(sabund->getLabel());
                                        userLabels.erase(sabund->getLabel());
                                        
-                                       outputFileHandle << sabund->getLabel();
-                                       for(int i=0;i<sumCalculators.size();i++){
-                                               vector<double> data = sumCalculators[i]->getValues(sabund);
-                                               
-                                               if (m->control_pressed) {  outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; } delete sabund;  delete input;  return 0;  }
-                                               
-                                               outputFileHandle << '\t';
-                                               sumCalculators[i]->print(outputFileHandle);
-                                       }
-                                       outputFileHandle << endl;
+                    process(sabund, outputFileHandle, outAve, outSTD);
+                    
+                    if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
                                        numLines++;
                                        
                                        //restore real lastlabel to save below
@@ -420,7 +452,7 @@ int SummaryCommand::execute(){
                                sabund = input->getSAbundVector();
                        }
                        
-                       if (m->control_pressed) {  outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }   delete input;  return 0;  }
+                       if (m->control_pressed) {  outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }   delete input;  return 0;  }
 
                        //output error messages about any remaining user labels
                        set<string>::iterator it;
@@ -441,21 +473,15 @@ int SummaryCommand::execute(){
                                sabund = input->getSAbundVector(lastLabel);
                                
                                m->mothurOut(sabund->getLabel()); m->mothurOutEndLine();
-                               outputFileHandle << sabund->getLabel();
-                               for(int i=0;i<sumCalculators.size();i++){
-                                       vector<double> data = sumCalculators[i]->getValues(sabund);
-                                       
-                                       if (m->control_pressed) {  outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input; return 0;  }
-
-                                       outputFileHandle << '\t';
-                                       sumCalculators[i]->print(outputFileHandle);
-                               }
-                               outputFileHandle << endl;
+                process(sabund, outputFileHandle, outAve, outSTD);
+                
+                if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
                                numLines++;
                                delete sabund;
                        }
                        
                        outputFileHandle.close();
+            if (subsample) { outAve.close(); outSTD.close(); }
                        
                        if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }   delete input;  return 0;  }
 
@@ -467,7 +493,7 @@ int SummaryCommand::execute(){
                if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  }  return 0;  }
                
                //create summary file containing all the groups data for each label - this function just combines the info from the files already created.
-               if ((sharedfile != "") && (groupMode)) {   outputNames.push_back(createGroupSummaryFile(numLines, numCols, outputNames));  }
+               if ((sharedfile != "") && (groupMode)) {   vector<string> comboNames = createGroupSummaryFile(numLines, numCols, outputNames, groupIndex);  for (int i = 0; i < comboNames.size(); i++) { outputNames.push_back(comboNames[i]); } }
                
                if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  }  return 0;  }
                
@@ -484,6 +510,108 @@ int SummaryCommand::execute(){
        }
 }
 //**********************************************************************************************************************
+int SummaryCommand::process(SAbundVector*& sabund, ofstream& outputFileHandle, ofstream& outAve, ofstream& outStd) {
+    try {
+        
+        //calculator -> data -> values
+        vector< vector< vector<double> > >  results; results.resize(sumCalculators.size());
+        
+        outputFileHandle << sabund->getLabel();
+        
+        SubSample sample;
+        for (int thisIter = 0; thisIter < iters+1; thisIter++) {
+            
+            SAbundVector* thisIterSabund = sabund;
+            
+            //we want the summary results for the whole dataset, then the subsampling
+            if ((thisIter > 0) && subsample) { //subsample sabund and run it
+                //copy sabund since getSample destroys it
+                RAbundVector rabund = sabund->getRAbundVector();
+                SAbundVector* newSabund = new SAbundVector();
+                *newSabund = rabund.getSAbundVector();
+                
+                sample.getSample(newSabund, subsampleSize);
+                thisIterSabund = newSabund;
+            }
+            
+            for(int i=0;i<sumCalculators.size();i++){
+                vector<double> data = sumCalculators[i]->getValues(thisIterSabund);
+               
+                if (m->control_pressed) {  return 0;  }
+                
+                if (thisIter == 0) {
+                    outputFileHandle << '\t';
+                    sumCalculators[i]->print(outputFileHandle);
+                }else {
+                    //some of the calc have hci and lci need to make room for that
+                    if (results[i].size() == 0) {  results[i].resize(data.size());  }
+                    //save results for ave and std.
+                    for (int j = 0; j < data.size(); j++) {
+                        if (m->control_pressed) {  return 0;  }
+                        results[i][j].push_back(data[j]); 
+                    }
+                }
+            }
+            
+            //cleanup memory
+            if ((thisIter > 0) && subsample) { delete thisIterSabund; }
+        }
+        outputFileHandle << endl;
+     
+        if (subsample) {
+            outAve << sabund->getLabel() << '\t'; outStd << sabund->getLabel() << '\t';
+            //find ave and std for this label and output
+            //will need to modify the createGroupSummary to combine results and not mess with the .summary file.
+            
+            //calcs -> values
+            vector< vector<double> >  calcAverages; calcAverages.resize(sumCalculators.size()); 
+            for (int i = 0; i < calcAverages.size(); i++) {  calcAverages[i].resize(results[i].size(), 0);  }
+            
+            for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator
+                for (int i = 0; i < calcAverages.size(); i++) {  //initialize sums to zero.
+                    for (int j = 0; j < calcAverages[i].size(); j++) {
+                        calcAverages[i][j] += results[i][j][thisIter];
+                    }
+                }
+            }
+            
+            for (int i = 0; i < calcAverages.size(); i++) {  //finds average.
+                for (int j = 0; j < calcAverages[i].size(); j++) {
+                    calcAverages[i][j] /= (float) iters;
+                    outAve << calcAverages[i][j] << '\t';
+                }
+            }
+            
+            //find standard deviation
+            vector< vector<double>  > stdDev; stdDev.resize(sumCalculators.size());
+            for (int i = 0; i < stdDev.size(); i++) {  stdDev[i].resize(results[i].size(), 0);  }
+            
+            for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
+                for (int i = 0; i < stdDev.size(); i++) {  
+                    for (int j = 0; j < stdDev[i].size(); j++) {
+                        stdDev[i][j] += ((results[i][j][thisIter] - calcAverages[i][j]) * (results[i][j][thisIter] - calcAverages[i][j]));
+                    }
+                }
+            }
+            
+            for (int i = 0; i < stdDev.size(); i++) {  //finds average.
+                for (int j = 0; j < stdDev[i].size(); j++) {
+                    stdDev[i][j] /= (float) iters;
+                    stdDev[i][j] = sqrt(stdDev[i][j]);
+                    outStd << stdDev[i][j] << '\t';
+                }
+            }
+            outAve << endl;  outStd << endl; 
+        }
+        
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SummaryCommand", "process");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
 vector<string> SummaryCommand::parseSharedFile(string filename) {
        try {
                vector<string> filenames;
@@ -496,12 +624,44 @@ vector<string> SummaryCommand::parseSharedFile(string filename) {
                
                string sharedFileRoot = m->getRootName(filename);
                
-               //clears file before we start to write to it below
+        /******************************************************/
+        if (subsample) { 
+            if (subsampleSize == -1) { //user has not set size, set size = smallest samples size
+                subsampleSize = lookup[0]->getNumSeqs();
+                for (int i = 1; i < lookup.size(); i++) {
+                    int thisSize = lookup[i]->getNumSeqs();
+                    
+                    if (thisSize < subsampleSize) {    subsampleSize = thisSize;       }
+                }
+            }else {
+                m->clearGroups();
+                vector<string> Groups;
+                vector<SharedRAbundVector*> temp;
+                for (int i = 0; i < lookup.size(); i++) {
+                    if (lookup[i]->getNumSeqs() < subsampleSize) { 
+                        m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine();
+                        delete lookup[i];
+                    }else { 
+                        Groups.push_back(lookup[i]->getGroup()); 
+                        temp.push_back(lookup[i]);
+                    }
+                } 
+                lookup = temp;
+                m->setGroups(Groups);
+            }
+            
+            if (lookup.size() < 1) { m->mothurOut("You have not provided enough valid groups.  I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; delete input; return filenames; }
+        }
+        
+               
+               /******************************************************/
+        
+        //clears file before we start to write to it below
                for (int i=0; i<lookup.size(); i++) {
                        m->mothurRemove((sharedFileRoot + lookup[i]->getGroup() + ".rabund"));
                        filenames.push_back((sharedFileRoot + lookup[i]->getGroup() + ".rabund"));
                }
-               
+        
                ofstream* temp;
                for (int i=0; i<lookup.size(); i++) {
                        temp = new ofstream;
@@ -537,21 +697,20 @@ vector<string> SummaryCommand::parseSharedFile(string filename) {
        }
 }
 //**********************************************************************************************************************
-string SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector<string>& outputNames) {
+vector<string> SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector<string>& outputNames, map<string, string> groupIndex) {
        try {
-               
-               ofstream out;
-               string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups.summary";
-               
-               //open combined file
-               m->openOutputFile(combineFileName, out);
-               
+                               
                //open each groups summary file
+        vector<string> newComboNames;
                string newLabel = "";
-               map<string, vector<string> > files;
+               map<string, map<string, vector<string> > > files;
                for (int i=0; i<outputNames.size(); i++) {
+            string extension = m->getExtension(outputNames[i]);
+            string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
+                       m->mothurRemove(combineFileName); //remove old file
+             
                        vector<string> thisFilesLines;
-                       
+            
                        ifstream temp;
                        m->openInputFile(outputNames[i], temp);
                        
@@ -578,7 +737,7 @@ string SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector<
                                        temp >> tempLabel; 
                                                
                                        //save for later
-                                       if (j == 1) { thisLine += groups[i] + "\t" + tempLabel + "\t";  }
+                                       if (j == 1) { thisLine += groupIndex[outputNames[i]] + "\t" + tempLabel + "\t"; }
                                        else{  thisLine += tempLabel + "\t";    }
                                }
                                        
@@ -588,31 +747,52 @@ string SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector<
                                        
                                m->gobble(temp);
                        }
-                               
-                       files[outputNames[i]] = thisFilesLines;
+            
+            map<string, map<string, vector<string> > >::iterator itFiles = files.find(extension);
+            if (itFiles != files.end()) { //add new files info to existing type
+                files[extension][outputNames[i]] = thisFilesLines;
+            }else {
+                map<string, vector<string> > thisFile;
+                thisFile[outputNames[i]] = thisFilesLines;
+                files[extension] = thisFile;
+            }
                        
                        temp.close();
                        m->mothurRemove(outputNames[i]);
                }
                
-               //output label line to new file
-               out << newLabel << endl;
-               
-               //for each label
-               for (int k = 0; k < numLines; k++) {
-               
-                       //grab summary data for each group
-                       for (int i=0; i<outputNames.size(); i++) {
-                               out << files[outputNames[i]][k];
-                       }
-               }       
-               
-               outputNames.clear();
-               
-               out.close();
+        
+        for (map<string, map<string, vector<string> > >::iterator itFiles = files.begin(); itFiles != files.end(); itFiles++) {
+            
+            if (m->control_pressed) { break; }
+            
+            string extension = itFiles->first;
+            map<string, vector<string> > thisType = itFiles->second;
+            string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
+            newComboNames.push_back(combineFileName);
+            //open combined file
+            ofstream out;
+            m->openOutputFile(combineFileName, out);
+            
+            //output label line to new file
+            out << newLabel << endl;
+               
+            //for each label
+            for (int k = 0; k < numLines; k++) {
+               
+                //grab summary data for each group
+                for (map<string, vector<string> >::iterator itType = thisType.begin(); itType != thisType.end(); itType++) {
+                    out << (itType->second)[k];
+                }
+            }  
+               
+            outputNames.clear();
+               
+            out.close();
+        }
                
                //return combine file name
-               return combineFileName;
+               return newComboNames;
                
        }
        catch(exception& e) {
index 15b7f4066e9f17b263b30ac3d0b663d7cd414455..1d93a333a8c971458ced6ca8803cb0b2027a48b2 100644 (file)
@@ -37,9 +37,9 @@ private:
        vector<Calculator*> sumCalculators;     
        InputData* input;
        SAbundVector* sabund;
-       int abund, size;
+       int abund, size, iters, subsampleSize;
 
-       bool abort, allLines, groupMode;
+       bool abort, allLines, groupMode, subsample;
        set<string> labels; //holds labels to be used
        string label, calc, outputDir, sharedfile, listfile, rabundfile, sabundfile, format, inputfile;
        vector<string>  Estimators;
@@ -47,7 +47,8 @@ private:
        vector<string> groups;
        
        vector<string> parseSharedFile(string);
-       string createGroupSummaryFile(int, int, vector<string>&);
+       vector<string> createGroupSummaryFile(int, int, vector<string>&, map<string, string>);
+    int process(SAbundVector*&, ofstream&, ofstream&, ofstream&);
 
 
 };
index 77961f1c807387080b914af5059fea03d3844ff4..ed028045f9ff87713fc7d8a283805858eb2f4b01 100644 (file)
@@ -365,11 +365,38 @@ int SummarySharedCommand::execute(){
                        return 0;
                }
                /******************************************************/
-               
+        if (subsample) { 
+            if (subsampleSize == -1) { //user has not set size, set size = smallest samples size
+                subsampleSize = lookup[0]->getNumSeqs();
+                for (int i = 1; i < lookup.size(); i++) {
+                    int thisSize = lookup[i]->getNumSeqs();
+                    
+                    if (thisSize < subsampleSize) {    subsampleSize = thisSize;       }
+                }
+            }else {
+                m->clearGroups();
+                Groups.clear();
+                vector<SharedRAbundVector*> temp;
+                for (int i = 0; i < lookup.size(); i++) {
+                    if (lookup[i]->getNumSeqs() < subsampleSize) { 
+                        m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine();
+                        delete lookup[i];
+                    }else { 
+                        Groups.push_back(lookup[i]->getGroup()); 
+                        temp.push_back(lookup[i]);
+                    }
+                } 
+                lookup = temp;
+                m->setGroups(Groups);
+            }
+            
+            if (lookup.size() < 2) { m->mothurOut("You have not provided enough valid groups.  I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; delete input; return 0; }
+        }
+
                
                /******************************************************/
                //comparison breakup to be used by different processes later
-               numGroups = m->getNumGroups();
+               numGroups = lookup.size();
                lines.resize(processors);
                for (int i = 0; i < processors; i++) {
                        lines[i].start = int (sqrt(float(i)/float(processors)) * numGroups);
@@ -527,11 +554,11 @@ int SummarySharedCommand::process(vector<SharedRAbundVector*> thisLookup, string
         vector< vector< vector<seqDist> > > calcDistsTotals;  //each iter, one for each calc, then each groupCombos dists. this will be used to make .dist files
         vector< vector<seqDist>  > calcDists; calcDists.resize(sumCalculators.size());                 
         
-        for (int thisIter = 0; thisIter < iters; thisIter++) {
+        for (int thisIter = 0; thisIter < iters+1; thisIter++) {
             
             vector<SharedRAbundVector*> thisItersLookup = thisLookup;
             
-            if (subsample) {
+            if (subsample && (thisIter != 0)) { //we want the summary results for the whole dataset, then the subsampling
                 SubSample sample;
                 vector<string> tempLabels; //dont need since we arent printing the sampled sharedRabunds
                 
@@ -710,14 +737,44 @@ int SummarySharedCommand::process(vector<SharedRAbundVector*> thisLookup, string
                 
 #endif
             }
-            calcDistsTotals.push_back(calcDists);
             
-            if (subsample) {  
+            if (subsample && (thisIter != 0)) { //we want the summary results for the whole dataset, then the subsampling
                 
+                calcDistsTotals.push_back(calcDists); 
                 //clean up memory
                 for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; }
                 thisItersLookup.clear();
                 for (int i = 0; i < calcDists.size(); i++) {  calcDists[i].clear(); }
+            }else {
+                if (createPhylip) {
+                    for (int i = 0; i < calcDists.size(); i++) {
+                        if (m->control_pressed) { break; }
+                        
+                        //initialize matrix
+                        vector< vector<double> > matrix; //square matrix to represent the distance
+                        matrix.resize(thisLookup.size());
+                        for (int k = 0; k < thisLookup.size(); k++) {  matrix[k].resize(thisLookup.size(), 0.0); }
+                        
+                        for (int j = 0; j < calcDists[i].size(); j++) {
+                            int row = calcDists[i][j].seq1;
+                            int column = calcDists[i][j].seq2;
+                            double dist = calcDists[i][j].dist;
+                            
+                            matrix[row][column] = dist;
+                            matrix[column][row] = dist;
+                        }
+                        
+                        string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + "." + output + ".dist";
+                        outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
+                        ofstream outDist;
+                        m->openOutputFile(distFileName, outDist);
+                        outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
+                        
+                        printSims(outDist, matrix);
+                        
+                        outDist.close();
+                    }
+                }
             }
                }
 
@@ -820,38 +877,8 @@ int SummarySharedCommand::process(vector<SharedRAbundVector*> thisLookup, string
                 outSTD.close();
                 
             }
-        }else {
-            if (createPhylip) {
-                for (int i = 0; i < calcDists.size(); i++) {
-                    if (m->control_pressed) { break; }
-                    
-                    //initialize matrix
-                    vector< vector<double> > matrix; //square matrix to represent the distance
-                    matrix.resize(thisLookup.size());
-                    for (int k = 0; k < thisLookup.size(); k++) {  matrix[k].resize(thisLookup.size(), 0.0); }
-                    
-                    for (int j = 0; j < calcDists[i].size(); j++) {
-                        int row = calcDists[i][j].seq1;
-                        int column = calcDists[i][j].seq2;
-                        double dist = calcDists[i][j].dist;
-                        
-                        matrix[row][column] = dist;
-                        matrix[column][row] = dist;
-                    }
-                    
-                    string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + "." + output + ".dist";
-                    outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
-                    ofstream outDist;
-                    m->openOutputFile(distFileName, outDist);
-                    outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
-                    
-                    printSims(outDist, matrix);
-                    
-                    outDist.close();
-                }
-            }
         }
-
+        
         return 0;
        }
        catch(exception& e) {