Merge remote-tracking branch 'mothur/master'

author Pat Schloss <pschloss@umich.edu>

Thu, 7 Jun 2012 00:06:41 +0000 (20:06 -0400)

committer Pat Schloss <pschloss@umich.edu>

Thu, 7 Jun 2012 00:06:41 +0000 (20:06 -0400)
author Pat Schloss <pschloss@umich.edu>
Thu, 7 Jun 2012 00:06:41 +0000 (20:06 -0400)
committer Pat Schloss <pschloss@umich.edu>
Thu, 7 Jun 2012 00:06:41 +0000 (20:06 -0400)
diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp

index 4e244905ff6b755fd064617368506b9160770048..b6dc24fe5751867c6b79f71fb50e3b7b74ad75a6 100644 (file)
--- a/classifyseqscommand.cpp
+++ b/classifyseqscommand.cpp
@@ -465,8 +465,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                      }
                  }
              }
-                       
-               }
+        }
         }
         catch(exception& e) {
                 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
@@ -503,11 +502,17 @@ int ClassifySeqsCommand::execute(){
                         string baseTName = taxonomyFileName;
                         if (taxonomyFileName == "saved") {baseTName = rdb->getSavedTaxonomy();  }
                         
-                       string RippedTaxName = m->getRootName(m->getSimpleName(baseTName));
-                       RippedTaxName = m->getExtension(RippedTaxName.substr(0, RippedTaxName.length()-1));
-                       if (RippedTaxName[0] == '.') { RippedTaxName = RippedTaxName.substr(1, RippedTaxName.length()); }
-                       if (RippedTaxName != "") { RippedTaxName +=  "."; }
-           
+            //set rippedTaxName to 
+                       string RippedTaxName = "";
+            bool foundDot = false;
+            for (int i = baseTName.length()-1; i >= 0; i--) {
+                cout << baseTName[i] << endl;
+                if (foundDot && (baseTName[i] != '.')) {  RippedTaxName = baseTName[i] + RippedTaxName; }
+                else if (foundDot && (baseTName[i] == '.')) {  break; }
+                else if (!foundDot && (baseTName[i] == '.')) {  foundDot = true; }
+            }
+            if (RippedTaxName != "") { RippedTaxName +=  "."; }   
+          
                         if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); }
                         string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "taxonomy";
                         string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "flip.accnos";
diff --git a/matrixoutputcommand.cpp b/matrixoutputcommand.cpp

index 87929a4d9f8d8c91998e6e9d9760a22c67c60df0..05cd18a720e567e3e7c1852b436b68b1a38814f0 100644 (file)
--- a/matrixoutputcommand.cpp
+++ b/matrixoutputcommand.cpp
@@ -462,11 +462,11 @@ int MatrixOutputCommand::process(vector<SharedRAbundVector*> thisLookup){
                 vector< vector< vector<seqDist> > > calcDistsTotals;  //each iter, one for each calc, then each groupCombos dists. this will be used to make .dist files
          vector< vector<seqDist>  > calcDists; calcDists.resize(matrixCalculators.size());              
  
-        for (int thisIter = 0; thisIter < iters; thisIter++) {
+        for (int thisIter = 0; thisIter < iters+1; thisIter++) {
              
              vector<SharedRAbundVector*> thisItersLookup = thisLookup;
              
-            if (subsample) {
+            if (subsample && (thisIter != 0)) {
                  SubSample sample;
                  vector<string> tempLabels; //dont need since we arent printing the sampled sharedRabunds
                  
@@ -619,14 +619,40 @@ int MatrixOutputCommand::process(vector<SharedRAbundVector*> thisLookup){
                  #endif
              }
              
-            calcDistsTotals.push_back(calcDists);
-            
-            if (subsample) {  
-                
+            if (subsample && (thisIter != 0)) {  
+                calcDistsTotals.push_back(calcDists);
                  //clean up memory
                  for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; }
                  thisItersLookup.clear();
                  for (int i = 0; i < calcDists.size(); i++) {  calcDists[i].clear(); }
+            }else { //print results for whole dataset
+                for (int i = 0; i < calcDists.size(); i++) {
+                    if (m->control_pressed) { break; }
+                    
+                    //initialize matrix
+                    vector< vector<double> > matrix; //square matrix to represent the distance
+                    matrix.resize(thisLookup.size());
+                    for (int k = 0; k < thisLookup.size(); k++) {  matrix[k].resize(thisLookup.size(), 0.0); }
+                    
+                    for (int j = 0; j < calcDists[i].size(); j++) {
+                        int row = calcDists[i][j].seq1;
+                        int column = calcDists[i][j].seq2;
+                        double dist = calcDists[i][j].dist;
+                        
+                        matrix[row][column] = dist;
+                        matrix[column][row] = dist;
+                    }
+                    
+                    string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + "." + output + ".dist";
+                    outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
+                    ofstream outDist;
+                    m->openOutputFile(distFileName, outDist);
+                    outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
+                    
+                    printSims(outDist, matrix);
+                    
+                    outDist.close();
+                }
              }
                 }
                 
@@ -729,35 +755,6 @@ int MatrixOutputCommand::process(vector<SharedRAbundVector*> thisLookup){
                  outSTD.close();
  
              }
-        }else {
-        
-            for (int i = 0; i < calcDists.size(); i++) {
-                if (m->control_pressed) { break; }
-                
-                //initialize matrix
-                vector< vector<double> > matrix; //square matrix to represent the distance
-                matrix.resize(thisLookup.size());
-                for (int k = 0; k < thisLookup.size(); k++) {  matrix[k].resize(thisLookup.size(), 0.0); }
-                
-                for (int j = 0; j < calcDists[i].size(); j++) {
-                    int row = calcDists[i][j].seq1;
-                    int column = calcDists[i][j].seq2;
-                    double dist = calcDists[i][j].dist;
-                    
-                    matrix[row][column] = dist;
-                    matrix[column][row] = dist;
-                }
-                
-                string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + "." + output + ".dist";
-                outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
-                ofstream outDist;
-                m->openOutputFile(distFileName, outDist);
-                outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
-                
-                printSims(outDist, matrix);
-                
-                outDist.close();
-            }
          }
                 
                 return 0;
diff --git a/rarefactcommand.cpp b/rarefactcommand.cpp

index 652ff4e49ecf163fc71dad48b40eb1485012b105..0fdd0798e2cdda014a6cc568d3a1c895362c3158 100644 (file)
--- a/rarefactcommand.cpp
+++ b/rarefactcommand.cpp
@@ -285,13 +285,8 @@ int RareFactCommand::execute(){
          map<string, set<int> > labelToEnds;
                 if ((format != "sharedfile")) { inputFileNames.push_back(inputfile);  }
                 else {  inputFileNames = parseSharedFile(sharedfile, labelToEnds);  format = "rabund"; }
-               for (map<string, set<int> >::iterator it = labelToEnds.begin(); it != labelToEnds.end(); it++) {
-            cout << it->first << endl;
-            for (set<int>::iterator its = (it->second).begin(); its != (it->second).end(); its++) {
-                cout << (*its) << endl;
-            }
-        }
-               if (m->control_pressed) { return 0; }
+        
+        if (m->control_pressed) { return 0; }
                 
                 map<int, string> file2Group; //index in outputNames[i] -> group
                 for (int p = 0; p < inputFileNames.size(); p++) {
diff --git a/rarefactsharedcommand.cpp b/rarefactsharedcommand.cpp

index 726ddd61b75c6185cca6b4ebac108b7dd1b4d44f..fecf972c7ec99f80dda6f921706c6916a9d1679f 100644 (file)
--- a/rarefactsharedcommand.cpp
+++ b/rarefactsharedcommand.cpp
@@ -24,7 +24,8 @@ vector<string> RareFactSharedCommand::setParameters(){
                 CommandParameter pjumble("jumble", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pjumble);
                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
          CommandParameter psets("sets", "String", "", "", "", "", "",false,false); parameters.push_back(psets);
-               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+               CommandParameter pgroupmode("groupmode", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pgroupmode);
+        CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
                 
                 vector<string> myArray;
@@ -41,8 +42,7 @@ string RareFactSharedCommand::getHelpString(){
         try {
                 string helpString = "";
                 ValidCalculators validCalculator;
-               helpString += "The collect.shared command parameters are shared, label, freq, calc and groups.  shared is required if there is no current sharedfile. \n";
-               helpString += "The rarefaction.shared command parameters are shared, design, label, iters, groups, sets, jumble and calc.  shared is required if there is no current sharedfile. \n";
+               helpString += "The rarefaction.shared command parameters are shared, design, label, iters, groups, sets, jumble, groupmode and calc.  shared is required if there is no current sharedfile. \n";
          helpString += "The design parameter allows you to assign your groups to sets. If provided mothur will run rarefaction.shared on a per set basis. \n";
          helpString += "The sets parameter allows you to specify which of the sets in your designfile you would like to analyze. The set names are separated by dashes. THe default is all sets in the designfile.\n";
                 helpString += "The rarefaction command should be in the following format: \n";
@@ -196,6 +196,9 @@ RareFactSharedCommand::RareFactSharedCommand(string option)  {
                         if (m->isTrue(temp)) { jumble = true; }
                         else { jumble = false; }
                         m->jumble = jumble;
+            
+            temp = validParameter.validFile(parameters, "groupmode", false);           if (temp == "not found") { temp = "T"; }
+                       groupMode = m->isTrue(temp);
                                 
                 }
  
@@ -226,6 +229,8 @@ int RareFactSharedCommand::execute(){
              for (int i = 0; i < Sets.size(); i++) {
                  process(designMap, Sets[i]);
              }
+            
+            if (groupMode) { outputNames = createGroupFile(outputNames); }
          }
                      
                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
@@ -288,6 +293,7 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){
                                         outputNames.push_back(fileNameRoot+"shared.r_nseqs"); outputTypes["sharedr_nseqs"].push_back(fileNameRoot+"shared.r_nseqs");
                                 }
                         }
+            file2Group[outputNames.size()-1] = thisSet;
                 }
                 
                 //if the users entered no valid calculators don't execute command
@@ -432,3 +438,161 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){
         }
  }
  //**********************************************************************************************************************
+vector<string> RareFactSharedCommand::createGroupFile(vector<string>& outputNames) {
+       try {
+               
+               vector<string> newFileNames;
+               
+               //find different types of files
+               map<string, map<string, string> > typesFiles;
+        map<string, vector< vector<string> > > fileLabels; //combofile name to labels. each label is a vector because it may be unique lci hci.
+        vector<string> groupNames;
+               for (int i = 0; i < outputNames.size(); i++) {
+            
+                       string extension = m->getExtension(outputNames[i]);
+            string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
+                       m->mothurRemove(combineFileName); //remove old file
+            
+                       ifstream in;
+                       m->openInputFile(outputNames[i], in);
+                       
+                       string labels = m->getline(in);
+            
+                       istringstream iss (labels,istringstream::in);
+            string newLabel = ""; vector<string> theseLabels;
+            while(!iss.eof()) {  iss >> newLabel; m->gobble(iss); theseLabels.push_back(newLabel); }
+            vector< vector<string> > allLabels;
+            vector<string> thisSet; thisSet.push_back(theseLabels[0]); allLabels.push_back(thisSet); thisSet.clear(); //makes "numSampled" its own grouping
+            for (int j = 1; j < theseLabels.size()-1; j++) {
+                if (theseLabels[j+1] == "lci") {
+                    thisSet.push_back(theseLabels[j]); 
+                    thisSet.push_back(theseLabels[j+1]); 
+                    thisSet.push_back(theseLabels[j+2]);
+                    j++; j++;
+                }else{ //no lci or hci for this calc.
+                    thisSet.push_back(theseLabels[j]); 
+                }
+                allLabels.push_back(thisSet); 
+                thisSet.clear();
+            }
+            fileLabels[combineFileName] = allLabels;
+            
+            map<string, map<string, string> >::iterator itfind = typesFiles.find(extension);
+            if (itfind != typesFiles.end()) {
+                (itfind->second)[outputNames[i]] = file2Group[i];
+            }else {
+                map<string, string> temp;  
+                temp[outputNames[i]] = file2Group[i];
+                typesFiles[extension] = temp;
+            }
+            if (!(m->inUsersGroups(file2Group[i], groupNames))) {  groupNames.push_back(file2Group[i]); }
+               }
+               
+               //for each type create a combo file
+               
+               for (map<string, map<string, string> >::iterator it = typesFiles.begin(); it != typesFiles.end(); it++) {
+                       
+                       ofstream out;
+                       string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + it->first;
+                       m->openOutputFileAppend(combineFileName, out);
+                       newFileNames.push_back(combineFileName);
+                       map<string, string> thisTypesFiles = it->second; //it->second maps filename to group
+            set<int> numSampledSet;
+            
+                       //open each type summary file
+                       map<string, map<int, vector< vector<string> > > > files; //maps file name to lines in file
+                       int maxLines = 0;
+                       for (map<string, string>::iterator itFileNameGroup = thisTypesFiles.begin(); itFileNameGroup != thisTypesFiles.end(); itFileNameGroup++) {
+                
+                string thisfilename = itFileNameGroup->first;
+                string group = itFileNameGroup->second;
+                
+                               ifstream temp;
+                               m->openInputFile(thisfilename, temp);
+                               
+                               //read through first line - labels
+                               m->getline(temp);       m->gobble(temp);
+                               
+                               map<int, vector< vector<string> > > thisFilesLines;
+                               while (!temp.eof()){
+                    int numSampled = 0;
+                    temp >> numSampled; m->gobble(temp);
+                    
+                    vector< vector<string> > theseReads;
+                    vector<string> thisSet; thisSet.push_back(toString(numSampled)); theseReads.push_back(thisSet); thisSet.clear();
+                    for (int k = 1; k < fileLabels[combineFileName].size(); k++) { //output thing like 0.03-A lci-A hci-A
+                        vector<string> reads;
+                        string next = "";
+                        for (int l = 0; l < fileLabels[combineFileName][k].size(); l++) { //output modified labels
+                            temp >> next; m->gobble(temp);
+                            reads.push_back(next);
+                        }
+                        theseReads.push_back(reads);
+                    }
+                    thisFilesLines[numSampled] = theseReads;
+                    m->gobble(temp);
+                    
+                    numSampledSet.insert(numSampled);
+                               }
+                               
+                               files[group] = thisFilesLines;
+                               
+                               //save longest file for below
+                               if (maxLines < thisFilesLines.size()) { maxLines = thisFilesLines.size(); }
+                               
+                               temp.close();
+                               m->mothurRemove(thisfilename);
+                       }
+                       
+            //output new labels line
+            out << fileLabels[combineFileName][0][0] << '\t';
+            for (int k = 1; k < fileLabels[combineFileName].size(); k++) { //output thing like 0.03-A lci-A hci-A
+                for (int n = 0; n < groupNames.size(); n++) { // for each group
+                    for (int l = 0; l < fileLabels[combineFileName][k].size(); l++) { //output modified labels
+                        out << fileLabels[combineFileName][k][l] << '-' << groupNames[n] << '\t';
+                    }
+                }
+            }
+                       out << endl;
+            
+                       //for each label
+                       for (set<int>::iterator itNumSampled = numSampledSet.begin(); itNumSampled != numSampledSet.end(); itNumSampled++) {
+                               
+                out << (*itNumSampled) << '\t';
+                
+                if (m->control_pressed) { break; }
+                
+                for (int k = 1; k < fileLabels[combineFileName].size(); k++) { //each chunk
+                                   //grab data for each group
+                    for (map<string, map<int, vector< vector<string> > > >::iterator itFileNameGroup = files.begin(); itFileNameGroup != files.end(); itFileNameGroup++) {
+                        
+                        string group = itFileNameGroup->first;
+                        
+                        map<int, vector< vector<string> > >::iterator itLine = files[group].find(*itNumSampled);
+                        if (itLine != files[group].end()) { 
+                            for (int l = 0; l < (itLine->second)[k].size(); l++) { 
+                                out << (itLine->second)[k][l] << '\t';
+                                
+                            }                             
+                        }else { 
+                            for (int l = 0; l < fileLabels[combineFileName][k].size(); l++) { 
+                                out << "NA" << '\t';
+                            } 
+                        }
+                    }
+                }
+                out << endl;
+                       }       
+                       out.close();
+               }
+               
+               //return combine file name
+               return newFileNames;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RareFactSharedCommand", "createGroupFile");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
diff --git a/rarefactsharedcommand.h b/rarefactsharedcommand.h

index 1a2d944894feabe26b4103e5431c0da08c6574b4..a210574002969c411be4d9e9c705af9d88c980f0 100644 (file)
--- a/rarefactsharedcommand.h
+++ b/rarefactsharedcommand.h
@@ -40,12 +40,14 @@ private:
         string format;
         float freq;
         
-       bool abort, allLines, jumble;
+     map<int, string> file2Group; //index in outputNames[i] -> group
+       bool abort, allLines, jumble, groupMode;
         set<string> labels; //holds labels to be used
         string label, calc, groups, outputDir, sharedfile, designfile;
         vector<string>  Estimators, Groups, outputNames, Sets;
      
      int process(GroupMap&, string);
+    vector<string> createGroupFile(vector<string>&);
  
  };
  
diff --git a/shhhercommand.cpp b/shhhercommand.cpp

index be410d9d9f4ecec2fc885a239c9ba57b5f409e89..08bb017a4d025113c7317167e3eb6217fa57638d 100644 (file)
--- a/shhhercommand.cpp
+++ b/shhhercommand.cpp
@@ -1934,12 +1934,14 @@ int ShhherCommand::createProcesses(vector<string> filenames){
                 
                 //divide the groups between the processors
                 vector<linePair> lines;
+        vector<int> numFilesToComplete;
                 int numFilesPerProcessor = filenames.size() / processors;
                 for (int i = 0; i < processors; i++) {
                         int startIndex =  i * numFilesPerProcessor;
                         int endIndex = (i+1) * numFilesPerProcessor;
                         if(i == (processors - 1)){      endIndex = filenames.size();    }
                         lines.push_back(linePair(startIndex, endIndex));
+            numFilesToComplete.push_back((endIndex-startIndex));
                 }
                 
          #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)         
@@ -1953,6 +1955,14 @@ int ShhherCommand::createProcesses(vector<string> filenames){
                                 process++;
                         }else if (pid == 0){
                                 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
+                
+                //pass numSeqs to parent
+                               ofstream out;
+                               string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
+                               m->openOutputFile(tempFile, out);
+                               out << num << endl;
+                               out.close();
+                
                                 exit(0);
                         }else { 
                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
@@ -2015,6 +2025,18 @@ int ShhherCommand::createProcesses(vector<string> filenames){
          #endif
          
          for (int i=0;i<processIDS.size();i++) { 
+            ifstream in;
+                       string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
+                       m->openInputFile(tempFile, in);
+                       if (!in.eof()) { 
+                int tempNum = 0; 
+                in >> tempNum; 
+                if (tempNum != numFilesToComplete[i+1]) {
+                    m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(numFilesToComplete[i+1]) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
+                }
+            }
+                       in.close(); m->mothurRemove(tempFile);
+            
              if (compositeFASTAFileName != "") {
                  m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
                  m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
@@ -2083,6 +2105,8 @@ vector<string> ShhherCommand::parseFlowFiles(string filename){
  int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
      try {
          
+        int numCompleted = 0;
+        
          for(int i=start;i<end;i++){
                         
                         if (m->control_pressed) { break; }
@@ -2281,12 +2305,13 @@ int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFil
                  }
                         }
              
+            numCompleted++;
                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
                 }
                 
          if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
          
-        return 0;
+        return numCompleted;
          
      }catch(exception& e) {
              m->errorOut(e, "ShhherCommand", "driver");
diff --git a/subsample.cpp b/subsample.cpp

index 457b7b9d5f6a1273b57e898b3adec90dff5dd9f0..f7da25fed8d72fce904d425c713fe43d92f2637c 100644 (file)
--- a/subsample.cpp
+++ b/subsample.cpp
@@ -322,8 +322,51 @@ int SubSample::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
                 exit(1);
         }
  }
+//**********************************************************************************************************************
+int SubSample::getSample(SAbundVector*& sabund, int size) {
+       try {
+               
+        OrderVector* order = new OrderVector();
+        *order = sabund->getOrderVector(NULL);
+        
+               int numBins = order->getNumBins();
+               int thisSize = order->getNumSeqs();
+        
+               if (thisSize > size) {
+                       random_shuffle(order->begin(), order->end());
+                       
+            RAbundVector* rabund = new RAbundVector(numBins);
+                       rabund->setLabel(order->getLabel());
  
-
+                       for (int j = 0; j < size; j++) {
+                
+                               if (m->control_pressed) { delete order; delete rabund; return 0; }
+                               
+                               int bin = order->get(j);
+                               
+                               int abund = rabund->get(bin);
+                               rabund->set(bin, (abund+1));
+                       }
+                       
+            delete sabund;
+            sabund = new SAbundVector();
+            *sabund = rabund->getSAbundVector();
+            delete rabund;
+            
+               }else if (thisSize < size) { m->mothurOut("[ERROR]: The size you requested is larger than the number of sequences in the sabund vector. You requested " + toString(size) + " and you only have " + toString(thisSize) + " seqs in your sabund vector.\n"); m->control_pressed = true; }
+               
+               if (m->control_pressed) { return 0; }
+        
+               delete order;
+               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SubSampleCommand", "getSample");
+               exit(1);
+       }
+}                      
  //**********************************************************************************************************************
  
  
diff --git a/subsample.h b/subsample.h

index feca37eddbfbf42cd4d7674adea5a18ceed3acee..b00f1a7141b49fb99a9683a58b96153c5986a6b7 100644 (file)
--- a/subsample.h
+++ b/subsample.h
@@ -27,6 +27,7 @@ class SubSample {
          
          //Tree* getSample(Tree*, TreeMap*, map<string, string>, int); //creates new subsampled tree, destroys treemap so copy if needed.
          Tree* getSample(Tree*, TreeMap*, TreeMap*, int, map<string, string>); //creates new subsampled tree. Uses first treemap to fill new treemap with sabsampled seqs. Sets groups of seqs not in subsample to "doNotIncludeMe".
+        int getSample(SAbundVector*&, int); //destroys sabundvector passed in, so copy it if you need it
      
      private:
      
diff --git a/summarycommand.cpp b/summarycommand.cpp

index 83ea8c9b4b5ab077283454089b35d9de13908003..85f0970f25930f563616c5fba9bc27fecda9cf98 100644 (file)
--- a/summarycommand.cpp
+++ b/summarycommand.cpp
@@ -33,6 +33,7 @@
  #include "boneh.h"
  #include "solow.h"
  #include "shen.h"
+#include "subsample.h"
  
  //**********************************************************************************************************************
  vector<string> SummaryCommand::setParameters(){        
@@ -41,6 +42,8 @@ vector<string> SummaryCommand::setParameters(){
                 CommandParameter prabund("rabund", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(prabund);
                 CommandParameter psabund("sabund", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(psabund);
                 CommandParameter pshared("shared", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(pshared);
+        CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample);
+        CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
                 CommandParameter pcalc("calc", "Multiple", "sobs-chao-nseqs-coverage-ace-jack-shannon-shannoneven-npshannon-heip-smithwilson-simpson-simpsoneven-invsimpson-bootstrap-geometric-qstat-logseries-bergerparker-bstick-goodscoverage-efron-boneh-solow-shen", "sobs-chao-ace-jack-shannon-npshannon-simpson", "", "", "",true,false); parameters.push_back(pcalc);
                 CommandParameter pabund("abund", "Number", "", "10", "", "", "",false,false); parameters.push_back(pabund);
@@ -63,11 +66,13 @@ string SummaryCommand::getHelpString(){
         try {
                 string helpString = "";
                 ValidCalculators validCalculator;
-               helpString += "The summary.single command parameters are list, sabund, rabund, shared, label, calc, abund and groupmode.  list, sabund, rabund or shared is required unless you have a valid current file.\n";
+               helpString += "The summary.single command parameters are list, sabund, rabund, shared, subsample, iters, label, calc, abund and groupmode.  list, sabund, rabund or shared is required unless you have a valid current file.\n";
                 helpString += "The summary.single command should be in the following format: \n";
                 helpString += "summary.single(label=yourLabel, calc=yourEstimators).\n";
                 helpString += "Example summary.single(label=unique-.01-.03, calc=sobs-chao-ace-jack-bootstrap-shannon-npshannon-simpson).\n";
                 helpString += validCalculator.printCalc("summary");
+        helpString += "The subsample parameter allows you to enter the size of the sample or you can set subsample=T and mothur will use the size of your smallest group in the case of a shared file. With a list, sabund or rabund file you must provide a subsample size.\n";
+        helpString += "The iters parameter allows you to choose the number of times you would like to run the subsample.\n";
                 helpString += "The default value calc is sobs-chao-ace-jack-shannon-npshannon-simpson\n";
                 helpString += "If you are running summary.single with a shared file and would like your summary results collated in one file, set groupmode=t. (Default=true).\n";
                 helpString += "The label parameter is used to analyze specific labels in your input.\n";
@@ -239,7 +244,22 @@ SummaryCommand::SummaryCommand(string option)  {
                         temp = validParameter.validFile(parameters, "groupmode", false);                if (temp == "not found") { temp = "T"; }
                         groupMode = m->isTrue(temp);
                         
-       
+            temp = validParameter.validFile(parameters, "iters", false);                       if (temp == "not found") { temp = "1000"; }
+                       m->mothurConvert(temp, iters);
+            
+            temp = validParameter.validFile(parameters, "subsample", false);           if (temp == "not found") { temp = "F"; }
+                       if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
+            else {  
+                if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; }  //we will set it to smallest group later 
+                else { subsample = false; subsampleSize = -1; }
+            }
+            
+            if (subsample == false) { iters = 1; }
+            else {
+                //if you did not set a samplesize and are not using a sharedfile
+                if ((subsampleSize == -1) && (format != "sharedfile"))  { m->mothurOut("[ERROR]: If you want to subsample with a list, rabund or sabund file, you must provide the sample size.  You can do this by setting subsample=yourSampleSize.\n");  abort=true; }
+            }
+
                 }
         }
         catch(exception& e) {
@@ -261,17 +281,23 @@ int SummaryCommand::execute(){
                 
                 int numLines = 0;
                 int numCols = 0;
-               
+               map<string, string> groupIndex;
+        
                 for (int p = 0; p < inputFileNames.size(); p++) {
                         
                         numLines = 0;
                         numCols = 0;
                         
                         string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])) + "summary";
+            string fileNameAve = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])) + "ave";
+            string fileNameSTD = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])) + "std";
                         outputNames.push_back(fileNameRoot); outputTypes["summary"].push_back(fileNameRoot);
+            
+            
                         
                         if (inputFileNames.size() > 1) {
                                 m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[p]); m->mothurOutEndLine(); m->mothurOutEndLine();
+                groupIndex[fileNameRoot] = groups[p];
                         }
                         
                         sumCalculators.clear();
@@ -342,6 +368,21 @@ int SummaryCommand::execute(){
                         ofstream outputFileHandle;
                         m->openOutputFile(fileNameRoot, outputFileHandle);
                         outputFileHandle << "label";
+            
+            ofstream outAve, outSTD;
+            if (subsample) {
+                m->openOutputFile(fileNameAve, outAve);
+                m->openOutputFile(fileNameSTD, outSTD);
+                outputNames.push_back(fileNameAve); outputTypes["ave"].push_back(fileNameAve);
+                outputNames.push_back(fileNameSTD); outputTypes["std"].push_back(fileNameSTD);
+                outAve << "label"; outSTD << "label";
+                outAve.setf(ios::fixed, ios::floatfield); outAve.setf(ios::showpoint);
+                outSTD.setf(ios::fixed, ios::floatfield); outSTD.setf(ios::showpoint);
+                if (inputFileNames.size() > 1) {
+                    groupIndex[fileNameAve] = groups[p];
+                    groupIndex[fileNameSTD] = groups[p];
+                }
+            }
                 
                         input = new InputData(inputFileNames[p], format);
                         sabund = input->getSAbundVector();
@@ -350,24 +391,29 @@ int SummaryCommand::execute(){
                         for(int i=0;i<sumCalculators.size();i++){
                                 if(sumCalculators[i]->getCols() == 1){
                                         outputFileHandle << '\t' << sumCalculators[i]->getName();
+                    if (subsample) { outAve << '\t' << sumCalculators[i]->getName(); outSTD << '\t' << sumCalculators[i]->getName(); }
                                         numCols++;
                                 }
                                 else{
                                         outputFileHandle << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci";
+                    if (subsample) { outAve << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; outSTD << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; }
                                         numCols += 3;
                                 }
                         }
                         outputFileHandle << endl;
+            if (subsample) { outSTD << endl; outAve << endl; }
                         
                         //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
                         set<string> processedLabels;
                         set<string> userLabels = labels;
                         
-                       if (m->control_pressed) {  outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
+            
+            
+                       if (m->control_pressed) {  outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
                         
                         while((sabund != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
                                 
-                               if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
+                               if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
                                 
                                 if(allLines == 1 || labels.count(sabund->getLabel()) == 1){                     
                                         
@@ -375,16 +421,9 @@ int SummaryCommand::execute(){
                                         processedLabels.insert(sabund->getLabel());
                                         userLabels.erase(sabund->getLabel());
                                         
-                                       outputFileHandle << sabund->getLabel();
-                                       for(int i=0;i<sumCalculators.size();i++){
-                                               vector<double> data = sumCalculators[i]->getValues(sabund);
-                                               
-                                               if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
-
-                                               outputFileHandle << '\t';
-                                               sumCalculators[i]->print(outputFileHandle);
-                                       }
-                                       outputFileHandle << endl;
+                    process(sabund, outputFileHandle, outAve, outSTD);
+                    
+                    if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
                                         numLines++;
                                 }
                                 
@@ -398,16 +437,9 @@ int SummaryCommand::execute(){
                                         processedLabels.insert(sabund->getLabel());
                                         userLabels.erase(sabund->getLabel());
                                         
-                                       outputFileHandle << sabund->getLabel();
-                                       for(int i=0;i<sumCalculators.size();i++){
-                                               vector<double> data = sumCalculators[i]->getValues(sabund);
-                                               
-                                               if (m->control_pressed) {  outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; } delete sabund;  delete input;  return 0;  }
-                                               
-                                               outputFileHandle << '\t';
-                                               sumCalculators[i]->print(outputFileHandle);
-                                       }
-                                       outputFileHandle << endl;
+                    process(sabund, outputFileHandle, outAve, outSTD);
+                    
+                    if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
                                         numLines++;
                                         
                                         //restore real lastlabel to save below
@@ -420,7 +452,7 @@ int SummaryCommand::execute(){
                                 sabund = input->getSAbundVector();
                         }
                         
-                       if (m->control_pressed) {  outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }   delete input;  return 0;  }
+                       if (m->control_pressed) {  outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }   delete input;  return 0;  }
  
                         //output error messages about any remaining user labels
                         set<string>::iterator it;
@@ -441,21 +473,15 @@ int SummaryCommand::execute(){
                                 sabund = input->getSAbundVector(lastLabel);
                                 
                                 m->mothurOut(sabund->getLabel()); m->mothurOutEndLine();
-                               outputFileHandle << sabund->getLabel();
-                               for(int i=0;i<sumCalculators.size();i++){
-                                       vector<double> data = sumCalculators[i]->getValues(sabund);
-                                       
-                                       if (m->control_pressed) {  outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input; return 0;  }
-
-                                       outputFileHandle << '\t';
-                                       sumCalculators[i]->print(outputFileHandle);
-                               }
-                               outputFileHandle << endl;
+                process(sabund, outputFileHandle, outAve, outSTD);
+                
+                if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }  delete sabund;  delete input;  return 0;  }
                                 numLines++;
                                 delete sabund;
                         }
                         
                         outputFileHandle.close();
+            if (subsample) { outAve.close(); outSTD.close(); }
                         
                         if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]);  } for(int i=0;i<sumCalculators.size();i++){  delete sumCalculators[i]; }   delete input;  return 0;  }
  
@@ -467,7 +493,7 @@ int SummaryCommand::execute(){
                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  }  return 0;  }
                 
                 //create summary file containing all the groups data for each label - this function just combines the info from the files already created.
-               if ((sharedfile != "") && (groupMode)) {   outputNames.push_back(createGroupSummaryFile(numLines, numCols, outputNames));  }
+               if ((sharedfile != "") && (groupMode)) {   vector<string> comboNames = createGroupSummaryFile(numLines, numCols, outputNames, groupIndex);  for (int i = 0; i < comboNames.size(); i++) { outputNames.push_back(comboNames[i]); } }
                 
                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  }  return 0;  }
                 
@@ -484,6 +510,108 @@ int SummaryCommand::execute(){
         }
  }
  //**********************************************************************************************************************
+int SummaryCommand::process(SAbundVector*& sabund, ofstream& outputFileHandle, ofstream& outAve, ofstream& outStd) {
+    try {
+        
+        //calculator -> data -> values
+        vector< vector< vector<double> > >  results; results.resize(sumCalculators.size());
+        
+        outputFileHandle << sabund->getLabel();
+        
+        SubSample sample;
+        for (int thisIter = 0; thisIter < iters+1; thisIter++) {
+            
+            SAbundVector* thisIterSabund = sabund;
+            
+            //we want the summary results for the whole dataset, then the subsampling
+            if ((thisIter > 0) && subsample) { //subsample sabund and run it
+                //copy sabund since getSample destroys it
+                RAbundVector rabund = sabund->getRAbundVector();
+                SAbundVector* newSabund = new SAbundVector();
+                *newSabund = rabund.getSAbundVector();
+                
+                sample.getSample(newSabund, subsampleSize);
+                thisIterSabund = newSabund;
+            }
+            
+            for(int i=0;i<sumCalculators.size();i++){
+                vector<double> data = sumCalculators[i]->getValues(thisIterSabund);
+               
+                if (m->control_pressed) {  return 0;  }
+                
+                if (thisIter == 0) {
+                    outputFileHandle << '\t';
+                    sumCalculators[i]->print(outputFileHandle);
+                }else {
+                    //some of the calc have hci and lci need to make room for that
+                    if (results[i].size() == 0) {  results[i].resize(data.size());  }
+                    //save results for ave and std.
+                    for (int j = 0; j < data.size(); j++) {
+                        if (m->control_pressed) {  return 0;  }
+                        results[i][j].push_back(data[j]); 
+                    }
+                }
+            }
+            
+            //cleanup memory
+            if ((thisIter > 0) && subsample) { delete thisIterSabund; }
+        }
+        outputFileHandle << endl;
+     
+        if (subsample) {
+            outAve << sabund->getLabel() << '\t'; outStd << sabund->getLabel() << '\t';
+            //find ave and std for this label and output
+            //will need to modify the createGroupSummary to combine results and not mess with the .summary file.
+            
+            //calcs -> values
+            vector< vector<double> >  calcAverages; calcAverages.resize(sumCalculators.size()); 
+            for (int i = 0; i < calcAverages.size(); i++) {  calcAverages[i].resize(results[i].size(), 0);  }
+            
+            for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator
+                for (int i = 0; i < calcAverages.size(); i++) {  //initialize sums to zero.
+                    for (int j = 0; j < calcAverages[i].size(); j++) {
+                        calcAverages[i][j] += results[i][j][thisIter];
+                    }
+                }
+            }
+            
+            for (int i = 0; i < calcAverages.size(); i++) {  //finds average.
+                for (int j = 0; j < calcAverages[i].size(); j++) {
+                    calcAverages[i][j] /= (float) iters;
+                    outAve << calcAverages[i][j] << '\t';
+                }
+            }
+            
+            //find standard deviation
+            vector< vector<double>  > stdDev; stdDev.resize(sumCalculators.size());
+            for (int i = 0; i < stdDev.size(); i++) {  stdDev[i].resize(results[i].size(), 0);  }
+            
+            for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
+                for (int i = 0; i < stdDev.size(); i++) {  
+                    for (int j = 0; j < stdDev[i].size(); j++) {
+                        stdDev[i][j] += ((results[i][j][thisIter] - calcAverages[i][j]) * (results[i][j][thisIter] - calcAverages[i][j]));
+                    }
+                }
+            }
+            
+            for (int i = 0; i < stdDev.size(); i++) {  //finds average.
+                for (int j = 0; j < stdDev[i].size(); j++) {
+                    stdDev[i][j] /= (float) iters;
+                    stdDev[i][j] = sqrt(stdDev[i][j]);
+                    outStd << stdDev[i][j] << '\t';
+                }
+            }
+            outAve << endl;  outStd << endl; 
+        }
+        
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SummaryCommand", "process");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
  vector<string> SummaryCommand::parseSharedFile(string filename) {
         try {
                 vector<string> filenames;
@@ -496,12 +624,44 @@ vector<string> SummaryCommand::parseSharedFile(string filename) {
                 
                 string sharedFileRoot = m->getRootName(filename);
                 
-               //clears file before we start to write to it below
+        /******************************************************/
+        if (subsample) { 
+            if (subsampleSize == -1) { //user has not set size, set size = smallest samples size
+                subsampleSize = lookup[0]->getNumSeqs();
+                for (int i = 1; i < lookup.size(); i++) {
+                    int thisSize = lookup[i]->getNumSeqs();
+                    
+                    if (thisSize < subsampleSize) {    subsampleSize = thisSize;       }
+                }
+            }else {
+                m->clearGroups();
+                vector<string> Groups;
+                vector<SharedRAbundVector*> temp;
+                for (int i = 0; i < lookup.size(); i++) {
+                    if (lookup[i]->getNumSeqs() < subsampleSize) { 
+                        m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine();
+                        delete lookup[i];
+                    }else { 
+                        Groups.push_back(lookup[i]->getGroup()); 
+                        temp.push_back(lookup[i]);
+                    }
+                } 
+                lookup = temp;
+                m->setGroups(Groups);
+            }
+            
+            if (lookup.size() < 1) { m->mothurOut("You have not provided enough valid groups.  I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; delete input; return filenames; }
+        }
+        
+               
+               /******************************************************/
+        
+        //clears file before we start to write to it below
                 for (int i=0; i<lookup.size(); i++) {
                         m->mothurRemove((sharedFileRoot + lookup[i]->getGroup() + ".rabund"));
                         filenames.push_back((sharedFileRoot + lookup[i]->getGroup() + ".rabund"));
                 }
-               
+        
                 ofstream* temp;
                 for (int i=0; i<lookup.size(); i++) {
                         temp = new ofstream;
@@ -537,21 +697,20 @@ vector<string> SummaryCommand::parseSharedFile(string filename) {
         }
  }
  //**********************************************************************************************************************
-string SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector<string>& outputNames) {
+vector<string> SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector<string>& outputNames, map<string, string> groupIndex) {
         try {
-               
-               ofstream out;
-               string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups.summary";
-               
-               //open combined file
-               m->openOutputFile(combineFileName, out);
-               
+                               
                 //open each groups summary file
+        vector<string> newComboNames;
                 string newLabel = "";
-               map<string, vector<string> > files;
+               map<string, map<string, vector<string> > > files;
                 for (int i=0; i<outputNames.size(); i++) {
+            string extension = m->getExtension(outputNames[i]);
+            string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
+                       m->mothurRemove(combineFileName); //remove old file
+             
                         vector<string> thisFilesLines;
-                       
+            
                         ifstream temp;
                         m->openInputFile(outputNames[i], temp);
                         
@@ -578,7 +737,7 @@ string SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector<
                                         temp >> tempLabel; 
                                                 
                                         //save for later
-                                       if (j == 1) { thisLine += groups[i] + "\t" + tempLabel + "\t";  }
+                                       if (j == 1) { thisLine += groupIndex[outputNames[i]] + "\t" + tempLabel + "\t"; }
                                         else{  thisLine += tempLabel + "\t";    }
                                 }
                                         
@@ -588,31 +747,52 @@ string SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector<
                                         
                                 m->gobble(temp);
                         }
-                               
-                       files[outputNames[i]] = thisFilesLines;
+            
+            map<string, map<string, vector<string> > >::iterator itFiles = files.find(extension);
+            if (itFiles != files.end()) { //add new files info to existing type
+                files[extension][outputNames[i]] = thisFilesLines;
+            }else {
+                map<string, vector<string> > thisFile;
+                thisFile[outputNames[i]] = thisFilesLines;
+                files[extension] = thisFile;
+            }
                         
                         temp.close();
                         m->mothurRemove(outputNames[i]);
                 }
                 
-               //output label line to new file
-               out << newLabel << endl;
-               
-               //for each label
-               for (int k = 0; k < numLines; k++) {
-               
-                       //grab summary data for each group
-                       for (int i=0; i<outputNames.size(); i++) {
-                               out << files[outputNames[i]][k];
-                       }
-               }       
-               
-               outputNames.clear();
-               
-               out.close();
+        
+        for (map<string, map<string, vector<string> > >::iterator itFiles = files.begin(); itFiles != files.end(); itFiles++) {
+            
+            if (m->control_pressed) { break; }
+            
+            string extension = itFiles->first;
+            map<string, vector<string> > thisType = itFiles->second;
+            string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
+            newComboNames.push_back(combineFileName);
+            //open combined file
+            ofstream out;
+            m->openOutputFile(combineFileName, out);
+            
+            //output label line to new file
+            out << newLabel << endl;
+               
+            //for each label
+            for (int k = 0; k < numLines; k++) {
+               
+                //grab summary data for each group
+                for (map<string, vector<string> >::iterator itType = thisType.begin(); itType != thisType.end(); itType++) {
+                    out << (itType->second)[k];
+                }
+            }  
+               
+            outputNames.clear();
+               
+            out.close();
+        }
                 
                 //return combine file name
-               return combineFileName;
+               return newComboNames;
                 
         }
         catch(exception& e) {
diff --git a/summarycommand.h b/summarycommand.h

index 15b7f4066e9f17b263b30ac3d0b663d7cd414455..1d93a333a8c971458ced6ca8803cb0b2027a48b2 100644 (file)
--- a/summarycommand.h
+++ b/summarycommand.h
@@ -37,9 +37,9 @@ private:
         vector<Calculator*> sumCalculators;     
         InputData* input;
         SAbundVector* sabund;
-       int abund, size;
+       int abund, size, iters, subsampleSize;
  
-       bool abort, allLines, groupMode;
+       bool abort, allLines, groupMode, subsample;
         set<string> labels; //holds labels to be used
         string label, calc, outputDir, sharedfile, listfile, rabundfile, sabundfile, format, inputfile;
         vector<string>  Estimators;
@@ -47,7 +47,8 @@ private:
         vector<string> groups;
         
         vector<string> parseSharedFile(string);
-       string createGroupSummaryFile(int, int, vector<string>&);
+       vector<string> createGroupSummaryFile(int, int, vector<string>&, map<string, string>);
+    int process(SAbundVector*&, ofstream&, ofstream&, ofstream&);
  
  
  };
diff --git a/summarysharedcommand.cpp b/summarysharedcommand.cpp

index 77961f1c807387080b914af5059fea03d3844ff4..ed028045f9ff87713fc7d8a283805858eb2f4b01 100644 (file)
--- a/summarysharedcommand.cpp
+++ b/summarysharedcommand.cpp
@@ -365,11 +365,38 @@ int SummarySharedCommand::execute(){
                         return 0;
                 }
                 /******************************************************/
-               
+        if (subsample) { 
+            if (subsampleSize == -1) { //user has not set size, set size = smallest samples size
+                subsampleSize = lookup[0]->getNumSeqs();
+                for (int i = 1; i < lookup.size(); i++) {
+                    int thisSize = lookup[i]->getNumSeqs();
+                    
+                    if (thisSize < subsampleSize) {    subsampleSize = thisSize;       }
+                }
+            }else {
+                m->clearGroups();
+                Groups.clear();
+                vector<SharedRAbundVector*> temp;
+                for (int i = 0; i < lookup.size(); i++) {
+                    if (lookup[i]->getNumSeqs() < subsampleSize) { 
+                        m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine();
+                        delete lookup[i];
+                    }else { 
+                        Groups.push_back(lookup[i]->getGroup()); 
+                        temp.push_back(lookup[i]);
+                    }
+                } 
+                lookup = temp;
+                m->setGroups(Groups);
+            }
+            
+            if (lookup.size() < 2) { m->mothurOut("You have not provided enough valid groups.  I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; delete input; return 0; }
+        }
+
                 
                 /******************************************************/
                 //comparison breakup to be used by different processes later
-               numGroups = m->getNumGroups();
+               numGroups = lookup.size();
                 lines.resize(processors);
                 for (int i = 0; i < processors; i++) {
                         lines[i].start = int (sqrt(float(i)/float(processors)) * numGroups);
@@ -527,11 +554,11 @@ int SummarySharedCommand::process(vector<SharedRAbundVector*> thisLookup, string
          vector< vector< vector<seqDist> > > calcDistsTotals;  //each iter, one for each calc, then each groupCombos dists. this will be used to make .dist files
          vector< vector<seqDist>  > calcDists; calcDists.resize(sumCalculators.size());                 
          
-        for (int thisIter = 0; thisIter < iters; thisIter++) {
+        for (int thisIter = 0; thisIter < iters+1; thisIter++) {
              
              vector<SharedRAbundVector*> thisItersLookup = thisLookup;
              
-            if (subsample) {
+            if (subsample && (thisIter != 0)) { //we want the summary results for the whole dataset, then the subsampling
                  SubSample sample;
                  vector<string> tempLabels; //dont need since we arent printing the sampled sharedRabunds
                  
@@ -710,14 +737,44 @@ int SummarySharedCommand::process(vector<SharedRAbundVector*> thisLookup, string
                  
  #endif
              }
-            calcDistsTotals.push_back(calcDists);
              
-            if (subsample) {  
+            if (subsample && (thisIter != 0)) { //we want the summary results for the whole dataset, then the subsampling
                  
+                calcDistsTotals.push_back(calcDists); 
                  //clean up memory
                  for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; }
                  thisItersLookup.clear();
                  for (int i = 0; i < calcDists.size(); i++) {  calcDists[i].clear(); }
+            }else {
+                if (createPhylip) {
+                    for (int i = 0; i < calcDists.size(); i++) {
+                        if (m->control_pressed) { break; }
+                        
+                        //initialize matrix
+                        vector< vector<double> > matrix; //square matrix to represent the distance
+                        matrix.resize(thisLookup.size());
+                        for (int k = 0; k < thisLookup.size(); k++) {  matrix[k].resize(thisLookup.size(), 0.0); }
+                        
+                        for (int j = 0; j < calcDists[i].size(); j++) {
+                            int row = calcDists[i][j].seq1;
+                            int column = calcDists[i][j].seq2;
+                            double dist = calcDists[i][j].dist;
+                            
+                            matrix[row][column] = dist;
+                            matrix[column][row] = dist;
+                        }
+                        
+                        string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + "." + output + ".dist";
+                        outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
+                        ofstream outDist;
+                        m->openOutputFile(distFileName, outDist);
+                        outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
+                        
+                        printSims(outDist, matrix);
+                        
+                        outDist.close();
+                    }
+                }
              }
                 }
  
@@ -820,38 +877,8 @@ int SummarySharedCommand::process(vector<SharedRAbundVector*> thisLookup, string
                  outSTD.close();
                  
              }
-        }else {
-            if (createPhylip) {
-                for (int i = 0; i < calcDists.size(); i++) {
-                    if (m->control_pressed) { break; }
-                    
-                    //initialize matrix
-                    vector< vector<double> > matrix; //square matrix to represent the distance
-                    matrix.resize(thisLookup.size());
-                    for (int k = 0; k < thisLookup.size(); k++) {  matrix[k].resize(thisLookup.size(), 0.0); }
-                    
-                    for (int j = 0; j < calcDists[i].size(); j++) {
-                        int row = calcDists[i][j].seq1;
-                        int column = calcDists[i][j].seq2;
-                        double dist = calcDists[i][j].dist;
-                        
-                        matrix[row][column] = dist;
-                        matrix[column][row] = dist;
-                    }
-                    
-                    string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + "." + output + ".dist";
-                    outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
-                    ofstream outDist;
-                    m->openOutputFile(distFileName, outDist);
-                    outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
-                    
-                    printSims(outDist, matrix);
-                    
-                    outDist.close();
-                }
-            }
          }
-
+        
          return 0;
         }
         catch(exception& e) {
diff --git a/trimoligos.cpp b/trimoligos.cpp

index 8c523ce68d8a042bb702900d468def2aadad157b..2f92cc847ca75e1a0983aeef2d09e9eaab68e57f 100644 (file)
--- a/trimoligos.cpp
+++ b/trimoligos.cpp
@@ -12,6 +12,29 @@
  #include "needlemanoverlap.hpp"
  
  
+/********************************************************************/
+//strip, pdiffs, bdiffs, primers, barcodes, revPrimers
+TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, map<string, int> rbr, vector<string> r, vector<string> lk, vector<string> sp){
+       try {
+               m = MothurOut::getInstance();
+               
+               pdiffs = p;
+               bdiffs = b;
+        ldiffs = l;
+        sdiffs = s;
+               
+               barcodes = br;
+        rbarcodes = rbr;
+               primers = pr;
+               revPrimer = r;
+        linker = lk;
+        spacer = sp;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "TrimOligos");
+               exit(1);
+       }
+}
  /********************************************************************/
  //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
  TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, vector<string> r, vector<string> lk, vector<string> sp){
@@ -129,8 +152,7 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
                                 }
                                 oligo = oligo.substr(0,alnLength);
                                 temp = temp.substr(0,alnLength);
-                               
-                               int numDiff = countDiffs(oligo, temp);
+                int numDiff = countDiffs(oligo, temp);
                                 
                                 if(numDiff < minDiff){
                                         minDiff = numDiff;
@@ -237,7 +259,7 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){
                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
                                 oligo = alignment->getSeqAAln();
                                 string temp = alignment->getSeqBAln();
-                               
+                                
                                 int alnLength = oligo.length();
                                 
                                 for(int i=oligo.length()-1;i>=0;i--){
@@ -245,7 +267,7 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){
                                 }
                                 oligo = oligo.substr(0,alnLength);
                                 temp = temp.substr(0,alnLength);
-                               
+                                
                                 int numDiff = countDiffs(oligo, temp);
                                 
                                 if(numDiff < minDiff){
@@ -285,6 +307,239 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){
                 exit(1);
         }
         
+}
+//*******************************************************************/
+int TrimOligos::stripRBarcode(Sequence& seq, QualityScores& qual, int& group){
+       try {
+               
+               string rawSequence = seq.getUnaligned();
+               int success = bdiffs + 1;       //guilty until proven innocent
+               
+               //can you find the barcode
+               for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+                       string oligo = it->first;
+                       if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
+                               success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
+                               break;  
+                       }
+            
+                       if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){
+                               group = it->second;
+                               seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length())));
+                               
+                               if(qual.getName() != ""){
+                                       qual.trimQScores(-1, rawSequence.length()-oligo.length());
+                               }
+                               
+                               success = 0;
+                               break;
+                       }
+               }
+               
+               //if you found the barcode or if you don't want to allow for diffs
+               if ((bdiffs == 0) || (success == 0)) { return success;  }
+               
+               else { //try aligning and see if you can find it
+                       
+                       int maxLength = 0;
+                       
+                       Alignment* alignment;
+                       if (rbarcodes.size() > 0) {
+                               map<string,int>::iterator it; 
+                               
+                               for(it=rbarcodes.begin();it!=rbarcodes.end();it++){
+                                       if(it->first.length() > maxLength){
+                                               maxLength = it->first.length();
+                                       }
+                               }
+                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
+                               
+                       }else{ alignment = NULL; } 
+                       
+                       //can you find the barcode
+                       int minDiff = 1e6;
+                       int minCount = 1;
+                       int minGroup = -1;
+                       int minPos = 0;
+                       
+                       for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+                               string oligo = it->first;
+                               //                              int length = oligo.length();
+                               
+                               if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
+                                       success = bdiffs + 10;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs));
+                               oligo = alignment->getSeqAAln();
+                               string temp = alignment->getSeqBAln();
+     
+                               int alnLength = oligo.length();
+                               
+                               for(int i=0;i<alnLength;i++){
+                                       if(oligo[i] != '-'){    alnLength = i;  break;  }
+                               }
+                               oligo = oligo.substr(alnLength);
+                               temp = temp.substr(alnLength);
+                               
+                               int numDiff = countDiffs(oligo, temp);
+                               
+                               if(numDiff < minDiff){
+                                       minDiff = numDiff;
+                                       minCount = 1;
+                                       minGroup = it->second;
+                                       minPos = 0;
+                                       for(int i=alnLength-1;i>=0;i--){
+                                               if(temp[i] != '-'){
+                                                       minPos++;
+                                               }
+                                       }
+                               }
+                               else if(numDiff == minDiff){
+                                       minCount++;
+                               }
+                               
+                       }
+                       
+                       if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
+                       else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
+                       else{                                                                                                   //use the best match
+                               group = minGroup;
+                               seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos)));
+                
+                               if(qual.getName() != ""){
+                                       qual.trimQScores(-1, (rawSequence.length()-minPos));
+                               }
+                               success = minDiff;
+                       }
+                       
+                       if (alignment != NULL) {  delete alignment;  }
+                       
+               }
+               
+               return success;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "stripRBarcode");
+               exit(1);
+       }
+       
+}
+//*******************************************************************/
+int TrimOligos::stripRBarcode(Sequence& seq, int& group){
+       try {
+               
+               string rawSequence = seq.getUnaligned();
+               int success = bdiffs + 1;       //guilty until proven innocent
+               
+               //can you find the barcode
+               for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+                       string oligo = it->first;
+                       if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
+                               success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
+                               break;  
+                       }
+            
+                       if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){
+                               group = it->second;
+                               seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length())));
+                               
+                               success = 0;
+                               break;
+                       }
+               }
+               
+               //if you found the barcode or if you don't want to allow for diffs
+               if ((bdiffs == 0) || (success == 0)) { return success;  }
+               
+               else { //try aligning and see if you can find it
+                       
+                       int maxLength = 0;
+                       
+                       Alignment* alignment;
+                       if (rbarcodes.size() > 0) {
+                               map<string,int>::iterator it; 
+                               
+                               for(it=rbarcodes.begin();it!=rbarcodes.end();it++){
+                                       if(it->first.length() > maxLength){
+                                               maxLength = it->first.length();
+                                       }
+                               }
+                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
+                               
+                       }else{ alignment = NULL; } 
+                       
+                       //can you find the barcode
+                       int minDiff = 1e6;
+                       int minCount = 1;
+                       int minGroup = -1;
+                       int minPos = 0;
+                       
+                       for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+                               string oligo = it->first;
+                               //                              int length = oligo.length();
+                               
+                               if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
+                                       success = bdiffs + 10;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs));
+                               oligo = alignment->getSeqAAln();
+                               string temp = alignment->getSeqBAln();
+                
+                               int alnLength = oligo.length();
+                               
+                               for(int i=0;i<alnLength;i++){
+                                       if(oligo[i] != '-'){    alnLength = i;  break;  }
+                               }
+                               oligo = oligo.substr(alnLength);
+                               temp = temp.substr(alnLength);
+                               
+                               int numDiff = countDiffs(oligo, temp);
+                               
+                               if(numDiff < minDiff){
+                                       minDiff = numDiff;
+                                       minCount = 1;
+                                       minGroup = it->second;
+                                       minPos = 0;
+                                       for(int i=alnLength-1;i>=0;i--){
+                                               if(temp[i] != '-'){
+                                                       minPos++;
+                                               }
+                                       }
+                               }
+                               else if(numDiff == minDiff){
+                                       minCount++;
+                               }
+                               
+                       }
+                       
+                       if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
+                       else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
+                       else{                                                                                                   //use the best match
+                               group = minGroup;
+                               seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos)));
+                
+                               success = minDiff;
+                       }
+                       
+                       if (alignment != NULL) {  delete alignment;  }
+                       
+               }
+               
+               return success;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "stripRBarcode");
+               exit(1);
+       }
+       
  }
  //********************************************************************/
  int TrimOligos::stripForward(Sequence& seq, int& group){
diff --git a/trimoligos.h b/trimoligos.h

index e3ea7d55537e323f3869a6157ff3aed4bbd99eea..a32b3d8e4f2d388b15b3aa68ed66fa61f33c1681 100644 (file)
--- a/trimoligos.h
+++ b/trimoligos.h
@@ -20,11 +20,15 @@ class TrimOligos {
         
         public:
          TrimOligos(int,int, map<string, int>, map<string, int>, vector<string>); //pdiffs, bdiffs, primers, barcodes, revPrimers
-               TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linker, spacer
+        TrimOligos(int,int, int, int, map<string, int>, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer
+        TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer
                 ~TrimOligos();
         
                 int stripBarcode(Sequence&, int&);      
                 int stripBarcode(Sequence&, QualityScores&, int&);
+    
+        int stripRBarcode(Sequence&, int&);    
+        int stripRBarcode(Sequence&, QualityScores&, int&);
         
                 int stripForward(Sequence&, int&);
                 int stripForward(Sequence&, QualityScores&, int&, bool);
@@ -43,6 +47,7 @@ class TrimOligos {
                 int pdiffs, bdiffs, ldiffs, sdiffs;
         
                 map<string, int> barcodes;
+        map<string, int> rbarcodes;
                 map<string, int> primers;
                 vector<string> revPrimer;
          vector<string> linker;
diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp

index 9f8fafb24e14a807baa8dd5a981c93556c46e3b7..c019a70e4a2a7d35374192b3f8cca11787e7ecef 100644 (file)
--- a/trimseqscommand.cpp
+++ b/trimseqscommand.cpp
@@ -562,7 +562,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                 
                 int count = 0;
                 bool moreSeqs = 1;
-               TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
+               TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
         
                 while (moreSeqs) {
                                 
@@ -607,6 +607,12 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                         if(success > bdiffs)            {       trashCode += 'b';       }
                                         else{ currentSeqsDiffs += success;  }
                                 }
+                
+                if(rbarcodes.size() != 0){
+                                       success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
+                                       if(success > bdiffs)            {       trashCode += 'b';       }
+                                       else{ currentSeqsDiffs += success;  }
+                               }
                                 
                  if(numSpacers != 0){
                                         success = trimOligos.stripSpacer(currSeq, currQual);
@@ -946,7 +952,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                                                tempPrimerQualFileNames,
                                                tempNameFileNames,
                                                lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
-                                              pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, 
+                                              pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer, 
                                               primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
                                                qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
                                               minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
@@ -1187,7 +1193,6 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) {
                  int startIndex =  i * numSeqsPerProcessor;
                  if(i == (processors - 1)){     numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
                  lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
-                cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl;
                  if (qfilename != "") {  qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
              }
          }
@@ -1262,7 +1267,29 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                                 }
                                 else if(type == "BARCODE"){
                                         inOligos >> group;
+                    
+                    //barcode lines can look like   BARCODE   atgcatgc   groupName  - for 454 seqs
+                    //or                            BARCODE   atgcatgc   atgcatgc    groupName  - for illumina data that has forward and reverse info
+                                       string temp = "";
+                    while (!inOligos.eof())    {       
+                                               char c = inOligos.get(); 
+                                               if (c == 10 || c == 13){        break;  }
+                                               else if (c == 32 || c == 9){;} //space or tab
+                                               else {  temp += c;  }
+                                       } 
                                         
+                    //then this is illumina data with 4 columns
+                    if (temp != "") {  
+                        string reverseBarcode = reverseOligo(group); //reverse barcode
+                        group = temp;
+                        
+                        //check for repeat barcodes
+                        map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
+                        if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine();  }
+                                               
+                        rbarcodes[reverseBarcode]=indexBarcode; 
+                    }
+                        
                                         //check for repeat barcodes
                                         map<string, int>::iterator itBar = barcodes.find(oligo);
                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
diff --git a/trimseqscommand.h b/trimseqscommand.h

index 9ba64c4b62b7c7e87f213472b730a91bd0b37305..ba4e61411b8498820139e8ae2b1ed71cb37b0432 100644 (file)
--- a/trimseqscommand.h
+++ b/trimseqscommand.h
@@ -63,6 +63,7 @@ private:
         vector<string> revPrimer, outputNames;
         set<string> filesToRemove;
         map<string, int> barcodes;
+    map<string, int> rbarcodes;
         vector<string> groupVector;
         map<string, int> primers;
      vector<string>  linker;
@@ -101,6 +102,7 @@ struct trimData {
         double qRollAverage, qThreshold, qWindowAverage, qAverage;
      vector<string> revPrimer;
         map<string, int> barcodes;
+    map<string, int> rbarcodes;
         map<string, int> primers;
      vector<string>  linker;
      vector<string>  spacer;
@@ -112,7 +114,7 @@ struct trimData {
      
         trimData(){}
         trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend,  MothurOut* mout,
-                      int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa, 
+                      int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, map<string, int> rbar, vector<string> revP, vector<string> li, vector<string> spa, 
                        vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
                        int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
                        int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
@@ -141,6 +143,7 @@ struct trimData {
          sdiffs = sd;
          tdiffs = td;
          barcodes = bar;
+        rbarcodes = rbar;
          primers = pri;      numFPrimers = primers.size();
          revPrimer = revP;   numRPrimers = revPrimer.size();
          linker = li;        numLinkers = linker.size();
@@ -237,7 +240,7 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                 }
                 
                 
-               TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
+               TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->rbarcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
          
                 pDataArray->count = pDataArray->lineEnd;
                 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
@@ -277,7 +280,13 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                                         if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
                                         else{ currentSeqsDiffs += success;  }
                                 }
-                               
+                
+                               if(pDataArray->rbarcodes.size() != 0){
+                                       success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
+                                       if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
+                                       else{ currentSeqsDiffs += success;  }
+                               }
+                
                  if(pDataArray->numSpacers != 0){
                                         success = trimOligos.stripSpacer(currSeq, currQual);
                                         if(success > pDataArray->sdiffs)                {       trashCode += 's';       }
author	Pat Schloss <pschloss@umich.edu>
	Thu, 7 Jun 2012 00:06:41 +0000 (20:06 -0400)
committer	Pat Schloss <pschloss@umich.edu>
	Thu, 7 Jun 2012 00:06:41 +0000 (20:06 -0400)
classifyseqscommand.cpp		patch \| blob \| history
matrixoutputcommand.cpp		patch \| blob \| history
rarefactcommand.cpp		patch \| blob \| history
rarefactsharedcommand.cpp		patch \| blob \| history
rarefactsharedcommand.h		patch \| blob \| history
shhhercommand.cpp		patch \| blob \| history
subsample.cpp		patch \| blob \| history
subsample.h		patch \| blob \| history
summarycommand.cpp		patch \| blob \| history
summarycommand.h		patch \| blob \| history
summarysharedcommand.cpp		patch \| blob \| history
trimoligos.cpp		patch \| blob \| history
trimoligos.h		patch \| blob \| history
trimseqscommand.cpp		patch \| blob \| history
trimseqscommand.h		patch \| blob \| history