added dereplicate parameter to chimera.slayer and chimera.persues. added minnumsample...

author Sarah Westcott <mothur.westcott@gmail.com>

Tue, 8 Jan 2013 17:19:41 +0000 (12:19 -0500)

committer Sarah Westcott <mothur.westcott@gmail.com>

Tue, 8 Jan 2013 17:19:41 +0000 (12:19 -0500)
author Sarah Westcott <mothur.westcott@gmail.com>
Tue, 8 Jan 2013 17:19:41 +0000 (12:19 -0500)
committer Sarah Westcott <mothur.westcott@gmail.com>
Tue, 8 Jan 2013 17:19:41 +0000 (12:19 -0500)
diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp

index 1e5f9d41b91d52d1f8b791ad82be5cffd7e28b6c..b4e478caea6bab2c3cf3a7d13a756f95a29f2223 100644 (file)
--- a/chimeraperseuscommand.cpp
+++ b/chimeraperseuscommand.cpp
@@ -20,6 +20,8 @@ vector<string> ChimeraPerseusCommand::setParameters(){
          CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "NameCount", "none","",false,false,true); parameters.push_back(pcount);
                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+        CommandParameter pdups("dereplicate", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pdups);
+
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
                 CommandParameter pcutoff("cutoff", "Number", "", "0.5", "", "", "","",false,false); parameters.push_back(pcutoff);
@@ -40,13 +42,14 @@ string ChimeraPerseusCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The chimera.perseus command reads a fastafile and namefile or countfile and outputs potentially chimeric sequences.\n";
-               helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n";
+               helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, dereplicate, alpha and beta.\n";
                 helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
                 helpString += "The name parameter allows you to provide a name file associated with your fasta file.\n";
          helpString += "The count parameter allows you to provide a count file associated with your fasta file. A count or name file is required. \n";
                 helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
                 helpString += "The group parameter allows you to provide a group file.  When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
                 helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
+        helpString += "If the dereplicate parameter is false, then if one group finds the seqeunce to be chimeric, then all groups find it to be chimeric, default=f.\n";
                 helpString += "The alpha parameter ....  The default is -5.54. \n";
                 helpString += "The beta parameter ....  The default is 0.33. \n";
                 helpString += "The cutoff parameter ....  The default is 0.50. \n";
@@ -461,6 +464,13 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option)  {
                         
                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.33";  }
                         m->mothurConvert(temp, beta);
+            
+                       temp = validParameter.validFile(parameters, "dereplicate", false);      
+                       if (temp == "not found") { 
+                               if (groupfile != "")    {  temp = "false";                                      }
+                               else                    {  temp = "true";       }
+                       }
+                       dups = m->isTrue(temp);
                 }
         }
         catch(exception& e) {
@@ -524,7 +534,9 @@ int ChimeraPerseusCommand::execute(){
                      
                      if (m->control_pressed) {  delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
                      map<string, string> uniqueNames = cparser->getAllSeqsMap();
-                    numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+                    if (!dups) { 
+                        numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+                    }
                      delete cparser;
  
                      m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
@@ -560,7 +572,9 @@ int ChimeraPerseusCommand::execute(){
                      
                      if (m->control_pressed) {  delete parser; for (int j = 0; j < outputNames.size(); j++) {   m->mothurRemove(outputNames[j]);        }  return 0;    }                               
                      map<string, string> uniqueNames = parser->getAllSeqsMap();
-                    numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+                    if (!dups) { 
+                        numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+                    }
                      delete parser;
                      
                      m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
diff --git a/chimeraperseuscommand.h b/chimeraperseuscommand.h

index fdece95e5c90b528c7fcc3d476a53f28d4ea7b3b..b3d6ccf56858deb00435ace345826450f7d4cd9e 100644 (file)
--- a/chimeraperseuscommand.h
+++ b/chimeraperseuscommand.h
@@ -46,7 +46,7 @@ private:
                 linePair(int i, int j) : start(i), end(j) {}
         };
         
-       bool abort, hasName, hasCount;
+       bool abort, hasName, hasCount, dups;
         string fastafile, groupfile, countfile, outputDir, namefile;
         int processors, alignLength;
         double cutoff, alpha, beta;
diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp

index a29fc82727eb0d68d8d8a071d2e969dfab8d2103..576fee94250f780685bbc57cd20bf0478241f74e 100644 (file)
--- a/chimeraslayercommand.cpp
+++ b/chimeraslayercommand.cpp
@@ -31,12 +31,14 @@ vector<string> ChimeraSlayerCommand::setParameters(){
                 CommandParameter pminbs("minbs", "Number", "", "90", "", "", "","",false,false); parameters.push_back(pminbs);
                 CommandParameter psearch("search", "Multiple", "kmer-blast", "blast", "", "", "","",false,false); parameters.push_back(psearch);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+        
                 CommandParameter prealign("realign", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(prealign);
                 CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "","fasta",false,false); parameters.push_back(ptrim);
                 CommandParameter psplit("split", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psplit);
                 CommandParameter pnumwanted("numwanted", "Number", "", "15", "", "", "","",false,false); parameters.push_back(pnumwanted);
                 CommandParameter piters("iters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(piters);
                 CommandParameter pdivergence("divergence", "Number", "", "1.007", "", "", "","",false,false); parameters.push_back(pdivergence);
+        CommandParameter pdups("dereplicate", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pdups);
                 CommandParameter pparents("parents", "Number", "", "3", "", "", "","",false,false); parameters.push_back(pparents);
                 CommandParameter pincrement("increment", "Number", "", "5", "", "", "","",false,false); parameters.push_back(pincrement);
                 CommandParameter pblastlocation("blastlocation", "String", "", "", "", "", "","",false,false); parameters.push_back(pblastlocation);
@@ -59,7 +61,7 @@ string ChimeraSlayerCommand::getHelpString(){
                 string helpString = "";
                 helpString += "The chimera.slayer command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n";
                 helpString += "This command was modeled after the chimeraSlayer written by the Broad Institute.\n";
-               helpString += "The chimera.slayer command parameters are fasta, name, group, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, blastlocation and realign.\n";
+               helpString += "The chimera.slayer command parameters are fasta, name, group, template, processors, dereplicate, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, blastlocation and realign.\n";
                 helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
                 helpString += "The name parameter allows you to provide a name file, if you are using reference=self. \n";
                 helpString += "The group parameter allows you to provide a group file. The group file can be used with a namesfile and reference=self. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
@@ -70,6 +72,7 @@ string ChimeraSlayerCommand::getHelpString(){
  #ifdef USE_MPI
                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
  #endif
+        helpString += "If the dereplicate parameter is false, then if one group finds the seqeunce to be chimeric, then all groups find it to be chimeric, default=f.\n";
                 helpString += "The trim parameter allows you to output a new fasta file containing your sequences with the chimeric ones trimmed to include only their longest piece, default=F. \n";
                 helpString += "The split parameter allows you to check both pieces of non-chimeric sequence for chimeras, thus looking for trimeras and quadmeras. default=F. \n";
                 helpString += "The window parameter allows you to specify the window size for searching for chimeras, default=50. \n";
@@ -595,6 +598,13 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option)  {
                         
                         temp = validParameter.validFile(parameters, "numwanted", false);                if (temp == "not found") { temp = "15"; }               
                         m->mothurConvert(temp, numwanted);
+            
+                       temp = validParameter.validFile(parameters, "dereplicate", false);      
+                       if (temp == "not found") { 
+                               if (groupfile != "")    {  temp = "false";                                      }
+                               else                    {  temp = "true";       }
+                       }
+                       dups = m->isTrue(temp);
                         
                         blastlocation = validParameter.validFile(parameters, "blastlocation", false);   
                         if (blastlocation == "not found") { blastlocation = ""; }
@@ -749,8 +759,10 @@ int ChimeraSlayerCommand::execute(){
                                 
                                 if (pid == 0) {
  #endif
-                               totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, trimFastaFileName);
-                m->mothurOutEndLine(); m->mothurOut(toString(totalChimeras) + " chimera found."); m->mothurOutEndLine();
+                    if (!dups) {
+                        totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, trimFastaFileName);
+                        m->mothurOutEndLine(); m->mothurOut(toString(totalChimeras) + " chimera found."); m->mothurOutEndLine();
+                    }
  #ifdef USE_MPI 
                                 }
                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait
diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h

index 6149ef5b7ded364733fa4480dc9386d2e7e92fb2..0c6ebe2de76c957f2a3f9c072b9127efb002e5fb 100644 (file)
--- a/chimeraslayercommand.h
+++ b/chimeraslayercommand.h
@@ -70,7 +70,7 @@ private:
         int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&, string, map<string, int>&, bool);
         #endif
  
-       bool abort, realign, trim, trimera, save, hasName, hasCount;
+       bool abort, realign, trim, trimera, save, hasName, hasCount, dups;
         string fastafile, groupfile, templatefile, outputDir, search, namefile, countfile, blastlocation;
         int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength;
         float divR;
diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp

index 04e1f1ddf244057e462d51c39c2913900d6424c3..ce7ce456d87c5c8c6e5a137cd661b02710e0977c 100644 (file)
--- a/chimerauchimecommand.cpp
+++ b/chimerauchimecommand.cpp
@@ -556,11 +556,11 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option)  {
                         temp = validParameter.validFile(parameters, "skipgaps2", false);                                if (temp == "not found") { temp = "t"; }
                         skipgaps2 = m->isTrue(temp); 
              
-            string usedDups = "false";
+            
                         temp = validParameter.validFile(parameters, "dereplicate", false);      
                         if (temp == "not found") { 
                                 if (groupfile != "")    {  temp = "false";                                      }
-                               else                    {  temp = "true"; usedDups = "";        }
+                               else                    {  temp = "true";       }
                         }
                         dups = m->isTrue(temp);
  
diff --git a/clustercommand.cpp b/clustercommand.cpp

index 93e29fd6e4daad8f09c6d373ba3b57e00ea55b52..e4e20622cae08b3578e8e8871616c1cd874da0aa 100644 (file)
--- a/clustercommand.cpp
+++ b/clustercommand.cpp
@@ -336,10 +336,10 @@ int ClusterCommand::execute(){
                 
          map<string, string> variables; 
          variables["[filename]"] = fileroot;
-        if (countfile != "") { variables["[tag2]"] = "unique_list"; }
          variables["[clustertag]"] = tag;
          string sabundFileName = getOutputFileName("sabund", variables);
          string rabundFileName = getOutputFileName("rabund", variables);
+        if (countfile != "") { variables["[tag2]"] = "unique_list"; }
          string listFileName = getOutputFileName("list", variables);
          
          if (countfile == "") {
diff --git a/clusterdoturcommand.cpp b/clusterdoturcommand.cpp

index 4463896cefd13f6bafcc625846fd94213b610ba7..91eb923ae38d8b9a52ebb8a00a4ea4fb0cf951a7 100644 (file)
--- a/clusterdoturcommand.cpp
+++ b/clusterdoturcommand.cpp
@@ -244,10 +244,10 @@ int ClusterDoturCommand::execute(){
                         
          map<string, string> variables; 
          variables["[filename]"] = fileroot;
-        if (countfile != "") { variables["[tag2]"] = "unique_list"; }
          variables["[clustertag]"] = tag;
          string sabundFileName = getOutputFileName("sabund", variables);
          string rabundFileName = getOutputFileName("rabund", variables);
+        if (countfile != "") { variables["[tag2]"] = "unique_list"; }
          string listFileName = getOutputFileName("list", variables);
          
          if (countfile == "") {
diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp

index ac5e75536fa44ff36341795b3cbecf7226061c6f..6d3c86ff1fa2efb62d46618f372706fd117c002a 100644 (file)
--- a/clustersplitcommand.cpp
+++ b/clustersplitcommand.cpp
@@ -818,10 +818,10 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
                 
          map<string, string> variables; 
          variables["[filename]"] = fileroot;
-        if (countfile != "") { variables["[tag2]"] = "unique_list"; }
          variables["[clustertag]"] = tag;
          string sabundFileName = getOutputFileName("sabund", variables);
          string rabundFileName = getOutputFileName("rabund", variables);
+        if (countfile != "") { variables["[tag2]"] = "unique_list"; }
          string listFileName = getOutputFileName("list", variables);
          
          if (countfile == "") {
diff --git a/filtersharedcommand.cpp b/filtersharedcommand.cpp

index 2899121917875a7aea63e0131b95c4aa19a13333..413a9a432532b12987a8fa8eb65948692cec9b22 100644 (file)
--- a/filtersharedcommand.cpp
+++ b/filtersharedcommand.cpp
@@ -17,6 +17,8 @@ vector<string> FilterSharedCommand::setParameters(){
                 CommandParameter pminpercent("minpercent", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pminpercent);
          CommandParameter pminabund("minabund", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pminabund);
          CommandParameter pmintotal("mintotal", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pmintotal);
+        CommandParameter pminnumsamples("minnumsamples", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pminnumsamples);
+        CommandParameter pminpercentsamples("minpercentsamples", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pminpercentsamples);
                 CommandParameter pmakerare("makerare", "Boolean", "", "T", "", "", "","",false,false,true); parameters.push_back(pmakerare);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
@@ -35,12 +37,14 @@ string FilterSharedCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The filter.shared command is used to remove OTUs based on various critieria.\n";
-               helpString += "The filter.shared command parameters are shared, minpercent, minabund, mintotal, makerare, groups and label.  You must provide a shared file.\n";
+               helpString += "The filter.shared command parameters are shared, minpercent, minabund, mintotal, minnumsamples, minpercentsamples, makerare, groups and label.  You must provide a shared file.\n";
                 helpString += "The groups parameter allows you to specify which of the groups you would like included. The group names are separated by dashes.\n";
                 helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n";
          
                 helpString += "The minabund parameter allows you indicate the minimum abundance required for each sample in a given OTU.  If any samples abundance falls below the minimum, the OTU is removed. Default=0\n";
          helpString += "The minpercent parameter allows you indicate the minimum relative abundance of an OTU. For example, if the OTUs total abundance across all samples is 8, and the total abundance across all OTUs is 1000, and minpercent=1. The OTU's relative abundance is 0.008, the minimum is 0.01, so the OTU will be removed. Default=0.\n";
+        helpString += "The minnumsamples parameter allows you indicate the minimum number of samples present in an OTU. If the number of samples present falls below the minimum, the OTU is removed. Default=0.\n";
+        helpString += "The minpercentsamples parameter allows you indicate the minimum percent of sample present in an OTU. For example, if the total number of samples is 10, the number present is 3, and the minpercentsamples=50. The OTU's precent of samples is 0.333, the minimum is 0.50, so the OTU will be removed. Default=0.\n";
          helpString += "The mintotal parameter allows you indicate the minimum abundance required for a given OTU.  If abundance across all samples falls below the minimum, the OTU is removed. Default=0.\n";
          
                 helpString += "The makerare parameter allows you indicate you want the abundances of any removed OTUs to be saved and a new \"rare\" OTU created with its abundances equal to the sum of the OTUs removed.  This will preserve the number of reads in your dataset. Default=T\n";
@@ -166,6 +170,11 @@ FilterSharedCommand::FilterSharedCommand(string option) {
              else { setSomething = true; }
                         m->mothurConvert(temp, minTotal);
              
+            temp = validParameter.validFile(parameters, "minnumsamples", false);               
+            if (temp == "not found"){  temp = "-1";            }
+            else { setSomething = true; }
+                       m->mothurConvert(temp, minSamples);
+            
              temp = validParameter.validFile(parameters, "minpercent", false);          
              if (temp == "not found"){  temp = "-1";            }
              else { setSomething = true; }
@@ -173,6 +182,14 @@ FilterSharedCommand::FilterSharedCommand(string option) {
                         m->mothurConvert(temp, minPercent);
              if (minPercent < 1) {} //already in percent form
              else {  minPercent = minPercent / 100.0; } //user gave us a whole number version so convert to %
+            
+            temp = validParameter.validFile(parameters, "minpercentsamples", false);           
+            if (temp == "not found"){  temp = "-1";            }
+            else { setSomething = true; }
+            
+                       m->mothurConvert(temp, minPercentSamples);
+            if (minPercentSamples < 1) {} //already in percent form
+            else {  minPercentSamples = minPercentSamples / 100.0; } //user gave us a whole number version so convert to %
                         
                         temp = validParameter.validFile(parameters, "makerare", false);         if (temp == "not found"){       temp = "T";             }
                         makeRare = m->isTrue(temp);
@@ -347,6 +364,25 @@ int FilterSharedCommand::processShared(vector<SharedRAbundVector*>& thislookup)
                  if (percent < minPercent) { okay = false; }
              }
              
+            
+            if (okay && (minSamples != -1)) {
+                int samples = 0;
+                for (int j = 0; j < thislookup.size(); j++) { 
+                    if (thislookup[j]->getAbundance(i) != 0) { samples++; }
+                }
+                if (samples < minSamples) { okay = false; }
+            }
+            
+            if (okay && (minPercentSamples != -0.01)) {
+                double samples = 0;
+                double total = thislookup.size();
+                for (int j = 0; j < thislookup.size(); j++) { 
+                    if (thislookup[j]->getAbundance(i) != 0) { samples++; }
+                }
+                double percent = samples / total; 
+                if (percent < minPercentSamples) { okay = false; }
+            }
+            
              //did this OTU pass the filter criteria
              if (okay) {
                  filteredLabels.push_back(saveBinLabels[i]);
diff --git a/filtersharedcommand.h b/filtersharedcommand.h

index f97cbf94b374818da1089b9aa4fc596f1603b6d6..a97f96e05d4e4c8e7b67ee674a4001c70017ae05 100644 (file)
--- a/filtersharedcommand.h
+++ b/filtersharedcommand.h
@@ -38,8 +38,8 @@ private:
         set<string> labels; //holds labels to be used
         string groups, label, outputDir, sharedfile;
         vector<string> Groups, outputNames;
-       int minAbund, minTotal;
-    float minPercent;
+       int minAbund, minTotal, minSamples;
+    float minPercent, minPercentSamples;
      
      int processShared(vector<SharedRAbundVector*>&);
         
diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp

index 0a2f91f4d08241547b2e4653a3d09841486dc42c..24c4f27777d21ed88b6f3b41411394502802404e 100644 (file)
--- a/mgclustercommand.cpp
+++ b/mgclustercommand.cpp
@@ -271,10 +271,10 @@ int MGClusterCommand::execute(){
                 
          map<string, string> variables; 
          variables["[filename]"] = fileroot;
-        if (countfile != "") { variables["[tag2]"] = "unique_list"; }
          variables["[clustertag]"] = tag;
          string sabundFileName = getOutputFileName("sabund", variables);
          string rabundFileName = getOutputFileName("rabund", variables);
+        if (countfile != "") { variables["[tag2]"] = "unique_list"; }
          string listFileName = getOutputFileName("list", variables);
          
          if (countfile == "") {
diff --git a/parselistscommand.cpp b/parselistscommand.cpp

index 61cb2454b52ee9e05cf1027995906371b5463762..3ed65c67f700b98d3c99a12a9dc6018ff4cfdd76 100644 (file)
--- a/parselistscommand.cpp
+++ b/parselistscommand.cpp
@@ -13,7 +13,8 @@
  vector<string> ParseListCommand::setParameters(){      
         try {
                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","list",false,true,true); parameters.push_back(plist);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pgroup);
+        CommandParameter pcount("count", "InputTypes", "", "", "CountGroup", "CountGroup", "none","",false,false,true); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "CountGroup", "none","",false,false,true); parameters.push_back(pgroup);
                 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
@@ -31,9 +32,11 @@ vector<string> ParseListCommand::setParameters(){
  string ParseListCommand::getHelpString(){      
         try {
                 string helpString = "";
-               helpString += "The parse.list command reads a list and group file and generates a list file for each group in the groupfile. \n";
-               helpString += "The parse.list command parameters are list, group and label.\n";
-               helpString += "The list and group parameters are required.\n";
+               helpString += "The parse.list command reads a list and group or count file and generates a list file for each group in the group or count file. \n";
+               helpString += "The parse.list command parameters are list, group, count and label.\n";
+               helpString += "The list and group or count parameters are required.\n";
+        helpString += "If a count file is provided, mothur assumes the list file contains only unique names.\n";
+        helpString += "If a group file is provided, mothur assumes the list file contains all names.\n";
                 helpString += "The label parameter is used to read specific labels in your input you want to use.\n";
                 helpString += "The parse.list command should be used in the following format: parse.list(list=yourListFile, group=yourGroupFile, label=yourLabels).\n";
                 helpString += "Example: parse.list(list=abrecovery.fn.list, group=abrecovery.groups, label=0.03).\n";
@@ -121,11 +124,18 @@ ParseListCommand::ParseListCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         
-                       //if the user changes the output directory command factory will send this info to us in the output parameter 
-                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
+                       
  
                         //check for required parameters
                         listfile = validParameter.validFile(parameters, "list", true);
@@ -139,28 +149,39 @@ ParseListCommand::ParseListCommand(string option)  {
                                                 
                                 }
                         }else { m->setListFile(listfile); }     
+            
+            //if the user changes the output directory command factory will send this info to us in the output parameter 
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(listfile);       }
                         
-                       groupfile = validParameter.validFile(parameters, "group", true);
-                       if (groupfile == "not open") { abort = true; }  
-                       else if (groupfile == "not found") { 
-                               groupfile = m->getListFile(); 
-                               if (groupfile != "") {  
-                                       m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); 
-                                       groupMap = new GroupMap(groupfile);
-                                       
-                                       int error = groupMap->readMap();
-                                       if (error == 1) { abort = true; }
-                               }else { m->mothurOut("No valid current group file. You must provide a group file."); m->mothurOutEndLine();  abort = true; } 
-                       }else {  
-                               m->setGroupFile(groupfile);
+            groupfile = validParameter.validFile(parameters, "group", true);
+                       if (groupfile == "not found") { groupfile =  "";   groupMap = NULL; }
+                       else if (groupfile == "not open") { abort = true; groupfile =  ""; groupMap = NULL; }   
+                       else {   
+                m->setGroupFile(groupfile);
                                 groupMap = new GroupMap(groupfile);
                                 
                                 int error = groupMap->readMap();
                                 if (error == 1) { abort = true; }
-                       }
-                       
-                       //do you have all files needed
-                       if ((listfile == "") || (groupfile == "")) { m->mothurOut("You must enter both a listfile and groupfile for the parse.list command. "); m->mothurOutEndLine(); abort = true;  }
+            }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not found") { countfile =  "";   }
+                       else if (countfile == "not open") { abort = true; countfile =  ""; }    
+                       else {   
+                m->setCountTableFile(countfile); 
+                ct.readTable(countfile);
+                if (!ct.hasGroupInfo()) { 
+                    abort = true;
+                    m->mothurOut("[ERROR]: The parse.list command requires group info to be present in your countfile, quitting."); m->mothurOutEndLine();
+                }
+                    
+            }
+            
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }else if ((groupfile == "") && (countfile == "")) {
+                m->mothurOut("[ERROR]: you must provide one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
                         
                         //check for optional parameter and set defaults
                         // ...at some point should added some additional type checking...
@@ -191,7 +212,10 @@ int ParseListCommand::execute(){
                 //fill filehandles with neccessary ofstreams
                 int i;
                 ofstream* temp;
-               vector<string> gGroups = groupMap->getNamesOfGroups();
+               vector<string> gGroups;
+        if (groupfile != "") { gGroups = groupMap->getNamesOfGroups(); }
+        else { gGroups = ct.getNamesOfGroups(); }
+        
                 for (i=0; i<gGroups.size(); i++) {
                         temp = new ofstream;
                         filehandles[gGroups[i]] = temp;
@@ -206,13 +230,12 @@ int ParseListCommand::execute(){
                 set<string> processedLabels;
                 set<string> userLabels = labels;        
         
-               input = new InputData(listfile, "list");
-               list = input->getListVector();
+               InputData input(listfile, "list");
+               list = input.getListVector();
                 string lastLabel = list->getLabel();
                 
                 if (m->control_pressed) { 
-                       delete input; delete list; delete groupMap;
-                       vector<string> gGroups = groupMap->getNamesOfGroups();
+                       delete list; if (groupfile != "") { delete groupMap; }
                         for (i=0; i<gGroups.size(); i++) {  (*(filehandles[gGroups[i]])).close();  delete filehandles[gGroups[i]]; } 
                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
                         return 0;
@@ -221,7 +244,7 @@ int ParseListCommand::execute(){
                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
                 
                         if (m->control_pressed) { 
-                               delete input; delete list; delete groupMap;
+                               delete list; if (groupfile != "") { delete groupMap; }
                                 for (i=0; i<gGroups.size(); i++) {  (*(filehandles[gGroups[i]])).close();  delete filehandles[gGroups[i]]; } 
                                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
                                 return 0;
@@ -239,8 +262,7 @@ int ParseListCommand::execute(){
                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
                                         string saveLabel = list->getLabel();
                                         
-                                       delete list;
-                                       list = input->getListVector(lastLabel); //get new list vector to process
+                                       list = input.getListVector(lastLabel); //get new list vector to process
                                         
                                         m->mothurOut(list->getLabel()); m->mothurOutEndLine();
                                         parse(list);
@@ -256,11 +278,11 @@ int ParseListCommand::execute(){
                         lastLabel = list->getLabel();
                                 
                         delete list;
-                       list = input->getListVector(); //get new list vector to process
+                       list = input.getListVector(); //get new list vector to process
                 }
                 
                 if (m->control_pressed) { 
-                       delete input; delete groupMap;
+                       if (groupfile != "") { delete groupMap; }
                         for (i=0; i<gGroups.size(); i++) {  (*(filehandles[gGroups[i]])).close();  delete filehandles[gGroups[i]]; } 
                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
                         return 0;
@@ -281,7 +303,7 @@ int ParseListCommand::execute(){
                 }
                 
                 if (m->control_pressed) { 
-                       delete input; delete groupMap;
+                       if (groupfile != "") { delete groupMap; }
                         for (i=0; i<gGroups.size(); i++) {  (*(filehandles[gGroups[i]])).close();  delete filehandles[gGroups[i]]; } 
                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
                         return 0;
@@ -290,7 +312,7 @@ int ParseListCommand::execute(){
                 //run last label if you need to
                 if (needToRun == true)  {
                         if (list != NULL) {     delete list;    }
-                       list = input->getListVector(lastLabel); //get new list vector to process
+                       list = input.getListVector(lastLabel); //get new list vector to process
                         
                         m->mothurOut(list->getLabel()); m->mothurOutEndLine();
                         parse(list);            
@@ -303,9 +325,7 @@ int ParseListCommand::execute(){
                         delete it3->second;
                 }
                 
-               
-               delete groupMap;
-               delete input;
+               if (groupfile != "") { delete groupMap; }
                 
                 if (m->control_pressed) { 
                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
@@ -357,17 +377,33 @@ int ParseListCommand::parse(ListVector* thisList) {
                         
                         //parse bin into list of sequences in each group
                         for (int j = 0; j < names.size(); j++) {
-                               string group = groupMap->getGroup(names[j]);
+                if (groupfile != "") {
+                    string group = groupMap->getGroup(names[j]);
                                 
-                               if (group == "not found") { m->mothurOut(names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); exit(1); }
+                    if (group == "not found") { m->mothurOut(names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); exit(1); }
                                 
-                               itGroup = groupBins.find(group);
-                               if(itGroup == groupBins.end()) {
-                                       groupBins[group] = names[j];  //add first name
-                                       groupNumBins[group]++;
-                               }else{ //add another name
-                                       groupBins[group] = groupBins[group] + "," + names[j];
-                               }
+                    itGroup = groupBins.find(group);
+                    if(itGroup == groupBins.end()) {
+                        groupBins[group] = names[j];  //add first name
+                        groupNumBins[group]++;
+                    }else{ //add another name
+                        groupBins[group] = groupBins[group] + "," + names[j];
+                    }
+                }else{
+                    vector<string> thisSeqsGroups = ct.getGroups(names[j]);
+                    
+                    for (int k = 0; k < thisSeqsGroups.size(); k++) {
+                        string group = thisSeqsGroups[k];
+                        itGroup = groupBins.find(group);
+                        if(itGroup == groupBins.end()) {
+                            groupBins[group] = names[j];  //add first name
+                            groupNumBins[group]++;
+                        }else{ //add another name
+                            groupBins[group] = groupBins[group] + "," + names[j];
+                        }
+
+                    }
+                }
                         }
                         
                         //print parsed bin info to files
diff --git a/parselistscommand.h b/parselistscommand.h

index b366d28a6a84a7558cda6547497130ef0c09cbdc..84abc85d46e0bc3fddd916a0095c4b45aa44dbd1 100644 (file)
--- a/parselistscommand.h
+++ b/parselistscommand.h
@@ -40,10 +40,10 @@ private:
                 
         ListVector* list;
         GroupMap* groupMap;
-       InputData* input;
+    CountTable ct;
         
         ofstream out;
-       string outputDir, listfile, groupfile, label;
+       string outputDir, listfile, groupfile, label, countfile;
         set<string> labels;
         bool abort, allLines;
         vector<string> outputNames;
diff --git a/phylotree.cpp b/phylotree.cpp

index 8a7c712b0f255db20da492fd95bc15d3116ce151..b9bab4ec2934a4121305f87111cf2d238f5ec4d5 100644 (file)
--- a/phylotree.cpp
+++ b/phylotree.cpp
@@ -259,6 +259,8 @@ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
                         //somehow the parent is getting one too many accnos
                         //use print to reassign the taxa id
                         taxon = getNextTaxon(seqTaxonomy, seqName);
+            
+            if (m->debug) { m->mothurOut(seqName +'\t' + taxon +'\n'); }
                         
                         if (taxon == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) {  uniqueTaxonomies.insert(currentNode); } break;  }
                         
@@ -341,6 +343,9 @@ void PhyloTree::assignHeirarchyIDs(int index){
                 int counter = 1;
                 
                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
+            
+            if (m->debug) { m->mothurOut(toString(index) +'\t' + tree[it->second].name +'\n'); }
+                
                         tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
                         counter++;
                         tree[it->second].level = tree[index].level + 1;
@@ -399,6 +404,8 @@ void PhyloTree::binUnclassified(string file){
                         }
                 }
                 
+        if (m->debug) { m->mothurOut("maxLevel = " + toString(maxLevel) +'\n'); }
+        
                 int copyNodes = copy.size();
         
                 //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
@@ -409,11 +416,14 @@ void PhyloTree::binUnclassified(string file){
                         
                         int level = copy[itLeaf->second].level;
                         int currentNode = itLeaf->second;
+            
+            if (m->debug) { m->mothurOut(copy[currentNode].name +'\n'); }
                         
                         //this sequence is unclassified at some levels
                         while(level < maxLevel){
                 
                                 level++;
+                if (m->debug) { m->mothurOut("level = " + toString(level) +'\n'); }
                         
                                 string taxon = "unclassified";  
                                 
diff --git a/sensspeccommand.cpp b/sensspeccommand.cpp

index dfb89b927c2de8dd34dcda801fae80d93281cc63..f61232a26c133fb9d0cebfa9cdcca31fdfe2f0dc 100644 (file)
--- a/sensspeccommand.cpp
+++ b/sensspeccommand.cpp
@@ -211,14 +211,25 @@ SensSpecCommand::SensSpecCommand(string option)  {
  int SensSpecCommand::execute(){
         try{
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
-
+        
+        int startTime = time(NULL);
+        
+        //create list file with only unique names, saves time and memory by removing redundant names from list file that are not in the distance file.
+        string newListFile = preProcessList();
+        if (newListFile != "") { listFile = newListFile; }
+        
                 setUpOutput();
                 outputNames.push_back(sensSpecFileName); outputTypes["sensspec"].push_back(sensSpecFileName);
                 if(format == "phylip")          {       processPhylip();        }
                 else if(format == "column")     {       processColumn();        }
                 
+        //remove temp file if created
+        if (newListFile != "") { m->mothurRemove(newListFile); }
+        
                 if (m->control_pressed) { m->mothurRemove(sensSpecFileName); return 0; }
-               
+        
+        m->mothurOut("It took " + toString(time(NULL) - startTime) + " to run sens.spec."); m->mothurOutEndLine();
+        
                 m->mothurOutEndLine();
                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
                 m->mothurOut(sensSpecFileName); m->mothurOutEndLine();  
@@ -684,6 +695,122 @@ void SensSpecCommand::outputStatistics(string label, string cutoff){
                 exit(1);
         }
  }
+//***************************************************************************************************************
+
+string SensSpecCommand::preProcessList(){
+    try {
+        set<string> uniqueNames;
+        //get unique names from distance file
+        if (format == "phylip") {
+            
+            ifstream phylipFile;
+            m->openInputFile(distFile, phylipFile);
+            string numTest;
+            int pNumSeqs;
+                       phylipFile >> numTest;
+                       
+                       if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
+            else {
+                m->mothurConvert(numTest, pNumSeqs);
+            }
+            phylipFile >> pNumSeqs; m->gobble(phylipFile);
+            
+            string seqName;
+            double distance;
+            
+            for(int i=0;i<pNumSeqs;i++){
+                
+                if (m->control_pressed) { return ""; }
+                
+                phylipFile >> seqName; 
+                uniqueNames.insert(seqName);
+                
+                for(int j=0;j<i;j++){
+                    phylipFile >> distance;
+                }
+                m->gobble(phylipFile);
+            }
+            phylipFile.close();
+        }else {
+            ifstream columnFile;
+            m->openInputFile(distFile, columnFile);
+            string seqNameA, seqNameB;
+            double distance;
+            
+            while(columnFile){
+                if (m->control_pressed) { return ""; }
+                columnFile >> seqNameA >> seqNameB >> distance;
+                uniqueNames.insert(seqNameA); uniqueNames.insert(seqNameB);
+                m->gobble(columnFile);
+            }
+            columnFile.close();
+        }
+        
+        //read list file, if numSeqs > unique names then remove redundant names
+        string newListFile = listFile + ".temp";
+        ofstream out;
+        m->openOutputFile(newListFile, out);
+        ifstream in;
+               m->openInputFile(listFile, in);
+               
+               bool wroteSomething = false;
+               
+               while(!in.eof()){
+                       
+                       if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(newListFile);  return ""; }
+            
+                       //read in list vector
+                       ListVector list(in);
+            
+            //listfile is already unique
+            if (list.getNumSeqs() == uniqueNames.size()) { in.close(); out.close(); m->mothurRemove(newListFile);  return ""; }
+                       
+                       //make a new list vector
+                       ListVector newList;
+                       newList.setLabel(list.getLabel());
+                       
+                       //for each bin
+                       for (int i = 0; i < list.getNumBins(); i++) {
+                
+                               //parse out names that are in accnos file
+                               string binnames = list.get(i);
+                vector<string> bnames;
+                m->splitAtComma(binnames, bnames);
+                               
+                               string newNames = "";
+                for (int i = 0; i < bnames.size(); i++) {
+                                       string name = bnames[i];
+                                       //if that name is in the .accnos file, add it
+                                       if (uniqueNames.count(name) != 0) {  newNames += name + ",";  }
+                               }
+                
+                               //if there are names in this bin add to new list
+                               if (newNames != "") { 
+                                       newNames = newNames.substr(0, newNames.length()-1); //rip off extra comma
+                                       newList.push_back(newNames);    
+                               }
+                       }
+            
+                       //print new listvector
+                       if (newList.getNumBins() != 0) {
+                               wroteSomething = true;
+                               newList.print(out);
+                       }
+                       
+                       m->gobble(in);
+               }
+               in.close();     
+               out.close();
+
+        if (wroteSomething) { return newListFile; }
+        return ""; 
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SensSpecCommand", "preProcessList");
+        exit(1);
+    }
+}
+
  
  //***************************************************************************************************************
  
diff --git a/sensspeccommand.h b/sensspeccommand.h

index 73b74e3983a9b1e4078a5ee55e5ae975fd40f944..7d85a39bea25ced837551589cf21e2c431c8d7ca 100644 (file)
--- a/sensspeccommand.h
+++ b/sensspeccommand.h
@@ -58,6 +58,7 @@ private:
         int fillSeqPairSet(set<string>&, ListVector*&);
         int process(map<string, int>&, string, bool&, string&);
         int process(set<string>&, string, bool&, string&, int);
+    string preProcessList();
  
  };
author	Sarah Westcott <mothur.westcott@gmail.com>
	Tue, 8 Jan 2013 17:19:41 +0000 (12:19 -0500)
committer	Sarah Westcott <mothur.westcott@gmail.com>
	Tue, 8 Jan 2013 17:19:41 +0000 (12:19 -0500)
chimeraperseuscommand.cpp		patch \| blob \| history
chimeraperseuscommand.h		patch \| blob \| history
chimeraslayercommand.cpp		patch \| blob \| history
chimeraslayercommand.h		patch \| blob \| history
chimerauchimecommand.cpp		patch \| blob \| history
clustercommand.cpp		patch \| blob \| history
clusterdoturcommand.cpp		patch \| blob \| history
clustersplitcommand.cpp		patch \| blob \| history
filtersharedcommand.cpp		patch \| blob \| history
filtersharedcommand.h		patch \| blob \| history
mgclustercommand.cpp		patch \| blob \| history
parselistscommand.cpp		patch \| blob \| history
parselistscommand.h		patch \| blob \| history
phylotree.cpp		patch \| blob \| history
sensspeccommand.cpp		patch \| blob \| history
sensspeccommand.h		patch \| blob \| history