finished work on classify.seqs bayesian method and various bug fixes

author westcott <westcott>

Fri, 13 Nov 2009 16:01:14 +0000 (16:01 +0000)

committer westcott <westcott>

Fri, 13 Nov 2009 16:01:14 +0000 (16:01 +0000)
author westcott <westcott>
Fri, 13 Nov 2009 16:01:14 +0000 (16:01 +0000)
committer westcott <westcott>
Fri, 13 Nov 2009 16:01:14 +0000 (16:01 +0000)
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj

index 9b064e08786aa39aa48f0c32bfc218b2d4e9e62c..27a8b91bb18ccfd3f2e24856a0318268b0ceea2f 100644 (file)
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -170,7 +170,6 @@
                 A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; };
                 A7861478109F5FA00010AAAF /* classifyseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */; };
                 A787810310A06D5D0086103D /* classify.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787810210A06D5D0086103D /* classify.cpp */; };
-               A787811510A074330086103D /* phylotype.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787811410A074330086103D /* phylotype.cpp */; };
                 A78781BA10A0813E0086103D /* doTaxonomy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78781B810A0813D0086103D /* doTaxonomy.cpp */; };
                 A787821110A0AAE70086103D /* bayesian.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787821010A0AAE70086103D /* bayesian.cpp */; };
                 A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78782AA10A1B1CB0086103D /* alignmentdb.cpp */; };
@@ -546,8 +545,6 @@
                 A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classifyseqscommand.cpp; sourceTree = "<group>"; };
                 A78780FE10A06B8B0086103D /* classify.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classify.h; sourceTree = SOURCE_ROOT; };
                 A787810210A06D5D0086103D /* classify.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classify.cpp; sourceTree = SOURCE_ROOT; };
-               A787811310A074330086103D /* phylotype.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylotype.h; sourceTree = SOURCE_ROOT; };
-               A787811410A074330086103D /* phylotype.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylotype.cpp; sourceTree = SOURCE_ROOT; };
                 A78781B810A0813D0086103D /* doTaxonomy.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = doTaxonomy.cpp; sourceTree = "<group>"; };
                 A78781B910A0813D0086103D /* doTaxonomy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = doTaxonomy.h; sourceTree = "<group>"; };
                 A787820F10A0AAE70086103D /* bayesian.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bayesian.h; sourceTree = "<group>"; };
@@ -1042,8 +1039,6 @@
                                 A78781B810A0813D0086103D /* doTaxonomy.cpp */,
                                 A787844310A1EBDD0086103D /* knn.h */,
                                 A787844410A1EBDD0086103D /* knn.cpp */,
-                               A787811310A074330086103D /* phylotype.h */,
-                               A787811410A074330086103D /* phylotype.cpp */,
                         );
                         name = classifiers;
                         sourceTree = "<group>";
@@ -1272,7 +1267,6 @@
                                 A751ACEA1098B283003D0911 /* readcluster.cpp in Sources */,
                                 A7861478109F5FA00010AAAF /* classifyseqscommand.cpp in Sources */,
                                 A787810310A06D5D0086103D /* classify.cpp in Sources */,
-                               A787811510A074330086103D /* phylotype.cpp in Sources */,
                                 A78781BA10A0813E0086103D /* doTaxonomy.cpp in Sources */,
                                 A787821110A0AAE70086103D /* bayesian.cpp in Sources */,
                                 A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */,
diff --git a/bayesian.cpp b/bayesian.cpp

index caae212f4855367653fef715e64a551eb4d095e9..3a2d3e0156468f1114dd0ebb2d47fe87fe577789 100644 (file)
--- a/bayesian.cpp
+++ b/bayesian.cpp
@@ -8,15 +8,116 @@
   */
  
  #include "bayesian.h"
+#include "kmer.hpp"
  
  /**************************************************************************************************/
-Bayesian::Bayesian(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) : 
-Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch) {}
+Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff) : 
+Classify(tfile, tempFile, method, ksize, 0, 0, 0, 0), kmerSize(ksize), confidenceThreshold(cutoff)  {
+       try {
+                                       
+               numKmers = database->getMaxKmer() + 1;
+               
+               //initialze probabilities
+               wordGenusProb.resize(numKmers);
+               
+               genusNodes = phyloTree->getGenusNodes(); 
+               
+               for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
+                       
+               //reset counts because we are on a new word
+               for (int j = 0; j < genusNodes.size(); j++) {
+                       TaxNode temp = phyloTree->get(genusNodes[j]);
+                       genusTotals.push_back(temp.accessions.size());
+               }
+
+               
+               /************calculate the probablity that each word will be in a specific taxonomy*************/
+               ofstream out;
+               string probFileName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob";
+               ifstream probFileTest(probFileName.c_str());
+               
+               ofstream out2;
+               string probFileName2 = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero";
+               ifstream probFileTest2(probFileName2.c_str());
+               
+               int start = time(NULL);
+               
+               if(probFileTest && probFileTest2){      
+                       mothurOut("Reading template probabilities...     "); cout.flush();
+                       readProbFile(probFileTest, probFileTest2);      
+               }else{
+                       mothurOut("Calculating template probabilities...     "); cout.flush();
+
+                       ofstream out;
+                       openOutputFile(probFileName, out);
+                       
+                       ofstream out2;
+                       openOutputFile(probFileName2, out2);
+                       
+                       //for each word
+                       for (int i = 0; i < numKmers; i++) {
+                               
+                               out << i << '\t';
+                               
+                               vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
+                               
+                               map<int, int> count;
+                               for (int k = 0; k < genusNodes.size(); k++) {  count[genusNodes[k]] = 0;  }                     
+                                               
+                               //for each sequence with that word
+                               for (int j = 0; j < seqsWithWordi.size(); j++) {
+                                       int temp = phyloTree->getIndex(names[seqsWithWordi[j]]);
+                                       count[temp]++;  //increment count of seq in this genus who have this word
+                               }
+                               
+                               //probabilityInTemplate = (# of seqs with that word in template + 0.05) / (total number of seqs in template + 1);
+                               float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1);
+                               
+                               int numNotZero = 0;
+                               for (int k = 0; k < genusNodes.size(); k++) {
+                                       //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
+                                       wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1));  
+                                       if (count[genusNodes[k]] != 0) {  out << k << '\t' << wordGenusProb[i][k] << '\t';  numNotZero++;  }
+                               }
+                               out << endl;
+                               out2 << probabilityInTemplate << '\t' << numNotZero << endl;
+                       }
+                       
+                       out.close();
+                       out2.close();
+               }
+               
+               
+               mothurOut("DONE."); mothurOutEndLine();
+               mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); mothurOutEndLine();
+       }
+       catch(exception& e) {
+               errorOut(e, "Bayesian", "getTaxonomy");
+               exit(1);
+       }
+}
  /**************************************************************************************************/
  string Bayesian::getTaxonomy(Sequence* seq) {
         try {
                 string tax;
+               Kmer kmer(kmerSize);
                 
+               //get words contained in query
+               //getKmerString returns a string where the index in the string is hte kmer number 
+               //and the character at that index can be converted to be the number of times that kmer was seen
+               string queryKmerString = kmer.getKmerString(seq->getUnaligned()); 
+               vector<int> queryKmers;
+               for (int i = 0; i < queryKmerString.length(); i++) {
+                       if (queryKmerString[i] != '!') { //this kmer is in the query
+                               queryKmers.push_back(i);
+                       }
+               }
+       
+               int index = getMostProbableTaxonomy(queryKmers);
+                                       
+               //bootstrap - to set confidenceScore
+               int numToSelect = queryKmers.size() / 8;
+               tax = bootstrapResults(queryKmers, index, numToSelect);
                                 
                 return tax;     
         }
@@ -26,4 +127,178 @@ string Bayesian::getTaxonomy(Sequence* seq) {
         }
  }
  /**************************************************************************************************/
+string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect) {
+       try {
+               
+               //taxConfidenceScore.clear(); //clear out previous seqs scores
+                               
+               vector< map<string, int> > confidenceScores; //you need the added vector level of confusion to account for the level that name is seen since they can be the same
+                                                                               //map of classification to confidence for all areas seen
+                                                                          //ie. Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
+                                                                          //ie. Bacteria -> 100, Alphaproteobacteria -> 100, Rhizobiales -> 87, Azorhizobium_et_rel. -> 78, Methylobacterium_et_rel. -> 70, Bosea -> 50
+               confidenceScores.resize(100);  //if you have more than 100 levels of classification...
+               
+               map<string, int>::iterator itBoot;
+               map<string, int>::iterator itBoot2;
+               map<int, int>::iterator itConvert;
+               
+               for (int i = 0; i < 100; i++) {
+                       vector<int> temp;
+                                               
+                       for (int j = 0; j < numToSelect; j++) {
+                               int index = int(rand() % kmers.size());
+                               
+                               //add word to temp
+                               temp.push_back(kmers[index]);
+                       }
+                       
+                       //get taxonomy
+                       int newTax = getMostProbableTaxonomy(temp);
+                       TaxNode taxonomy = phyloTree->get(newTax);
+                       
+                       //add to confidence results
+                       while (taxonomy.level != 0) { //while you are not at the root
+                               
+                               itBoot2 = confidenceScores[taxonomy.level].find(taxonomy.name); //is this a classification we already have a count on
+                               
+                               if (itBoot2 == confidenceScores[taxonomy.level].end()) { //not already in confidence scores
+                                       confidenceScores[taxonomy.level][taxonomy.name] = 1;
+                               }else{
+                                       confidenceScores[taxonomy.level][taxonomy.name]++;
+                               }
+                       
+                               taxonomy = phyloTree->get(taxonomy.parent);
+                       }
+               }
+               
+               string confidenceTax = "";
+               simpleTax = "";
+               TaxNode seqTax = phyloTree->get(tax);
+               
+               while (seqTax.level != 0) { //while you are not at the root
+                               
+                               itBoot2 = confidenceScores[seqTax.level].find(seqTax.name); //is this a classification we already have a count on
+                               
+                               int confidence = 0;
+                               if (itBoot2 != confidenceScores[seqTax.level].end()) { //not already in confidence scores
+                                       confidence = confidenceScores[seqTax.level][seqTax.name];
+                               }
+                               
+                               if (confidence >= confidenceThreshold) {
+                                       confidenceTax = seqTax.name + "(" + toString(confidence) + ");" + confidenceTax;
+                                       simpleTax = seqTax.name + ";" + simpleTax;
+                               }
+                               
+                               seqTax = phyloTree->get(seqTax.parent);
+               }
+               
+               return confidenceTax;
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "Bayesian", "bootstrapResults");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+int Bayesian::getMostProbableTaxonomy(vector<int> queryKmer) {
+       try {
+               int indexofGenus;
+               
+               double maxProbability = -1000000.0;
+               //find taxonomy with highest probability that this sequence is from it
+               for (int k = 0; k < genusNodes.size(); k++) {
+               
+                       //for each taxonomy calc its probability
+                       double prob = 1.0;
+                       for (int i = 0; i < queryKmer.size(); i++) {
+                               prob += wordGenusProb[queryKmer[i]][k];
+                       }
+               
+                       //is this the taxonomy with the greatest probability?
+                       if (prob > maxProbability) { 
+                               indexofGenus = genusNodes[k];
+                               maxProbability = prob;
+                       }
+               }
+
+               return indexofGenus;
+       }
+       catch(exception& e) {
+               errorOut(e, "Bayesian", "getMostProbableTaxonomy");
+               exit(1);
+       }
+}
+/*************************************************************************************************
+map<string, int> Bayesian::parseTaxMap(string newTax) {
+       try{
+       
+               map<string, int> parsed;
+               
+               newTax = newTax.substr(0, newTax.length()-1);  //get rid of last ';'
+       
+               //parse taxonomy
+               string individual;
+               while (newTax.find_first_of(';') != -1) {
+                       individual = newTax.substr(0,newTax.find_first_of(';'));
+                       newTax = newTax.substr(newTax.find_first_of(';')+1, newTax.length());
+                       parsed[individual] = 1;
+               }
+               
+               //get last one
+               parsed[newTax] = 1;
+
+               return parsed;
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "Bayesian", "parseTax");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+void Bayesian::readProbFile(ifstream& in, ifstream& inNum) {
+       try{
+               
+               int kmer, name, count;  count = 0;
+               vector<int> num; num.resize(numKmers);
+               float prob;
+               vector<float> zeroCountProb; zeroCountProb.resize(numKmers);            
+               
+               while (inNum) {
+                       inNum >> zeroCountProb[count] >> num[count];  
+                       count++;
+                       gobble(inNum);
+               }
+               inNum.close();
+               
+               while(in) {
+                       in >> kmer;
+                       
+                       //set them all to zero value
+                       for (int i = 0; i < genusNodes.size(); i++) {
+                               wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
+                       }
+                       
+                       //get probs for nonzero values
+                       for (int i = 0; i < num[kmer]; i++) {
+                               in >> name >> prob;
+                               wordGenusProb[kmer][name] = prob;
+                       }
+                       
+                       gobble(in);
+               }
+               in.close();
+       }
+       catch(exception& e) {
+               errorOut(e, "Bayesian", "readProbFile");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+
+
+
+
  
diff --git a/bayesian.h b/bayesian.h

index cbe949ff311edc776d507530fcc5b5a3f327b73e..9b93c839c9e5a4a749bf190c1bd95200fc82f272 100644 (file)
--- a/bayesian.h
+++ b/bayesian.h
@@ -18,13 +18,29 @@
  class Bayesian : public Classify {
         
  public:
-       Bayesian(string, string, string, int, int, int, int, int);
+       Bayesian(string, string, string, int, int);
         ~Bayesian() {};
         
         string getTaxonomy(Sequence*);
-       
+       //map<string, int> getConfidenceScores() { return taxConfidenceScore; }
         
  private:
+       //map<string, vector<double> > taxonomyProbability;     //probability of a word being in a given taxonomy. 
+                                                                                                       //taxonomyProbability[bacteria;][0] = probabtility that a sequence within bacteria; would contain kmer 0;
+                                                                                                       //taxonomyProbability[bacteria;][1] = probabtility that a sequence within bacteria; would contain kmer 1...
+       
+       vector< vector<float> > wordGenusProb;  //vector of maps from genus to probability
+                                                                                               //wordGenusProb[0][392] = probability that a sequence within genus that's index in the tree is 392 would contain kmer 0;
+       
+       vector<int> genusTotals;
+       vector<int> genusNodes;  //indexes in phyloTree where genus' are located
+       
+       int kmerSize, numKmers, confidenceThreshold;
+       
+       string bootstrapResults(vector<int>, int, int);
+       int getMostProbableTaxonomy(vector<int>);
+       void readProbFile(ifstream&, ifstream&);
+       //map<string, int> parseTaxMap(string);
         
  };
  
diff --git a/classify.cpp b/classify.cpp

index 605c1f8f3c6470310a649cebfcc090141eb7d70d..f6b7e80be51f9383c2dd6599f737c24fa36a0f6d 100644 (file)
--- a/classify.cpp
+++ b/classify.cpp
@@ -17,9 +17,9 @@
  
  Classify::Classify(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) : taxFile(tfile), templateFile(tempFile) {         
         try {                                                                                   
+               readTaxonomy(taxFile);  
                 
-               readTaxonomy(taxFile);
-               
+               int start = time(NULL);
                 int numSeqs = 0;
                 //need to know number of template seqs for suffixdb
                 if (method == "suffix") {
@@ -51,12 +51,13 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, i
                 if (needToGenerate) {
                         ifstream fastaFile;
                         openInputFile(tempFile, fastaFile);
-               
+                       
                         while (!fastaFile.eof()) {
                                 Sequence temp(fastaFile);
                                 gobble(fastaFile);
                         
                                 names.push_back(temp.getName());
+                                                               
                                 database->addSequence(temp);    
                         }
                         fastaFile.close();
@@ -69,7 +70,7 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, i
                         
                         ifstream fastaFile;
                         openInputFile(tempFile, fastaFile);
-               
+                       
                         while (!fastaFile.eof()) {
                                 Sequence temp(fastaFile);
                                 gobble(fastaFile);
@@ -82,6 +83,7 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, i
                 database->setNumSeqs(names.size());
                 
                 mothurOut("DONE."); mothurOutEndLine();
+               mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); mothurOutEndLine();
  
         }
         catch(exception& e) {
@@ -93,7 +95,9 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, i
  
  void Classify::readTaxonomy(string file) {
         try {
-       
+               
+               phyloTree = new PhyloTree();
+               
                 ifstream inTax;
                 openInputFile(file, inTax);
         
@@ -106,9 +110,17 @@ void Classify::readTaxonomy(string file) {
                         inTax >> name >> taxInfo;
                         
                         taxonomy[name] = taxInfo;
+                       
+                       //itTax = taxList.find(taxInfo);
+                       //if (itTax == taxList.end()) { //this is new taxonomy
+                               //taxList[taxInfo] = 1;
+                       //}else { taxList[taxInfo]++;   }
+                       phyloTree->addSeqToTree(name, taxInfo);
                 
                         gobble(inTax);
                 }
+               
+               phyloTree->assignHeirarchyIDs(0);
                 inTax.close();
         
                 mothurOut("DONE.");
@@ -122,3 +134,29 @@ void Classify::readTaxonomy(string file) {
  }
  /**************************************************************************************************/
  
+vector<string> Classify::parseTax(string tax) {
+       try {
+               vector<string> taxons;
+               
+               tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
+       
+               //parse taxonomy
+               string individual;
+               while (tax.find_first_of(';') != -1) {
+                       individual = tax.substr(0,tax.find_first_of(';'));
+                       tax = tax.substr(tax.find_first_of(';')+1, tax.length());
+                       taxons.push_back(individual);
+                       
+               }
+               //get last one
+               taxons.push_back(tax);
+               
+               return taxons;
+       }
+       catch(exception& e) {
+               errorOut(e, "Classify", "parseTax");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
diff --git a/classify.h b/classify.h

index 2926800284ce00cbb7ceb2768bfbe35f28a44a80..03142353feb8af7b8488007e5367144602a45acf 100644 (file)
--- a/classify.h
+++ b/classify.h
@@ -15,33 +15,41 @@
  
  #include "mothur.h"
  #include "database.hpp"
-
+#include "doTaxonomy.h"
  
  
  class Sequence;
  
+
  /**************************************************************************************************/
  
  class Classify {
  
  public:
         Classify(string, string, string, int, int, int, int, int);
-       Classify(){};
+       Classify(){     delete phyloTree;  }
         
         virtual ~Classify(){};
         virtual string getTaxonomy(Sequence*) = 0;
+       //virtual map<string, int> getConfidenceScores() { return taxConfidenceScore; }
+       //virtual vector<string> parseTax(string);
+       virtual string getSimpleTax()  { return simpleTax;      }
         
  protected:
  
         map<string, string> taxonomy;  //name maps to taxonomy
+       //map<string, int> genusCount;  //maps genus to count - in essence a list of how many seqs are in each taxonomy
+       map<string, int>::iterator itTax;
         map<string, string>::iterator it;
         Database* database;
+       PhyloTree* phyloTree;
         
-       string taxFile, templateFile;
+       string taxFile, templateFile, simpleTax;
         vector<string> names;
+       //map<string, int> taxConfidenceScore;
         
         void readTaxonomy(string);
-               
+       vector<string> parseTax(string);
  };
  
  /**************************************************************************************************/
diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp

index 69ef7127182001dc7d2b292650b3be38e0445a1d..25642931164bd09c3e1cd91b0e6a9bd4009ff184 100644 (file)
--- a/classifyseqscommand.cpp
+++ b/classifyseqscommand.cpp
@@ -9,7 +9,6 @@
  
  #include "classifyseqscommand.h"
  #include "sequence.hpp"
-#include "phylotype.h"
  #include "bayesian.h"
  #include "doTaxonomy.h"
  #include "knn.h"
@@ -27,7 +26,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){
                 else {
                         
                         //valid paramters for this command
-                       string AlignArray[] =  {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted"};
+                       string AlignArray[] =  {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff"};
                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
                         
                         OptionParser parser(option);
@@ -77,7 +76,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){
                         
                         search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
                         
-                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "phylotype";   }
+                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "bayesian";    }
                         
                         temp = validParameter.validFile(parameters, "match", false);            if (temp == "not found"){       temp = "1.0";                   }
                         convert(temp, match);  
@@ -93,7 +92,15 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){
                         
                         temp = validParameter.validFile(parameters, "numwanted", false);        if (temp == "not found"){       temp = "10";                    }
                         convert(temp, numWanted);
+                       
+                       temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found"){       temp = "0";                             }
+                       convert(temp, cutoff);
  
+                       
+                       if ((method == "bayesian") && (search != "kmer"))  { 
+                               mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); mothurOutEndLine();
+                               search = "kmer";
+                       }
                 }
                 
         }
@@ -120,7 +127,7 @@ void ClassifySeqsCommand::help(){
                 mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend and numwanted.\n");
                 mothurOut("The template, fasta and taxonomy parameters are required.\n");
                 mothurOut("The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer and blast. The default is kmer.\n");
-               mothurOut("The method parameter allows you to specify classification method to use.  Your options are: phylotype, bayesian and knn. The default is phylotype.\n");
+               mothurOut("The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n");
                 mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
                 mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
                 mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
@@ -128,9 +135,10 @@ void ClassifySeqsCommand::help(){
                 mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
                 mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -2.0.\n");
                 mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n");
+               mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n");
                 mothurOut("The classify.seqs command should be in the following format: \n");
                 mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
-               mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=phylotype, search=gotoh, ksize=8, processors=2)\n");
+               mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n");
                 mothurOut("The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n");
                 mothurOut("The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n");
                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
@@ -148,22 +156,23 @@ int ClassifySeqsCommand::execute(){
         try {
                 if (abort == true) {    return 0;       }
                 
-               int start = time(NULL);
                 
-               if(method == "phylotype"){      classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);                                    }
-               //else if(method == "bayesian")         {       classify = new Bayesian(taxonomyFileName, templateFileName, search);            }
+               
+               if(method == "bayesian")                        {       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff);          }
                 else if(method == "knn")                        {       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted);                               }
                 else {
-                       mothurOut(search + " is not a valid method option. I will run the command using phylotype.");
+                       mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
                         mothurOutEndLine();
-                       classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);
+                       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff);  
                 }
  
                 int numFastaSeqs = 0;
                 
                 string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy";
+               string tempTaxonomyFile = getRootName(fastaFileName) + "taxonomy.temp";
                 string taxSummary = getRootName(fastaFileName) + "tax.summary";
                 
+               int start = time(NULL);
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                 if(processors == 1){
                         ifstream inFASTA;
@@ -173,7 +182,7 @@ int ClassifySeqsCommand::execute(){
                         
                         lines.push_back(new linePair(0, numFastaSeqs));
                         
-                       driver(lines[0], newTaxonomyFile);
+                       driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
                         
                 }
                 else{
@@ -203,13 +212,16 @@ int ClassifySeqsCommand::execute(){
                                 }
                                 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
                         }
-                       createProcesses(newTaxonomyFile); 
+                       createProcesses(newTaxonomyFile, tempTaxonomyFile); 
                         
                         rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str());
+                       rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str());
                         
                         for(int i=1;i<processors;i++){
                                 appendTaxFiles((newTaxonomyFile + toString(processIDS[i]) + ".temp"), newTaxonomyFile);
+                               appendTaxFiles((tempTaxonomyFile + toString(processIDS[i]) + ".temp"), tempTaxonomyFile);
                                 remove((newTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
+                               remove((tempTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
                         }
                         
                 }
@@ -221,14 +233,14 @@ int ClassifySeqsCommand::execute(){
                 
                 lines.push_back(new linePair(0, numFastaSeqs));
                 
-               driver(lines[0], newTaxonomyFile);
+               driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
  #endif
                 
                 delete classify;
                 
                 //make taxonomy tree from new taxonomy file 
                 ifstream inTaxonomy;
-               openInputFile(newTaxonomyFile, inTaxonomy);
+               openInputFile(tempTaxonomyFile, inTaxonomy);
                 
                 string accession, taxaList;
                 PhyloTree taxaBrowser;
@@ -242,6 +254,7 @@ int ClassifySeqsCommand::execute(){
                         gobble(inTaxonomy);
                 }
                 inTaxonomy.close();
+               remove(tempTaxonomyFile.c_str());
                 
                 taxaBrowser.assignHeirarchyIDs(0);
                 
@@ -264,7 +277,7 @@ int ClassifySeqsCommand::execute(){
  }
  /**************************************************************************************************/
  
-void ClassifySeqsCommand::createProcesses(string taxFileName) {
+void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile) {
         try {
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                 int process = 0;
@@ -278,7 +291,7 @@ void ClassifySeqsCommand::createProcesses(string taxFileName) {
                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
                                 process++;
                         }else if (pid == 0){
-                               driver(lines[process], taxFileName + toString(getpid()) + ".temp");
+                               driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp");
                                 exit(0);
                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
                 }
@@ -321,10 +334,13 @@ void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
  
  //**********************************************************************************************************************
  
-int ClassifySeqsCommand::driver(linePair* line, string taxFName){
+int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName){
         try {
                 ofstream outTax;
                 openOutputFile(taxFName, outTax);
+               
+               ofstream outTaxSimple;
+               openOutputFile(tempTFName, outTaxSimple);
         
                 ifstream inFASTA;
                 openInputFile(fastaFileName, inFASTA);
@@ -340,14 +356,33 @@ int ClassifySeqsCommand::driver(linePair* line, string taxFName){
                         taxonomy = classify->getTaxonomy(candidateSeq);
                         
                         if (taxonomy != "bad seq") {
-                               outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+                               //if (method != "bayesian") {
+                                       outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+                                       outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
+                               //}else{
+                                       //vector<string> pTax = classify->parseTax(taxonomy);
+                                       //map<string, int> confidence = classify->getConfidenceScores();
+                                       
+                                       //outTax << candidateSeq->getName() << '\t';
+                                       //for (int j = 0; j < pTax.size(); j++) {
+                                               //if (confidence[pTax[j]] > cutoff) {
+                                               //      outTax << pTax[j] << "(" << confidence[pTax[j]] << ");";
+                                               //}else{ break; }
+                                       //}
+                                       //outTax << endl;
+                               //}
                         }
                                                         
                         delete candidateSeq;
+                       
+                       if(i % 100 == 0){
+                               mothurOut("Classifying sequence " + toString(i)); mothurOutEndLine();
+                       }
                 }
                 
                 inFASTA.close();
                 outTax.close();
+               outTaxSimple.close();
                 
                 return 1;
         }
diff --git a/classifyseqscommand.h b/classifyseqscommand.h

index a77b7d1d7193b862c88e036553a7616fb3f33dc9..a73aa0c576f1dc6a3437d9c37557aa89cc0b451c 100644 (file)
--- a/classifyseqscommand.h
+++ b/classifyseqscommand.h
@@ -35,13 +35,13 @@ private:
         Classify* classify;
         
         string fastaFileName, templateFileName, distanceFileName, search, method, taxonomyFileName;
-       int processors, kmerSize, numWanted;
+       int processors, kmerSize, numWanted, cutoff;
         float match, misMatch, gapOpen, gapExtend;
         bool abort;
         
-       int driver(linePair*, string);
+       int driver(linePair*, string, string);
         void appendTaxFiles(string, string);
-       void createProcesses(string); 
+       void createProcesses(string, string); 
  };
  
  #endif
diff --git a/collect.cpp b/collect.cpp

index 77d279d57ed4ac4d0078d20e736a874056d3b132..c270ae1f525606d0b6f37e7eb1ffacecd2a0f47c 100644 (file)
--- a/collect.cpp
+++ b/collect.cpp
@@ -88,7 +88,7 @@ try {
                                 
                  for(int i=0;i<displays.size();i++){
                          ccd->registerDisplay(displays[i]); //adds a display[i] to cdd
-                                               if (displays[i]->isCalcMultiple() == true)  {   displays[i]->init(groupLabelAll); }
+                                               if ((displays[i]->isCalcMultiple() == true) && (displays[i]->getAll() == true)) {   displays[i]->init(groupLabelAll); }
                                                 else {  displays[i]->init(groupLabel);  }           
                  }
                  
diff --git a/collectdisplay.h b/collectdisplay.h

index 0f18da6277853059464b1902aeb81b92a3facae1..15cc5a75a1f32edac6d234ea934ceffdad608301 100644 (file)
--- a/collectdisplay.h
+++ b/collectdisplay.h
@@ -15,6 +15,8 @@ class CollectDisplay : public Display {
  public:
         CollectDisplay(Calculator* calc, FileOutput* file) : estimate(calc), output(file) {timesCalled = 0;};
         ~CollectDisplay()       {       delete estimate; delete output;         }
+       
+       
         void update(SAbundVector* rank){
                 nSeqs=rank->getNumSeqs();
                 data = estimate->getValues(rank);
@@ -45,7 +47,7 @@ public:
                         n++;
                 }
                 
-               if (estimate->getMultiple() == true) { 
+               if ((estimate->getMultiple() == true) && all) { 
                         numGroupComb++; 
                         groupData.resize((numGroupComb*data.size()), 0);
                         //is this the time its called with all values
@@ -74,10 +76,14 @@ public:
         void init(string s)             {       output->initFile(s);    };
         void reset()                    {       output->resetFile();    };
         void close()                    {       output->resetFile();    };
+       void setAll(bool a)             {       all = a;                                }
+       bool getAll()                   {       return all;                             }
+       
         bool isCalcMultiple() { return estimate->getMultiple(); }
         
         string getName()        {  return estimate->getName();  }
-
+       
+       
  private:
         
         Calculator* estimate;
@@ -85,6 +91,8 @@ private:
         int nSeqs, timesCalled, numGroupComb;
         vector<double> data;
         vector<double> groupData;
+       bool all;
+       
  };
  
  /***********************************************************************/
diff --git a/collectorscurvedata.h b/collectorscurvedata.h

index d90e84acdf0989e53b5caf18f3a6a7a202bea28e..9c70e7aef16f389c2ea8e8ac39c98cdd9cc03f49 100644 (file)
--- a/collectorscurvedata.h
+++ b/collectorscurvedata.h
@@ -48,7 +48,7 @@ public:
         void notifyDisplays(){  
                 for(set<Display*>::iterator pos=displays.begin();pos!=displays.end();pos++){
  //cout << (*pos)->getName() << endl;
-                       if ( ((*pos)->isCalcMultiple() == true) || ( ((*pos)->isCalcMultiple() == false) && (shared.size() == 2) ) ) {
+                       if ( ( ((*pos)->isCalcMultiple() == true) && ((*pos)->getAll() == true) ) || (shared.size() == 2)  ) {
                                 (*pos)->update(shared, NumSeqs, NumGroupComb);
                         }
                 }       
diff --git a/collectsharedcommand.cpp b/collectsharedcommand.cpp

index 844b138c1916d5be2cffe199ace4639b56945bd9..a772eb18354fcbd109f2bcf258fad1d445a7f5e6 100644 (file)
--- a/collectsharedcommand.cpp
+++ b/collectsharedcommand.cpp
@@ -50,7 +50,7 @@ CollectSharedCommand::CollectSharedCommand(string option){
                 
                 else {
                         //valid paramters for this command
-                       string Array[] =  {"freq","label","calc","groups"};
+                       string Array[] =  {"freq","label","calc","groups","all"};
                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                         
                         OptionParser parser(option);
@@ -102,6 +102,9 @@ CollectSharedCommand::CollectSharedCommand(string option){
                         string temp;
                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
                         convert(temp, freq); 
+                       
+                       temp = validParameter.validFile(parameters, "all", false);                              if (temp == "not found") { temp = "true"; }
+                       all = isTrue(temp);
                                                 
                         if (abort == false) {
                         
@@ -181,6 +184,8 @@ void CollectSharedCommand::help(){
                 mothurOut("The default value for groups is all the groups in your groupfile.\n");
                 validCalculator->printCalc("shared", cout);
                 mothurOut("The label parameter is used to analyze specific labels in your input.\n");
+               mothurOut("The all parameter is used to specify if you want the estimate of all your groups together.  This estimate can only be made for sharedsobs and sharedchao calculators. The default is true.\n");
+               mothurOut("If you use sharedchao and run into memory issues, set all to false. \n");
                 mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed.  You must enter at least 2 valid groups.\n");
                 mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListfile).\n\n");
                 
@@ -212,6 +217,7 @@ int CollectSharedCommand::execute(){
                 
                 //if the users entered no valid calculators don't execute command
                 if (cDisplays.size() == 0) { return 0; }
+               for(int i=0;i<cDisplays.size();i++){    cDisplays[i]->setAll(all);      }       
                 
                 read = new ReadOTUFile(globaldata->inputFileName);      
                 read->read(&*globaldata); 
diff --git a/collectsharedcommand.h b/collectsharedcommand.h

index e8b7b96a608453f5c5c7c6e510cd9b96bacd661b..139a0cff7dfb3220db697d582ae037a99a3b0d98 100644 (file)
--- a/collectsharedcommand.h
+++ b/collectsharedcommand.h
@@ -45,7 +45,7 @@ private:
         int freq;
         string format;
  
-       bool abort, allLines;
+       bool abort, allLines, all;
         set<string> labels; //holds labels to be used
         string label, calc, groups;
         vector<string>  Estimators, Groups;
diff --git a/database.hpp b/database.hpp

index 01916d1924bb86466798dc2667f83aaeda174c3d..f52cdd58be86d31be87951df68d13983dc9bc921 100644 (file)
--- a/database.hpp
+++ b/database.hpp
@@ -52,6 +52,9 @@ public:
         virtual int getLongestBase(); 
         virtual void readKmerDB(ifstream&){};
         virtual void setNumSeqs(int i) {        numSeqs = i;    }
+       virtual vector<int> getSequencesWithKmer(int){ vector<int> filler; return filler; };  
+       virtual int getMaxKmer(){       return 1;       };
+
         
  protected:
         int numSeqs, longest;
diff --git a/display.h b/display.h

index 398d68470cbf0f957c0b24c733bde6cde823a8d0..5963975aeb61487f8d49b73aff2a817e2fb23d2d 100644 (file)
--- a/display.h
+++ b/display.h
@@ -18,6 +18,8 @@ public:
         virtual void reset() = 0;
         virtual void close() = 0;
         virtual bool isCalcMultiple() = 0;
+       virtual void setAll(bool){}
+       virtual bool getAll()   {       bool a; return a;       }
         virtual string getName() { return ""; };
         virtual ~Display() {}
         
diff --git a/doTaxonomy.cpp b/doTaxonomy.cpp

index e66476415aca0f7d2b032e9be43845c7a3e561b6..06337b446bf0d79ffad7b29f4fb904c99f54058e 100644 (file)
--- a/doTaxonomy.cpp
+++ b/doTaxonomy.cpp
@@ -59,11 +59,13 @@ void PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
                 if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
                         currentNode = childPointer->second;
                         tree[currentNode].accessions.push_back(seqName);
+                       name2Taxonomy[seqName] = currentNode;
                 }
                 else{                                                                                   //otherwise, create it
                         tree.push_back(TaxNode(taxon));
                         numNodes++;
                         tree[currentNode].children[taxon] = numNodes-1;
+                       tree[numNodes-1].parent = currentNode;
  
  //                     int numChildren = tree[currentNode].children.size();
  //                     string heirarchyID = tree[currentNode].heirarchyID;
@@ -71,15 +73,30 @@ void PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
                         
                         currentNode = tree[currentNode].children[taxon];
                         tree[currentNode].accessions.push_back(seqName);
-
+                       name2Taxonomy[seqName] = currentNode;
  //                     tree[currentNode].level = level;
  //                     tree[currentNode].childNumber = numChildren;
  //                     tree[currentNode].heirarchyID = heirarchyID + '.' + toString(tree[currentNode].childNumber);
                 }
-
+               
+               if (seqTaxonomy == "") {   uniqueTaxonomies[currentNode] = currentNode; }
+       }
+}
+/**************************************************************************************************/
+vector<int> PhyloTree::getGenusNodes() {
+       try {
+               genusIndex.clear();
+               //generate genusIndexes
+               map<int, int>::iterator it2;
+               for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(it2->first);     }
+               
+               return genusIndex;
+       }
+       catch(exception& e) {
+               errorOut(e, "PhyloTree", "getGenusNodes");
+               exit(1);
         }
  }
-
  /**************************************************************************************************/
  
  void PhyloTree::assignHeirarchyIDs(int index){
@@ -92,9 +109,7 @@ void PhyloTree::assignHeirarchyIDs(int index){
                 counter++;
                 tree[it->second].level = tree[index].level + 1;
                 assignHeirarchyIDs(it->second);
-
         }
-       
  }
  
  /**************************************************************************************************/
diff --git a/doTaxonomy.h b/doTaxonomy.h

index c1ec3adfd0d81cf4f84893165f7b4dc2d36fa61c..6901d2757fb6630b980641c0de0dfb8390ac3f52 100644 (file)
--- a/doTaxonomy.h
+++ b/doTaxonomy.h
@@ -1,3 +1,6 @@
+#ifndef DOTAXONOMY_H
+#define DOTAXONOMY_H
+
  /*
   *  doTaxonomy.h
   *  
@@ -12,8 +15,8 @@
  /**************************************************************************************************/
  
  struct TaxNode {
-       vector<string> accessions;
-       map<string, int> children;
+       vector<string> accessions;      //names of seqs in this branch of tree
+       map<string, int> children;  //childs name to index in tree
         int parent, childNumber, level;
         string name, heirarchyID;
         
@@ -30,9 +33,17 @@ public:
         void addSeqToTree(string, string);
         void assignHeirarchyIDs(int);
         void print(ofstream&);
+       vector<int> getGenusNodes();    
+       TaxNode get(int i)                              {       return tree[i]; }
+       TaxNode get(string seqName)             {       return tree[name2Taxonomy[seqName]];    }
+       int getIndex(string seqName)    {       return name2Taxonomy[seqName];  }
+       string getName(int i)                   {       return tree[i].name;    }
  private:
         string getNextTaxon(string&);
         vector<TaxNode> tree;
+       vector<int> genusIndex; //holds the indexes in tree where the genus level taxonomies are stored
+       map<string, int> name2Taxonomy;  //maps name to index in tree
+       map<int, int> uniqueTaxonomies;  //map of unique taxonomies
         void print(int, ofstream&);
         int numNodes;
         int numSeqs;
@@ -40,3 +51,6 @@ private:
  
  /**************************************************************************************************/
  
+#endif
+
+
diff --git a/hclustercommand.cpp b/hclustercommand.cpp

index e140f8c3489cf45615a200609caa08d00a23979d..2ad6333bd8586b433c91e9bb56111286b8662839 100644 (file)
--- a/hclustercommand.cpp
+++ b/hclustercommand.cpp
@@ -53,8 +53,8 @@ HClusterCommand::HClusterCommand(string option){
                         if (namefile == "not open") { abort = true; }   
                         else if (namefile == "not found") { namefile = ""; }
                         
-                       if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a cluster command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
-                       else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a cluster command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
+                       if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a hcluster command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
+                       else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a hcluster command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
                 
                         if (columnfile != "") {
                                 if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
@@ -114,11 +114,12 @@ HClusterCommand::HClusterCommand(string option){
  
  void HClusterCommand::help(){
         try {
-               mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
-               mothurOut("The cluster command parameter options are method, cuttoff, precision, showabund and timing. No parameters are required.\n");
-               mothurOut("The cluster command should be in the following format: \n");
-               mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
-               mothurOut("The acceptable cluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n"); 
+               mothurOut("The hcluster command parameter options are cutoff, precision, method, showabund, timing, phylip, column, name and sorted. Phylip or column and name are required.\n");
+               mothurOut("The phylip and column parameter allow you to enter your distance file, and sorted indicates whether your column distance file is already sorted. \n");
+               mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
+               mothurOut("The hcluster command should be in the following format: \n");
+               mothurOut("hcluster(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
+               mothurOut("The acceptable hcluster methods is furthest, but we hope to add nearest and average in the future.\n\n");    
         }
         catch(exception& e) {
                 errorOut(e, "HClusterCommand", "help");
diff --git a/kmerdb.cpp b/kmerdb.cpp

index bf8679e57a932af0567df8a456f038b76fbf0fae..2560a443044016d8096f7ce2f7cf0636291e97c1 100644 (file)
--- a/kmerdb.cpp
+++ b/kmerdb.cpp
@@ -176,3 +176,36 @@ void KmerDB::readKmerDB(ifstream& kmerDBFile){
  }
  
  /**************************************************************************************************/
+int KmerDB::getCount(int kmer) {
+       try {
+               if (kmer < 0) { return 0; }  //if user gives negative number
+               else if (kmer > maxKmer) {      return 0;       }  //or a kmer that is bigger than maxkmer
+               else {  return kmerLocations[kmer].size();      }  // kmer is in vector range
+       }
+       catch(exception& e) {
+               errorOut(e, "KmerDB", "getCount");
+               exit(1);
+       }       
+}
+/**************************************************************************************************/
+vector<int> KmerDB::getSequencesWithKmer(int kmer) {
+       try {
+               
+               vector<int> seqs;
+       
+               if (kmer < 0) { }  //if user gives negative number
+               else if (kmer > maxKmer) {      }  //or a kmer that is bigger than maxkmer
+               else {  seqs = kmerLocations[kmer];     }
+               
+               return seqs;
+       }
+       catch(exception& e) {
+               errorOut(e, "KmerDB", "getSequencesWithKmer");
+               exit(1);
+       }       
+}
+
+
+/**************************************************************************************************/
+
+
diff --git a/kmerdb.hpp b/kmerdb.hpp

index b64412d259a85870c57253249a4e355193476c9a..bdd9ca503910e6b764c3c067b5d9305409fafd6f 100644 (file)
--- a/kmerdb.hpp
+++ b/kmerdb.hpp
@@ -32,6 +32,9 @@ public:
         void addSequence(Sequence);
         vector<int> findClosestSequences(Sequence*, int);
         void readKmerDB(ifstream&);
+       int getCount(int);  //returns number of sequences with that kmer number
+       vector<int> getSequencesWithKmer(int);  //returns vector of sequences that contain kmer passed in
+       int getMaxKmer() { return maxKmer; }
         
  private:
         
diff --git a/knn.cpp b/knn.cpp

index 6d57fb135f14398fb6b7ab776eb81e84e4728406..0a62e743b0dd8d4f5435d504127a70724f9f27a8 100644 (file)
--- a/knn.cpp
+++ b/knn.cpp
@@ -60,18 +60,7 @@ string Knn::findCommonTaxonomy(vector<string> closest)  {
                 
                         string tax = taxonomy[closest[i]];  //we know its there since we checked in getTaxonomy
                 
-                       tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
-       
-                       //parse taxonomy
-                       string individual;
-                       while (tax.find_first_of(';') != -1) {
-                               individual = tax.substr(0,tax.find_first_of(';'));
-                               tax = tax.substr(tax.find_first_of(';')+1, tax.length());
-                               taxons[i].push_back(individual);
-                       
-                       }
-                       //get last one
-                       taxons[i].push_back(tax);
+                       taxons[i] = parseTax(tax);
                 
                         //figure out who has the shortest taxonomy info. so you can start comparing there
                         if (taxons[i].size() < smallest) {
diff --git a/mothur.h b/mothur.h

index 23b9f189c992f97e2e9a47823cacce62db363696..ba88f71fabffced9509deed545cecde7228b2b63 100644 (file)
--- a/mothur.h
+++ b/mothur.h
@@ -70,6 +70,9 @@ struct IntNode {
         int rcoef;
         IntNode* left;
         IntNode* right;
+       
+       IntNode(int lv, int rv, IntNode* l, IntNode* r) : lvalue(lv), rvalue(rv), left(l), right(r) {};
+       IntNode() {};
  };
  
  struct ThreadNode {
diff --git a/phylotype.cpp b/phylotype.cpp

deleted file mode 100644 (file)

index f7af97a..0000000
--- a/phylotype.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- *  phylotype.cpp
- *  Mothur
- *
- *  Created by westcott on 11/3/09.
- *  Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-#include "phylotype.h"
-
-/**************************************************************************************************/
-PhyloType::PhyloType(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) 
-: Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch)  {}
-/**************************************************************************************************/
-string PhyloType::getTaxonomy(Sequence* seq) {
-       try {
-               string tax;
-               
-               //use database to find closest seq
-               vector<int> closest = database->findClosestSequences(seq, 1);
-               
-               //find that sequences taxonomy in map
-               it = taxonomy.find(names[closest[0]]);
-               
-               //is this sequence in the taxonomy file
-               if (it == taxonomy.end()) { //error not in file
-                       mothurOut("Error: sequence " + names[closest[0]] + " is not in the taxonomy file.  It is the closest match to sequence " + seq->getName() + ". " + seq->getName() + " will be disregarded."); mothurOutEndLine();
-                       tax = "bad seq";
-               }else {
-                       tax = it->second;
-               }
-               
-               return tax;     
-       }
-       catch(exception& e) {
-               errorOut(e, "PhyloType", "getTaxonomy");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-
diff --git a/phylotype.h b/phylotype.h

deleted file mode 100644 (file)

index ee649e3..0000000
--- a/phylotype.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef PHYLOTYPE_H
-#define PHYLOTYPE_H
-
-/*
- *  phylotype.h
- *  Mothur
- *
- *  Created by westcott on 11/3/09.
- *  Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-#include "mothur.h"
-#include "classify.h"
-
-/**************************************************************************************************/
-
-class PhyloType : public Classify {
-       
-public:
-       PhyloType(string, string, string, int, int, int, int, int);
-       ~PhyloType() {};
-       
-       string getTaxonomy(Sequence*);
-       
-       
-private:
-       
-};
-
-/**************************************************************************************************/
-
-#endif
-
diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp

index 82a1d2473a0bc35282abd46a5998ae65fbcce1bf..4eafaf8ed15e7fde8c60f9c3d80a376f1d796f3d 100644 (file)
--- a/seqsummarycommand.cpp
+++ b/seqsummarycommand.cpp
@@ -120,6 +120,10 @@ int SeqSummaryCommand::execute(){
                 int ptile97_5   = int(numSeqs * 0.975);
                 int ptile100    = numSeqs - 1;
                 
+               //to compensate for blank sequences that would result in startPosition and endPostion equalling -1
+               if (startPosition[0] == -1) {  startPosition[0] = 0;    }
+               if (endPosition[0] == -1)       {  endPosition[0] = 0;          }
+               
                 mothurOutEndLine();
                 mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer"); mothurOutEndLine();
                 mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0])); mothurOutEndLine();
diff --git a/sharedchao1.cpp b/sharedchao1.cpp

index a28f6092e8269bca28e5269952772758883ee64d..c19f0322c1358ba6718485a24933578c02bd5a3f 100644 (file)
--- a/sharedchao1.cpp
+++ b/sharedchao1.cpp
@@ -27,8 +27,7 @@ EstOutput SharedChao1::getValues(vector<SharedRAbundVector*> shared){
                 // the coeffient is 2.  Note: we only set the coeffient in f2 values.
                 
                 //create and initialize trees to 0.
-               initialTree(numGroups); 
-               
+               initialTree(numGroups);
                 
                 for (int i = 0; i < shared[0]->size(); i++) {
                         //get bin values and calc shared 
@@ -46,7 +45,7 @@ EstOutput SharedChao1::getValues(vector<SharedRAbundVector*> shared){
                         }
                 }
  
-                       
+               
                 //calculate chao1, (numleaves-1) because numleaves contains the ++ values.
                 bool bias = false;
                 for(int i=0;i<numLeaves;i++){
@@ -108,39 +107,30 @@ void SharedChao1::initialTree(int n) {
                 
                 f1leaves.resize(numNodes);
                 f2leaves.resize(numNodes);
-               
+       
                 //initialize leaf values
                 for (int i = 0; i < numLeaves; i++) {
-                       f1leaves[i] = new IntNode;
-                       f1leaves[i]->lvalue = 0;
-                       f1leaves[i]->rvalue = 0;
-                       f1leaves[i]->left = NULL;
-                       f1leaves[i]->right = NULL;
-                       
-                       f2leaves[i] = new IntNode;
-                       f2leaves[i]->lvalue = 0;
-                       f2leaves[i]->rvalue = 0;
-                       f2leaves[i]->left = NULL;
-                       f2leaves[i]->right = NULL;
+                       f1leaves[i] = new IntNode(0, 0, NULL, NULL);
+                       f2leaves[i] = new IntNode(0, 0, NULL, NULL);
                 }
                 
                 //set pointers to children
                 for (int j = numLeaves; j < numNodes; j++) {
-                       f1leaves[j] = new IntNode;
+                       f1leaves[j] = new IntNode();
                         f1leaves[j]->left = f1leaves[countleft];
                         f1leaves[j]->right = f1leaves[countright];
                                                 
-                       f2leaves[j] = new IntNode;
+                       f2leaves[j] = new IntNode();
                         f2leaves[j]->left = f2leaves[countleft];
                         f2leaves[j]->right =f2leaves[countright];
-                                               
+                       
                         countleft = countleft + 2;
                         countright = countright + 2;
                 }
                 
                 //point to root
                 f1root = f1leaves[numNodes-1];
-               
+       
                 //point to root
                 f2root = f2leaves[numNodes-1];
                 
@@ -148,6 +138,7 @@ void SharedChao1::initialTree(int n) {
                 setCoef(f2root, 0);
         }
         catch(exception& e) {
+               if ((toString(e.what()) == "vector::_M_fill_insert") || (toString(e.what()) == "St9bad_alloc")) { mothurOut("You are using " + toString(n) + " groups which creates 2^" + toString(n+1) + " nodes. Try reducing the number of groups you selected. "); mothurOutEndLine(); exit(1); }
                 errorOut(e, "SharedChao1", "initialTree");
                 exit(1);
         }
diff --git a/summarysharedcommand.cpp b/summarysharedcommand.cpp

index 575c0ded2d1a871289fecaefe96acb5fd1a36f70..65fcc9a161994da8714815d2a605ca1e96b9630a 100644 (file)
--- a/summarysharedcommand.cpp
+++ b/summarysharedcommand.cpp
@@ -48,7 +48,7 @@ SummarySharedCommand::SummarySharedCommand(string option){
                 
                 else {
                         //valid paramters for this command
-                       string Array[] =  {"label","calc","groups"};
+                       string Array[] =  {"label","calc","groups","all"};
                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                         
                         OptionParser parser(option);
@@ -95,6 +95,9 @@ SummarySharedCommand::SummarySharedCommand(string option){
                                 globaldata->Groups = Groups;
                         }
                         
+                       string temp = validParameter.validFile(parameters, "all", false);                               if (temp == "not found") { temp = "true"; }
+                       all = isTrue(temp);
+                       
                         if (abort == false) {
                         
                                 validCalculator = new ValidCalculators();
@@ -165,7 +168,7 @@ SummarySharedCommand::SummarySharedCommand(string option){
  void SummarySharedCommand::help(){
         try {
                 mothurOut("The summary.shared command can only be executed after a successful read.otu command.\n");
-               mothurOut("The summary.shared command parameters are label and calc.  No parameters are required.\n");
+               mothurOut("The summary.shared command parameters are label, calc and all.  No parameters are required.\n");
                 mothurOut("The summary.shared command should be in the following format: \n");
                 mothurOut("summary.shared(label=yourLabel, calc=yourEstimators, groups=yourGroups).\n");
                 mothurOut("Example summary.shared(label=unique-.01-.03, groups=B-C, calc=sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan).\n");
@@ -173,6 +176,8 @@ void SummarySharedCommand::help(){
                 mothurOut("The default value for calc is sharedsobs-sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan\n");
                 mothurOut("The default value for groups is all the groups in your groupfile.\n");
                 mothurOut("The label parameter is used to analyze specific labels in your input.\n");
+               mothurOut("The all parameter is used to specify if you want the estimate of all your groups together.  This estimate can only be made for sharedsobs and sharedchao calculators. The default is true.\n");
+               mothurOut("If you use sharedchao and run into memory issues, set all to false. \n");
                 mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed.  You must enter at least 2 valid groups.\n");
                 mothurOut("Note: No spaces between parameter labels (i.e. label), '=' and parameters (i.e.yourLabel).\n\n");
         }
@@ -202,8 +207,10 @@ int SummarySharedCommand::execute(){
                 if (sumCalculators.size() == 0) { return 0; }
                 //check if any calcs can do multiples
                 else{
-                       for (int i = 0; i < sumCalculators.size(); i++) {
-                               if (sumCalculators[i]->getMultiple() == true) { mult = true; }
+                       if (all){ 
+                               for (int i = 0; i < sumCalculators.size(); i++) {
+                                       if (sumCalculators[i]->getMultiple() == true) { mult = true; }
+                               }
                         }
                 }
                 
diff --git a/summarysharedcommand.h b/summarysharedcommand.h

index 63197e199493aba98106c43378340e4e394fdda1..090c9f4ef9ed40287040bec733f0e5d8802ff23d 100644 (file)
--- a/summarysharedcommand.h
+++ b/summarysharedcommand.h
@@ -34,7 +34,7 @@ private:
         InputData* input;
         ValidCalculators* validCalculator;
         
-       bool abort, allLines, mult;
+       bool abort, allLines, mult, all;
         set<string> labels; //holds labels to be used
         string label, calc, groups;
         vector<string>  Estimators, Groups;
author	westcott <westcott>
	Fri, 13 Nov 2009 16:01:14 +0000 (16:01 +0000)
committer	westcott <westcott>
	Fri, 13 Nov 2009 16:01:14 +0000 (16:01 +0000)
Mothur.xcodeproj/project.pbxproj		patch \| blob \| history
bayesian.cpp		patch \| blob \| history
bayesian.h		patch \| blob \| history
classify.cpp		patch \| blob \| history
classify.h		patch \| blob \| history
classifyseqscommand.cpp		patch \| blob \| history
classifyseqscommand.h		patch \| blob \| history
collect.cpp		patch \| blob \| history
collectdisplay.h		patch \| blob \| history
collectorscurvedata.h		patch \| blob \| history
collectsharedcommand.cpp		patch \| blob \| history
collectsharedcommand.h		patch \| blob \| history
database.hpp		patch \| blob \| history
display.h		patch \| blob \| history
doTaxonomy.cpp		patch \| blob \| history
doTaxonomy.h		patch \| blob \| history
hclustercommand.cpp		patch \| blob \| history
kmerdb.cpp		patch \| blob \| history
kmerdb.hpp		patch \| blob \| history
knn.cpp		patch \| blob \| history
mothur.h		patch \| blob \| history
phylotype.cpp	[deleted file]	patch \| blob \| history
phylotype.h	[deleted file]	patch \| blob \| history
seqsummarycommand.cpp		patch \| blob \| history
sharedchao1.cpp		patch \| blob \| history
summarysharedcommand.cpp		patch \| blob \| history
summarysharedcommand.h		patch \| blob \| history