From 7b3c9ca940891c1b20b3b7ec13e05d7e7b316b63 Mon Sep 17 00:00:00 2001
From: westcott <westcott>
Date: Fri, 13 Nov 2009 16:01:14 +0000
Subject: [PATCH] finished work on classify.seqs bayesian method and various
 bug fixes

---
 Mothur.xcodeproj/project.pbxproj |   6 -
 bayesian.cpp                     | 279 ++++++++++++++++++++++++++++++-
 bayesian.h                       |  20 ++-
 classify.cpp                     |  48 +++++-
 classify.h                       |  16 +-
 classifyseqscommand.cpp          |  71 ++++++--
 classifyseqscommand.h            |   6 +-
 collect.cpp                      |   2 +-
 collectdisplay.h                 |  12 +-
 collectorscurvedata.h            |   2 +-
 collectsharedcommand.cpp         |   8 +-
 collectsharedcommand.h           |   2 +-
 database.hpp                     |   3 +
 display.h                        |   2 +
 doTaxonomy.cpp                   |  25 ++-
 doTaxonomy.h                     |  18 +-
 hclustercommand.cpp              |  15 +-
 kmerdb.cpp                       |  33 ++++
 kmerdb.hpp                       |   3 +
 knn.cpp                          |  13 +-
 mothur.h                         |   3 +
 phylotype.cpp                    |  42 -----
 phylotype.h                      |  34 ----
 seqsummarycommand.cpp            |   4 +
 sharedchao1.cpp                  |  29 ++--
 summarysharedcommand.cpp         |  15 +-
 summarysharedcommand.h           |   2 +-
 27 files changed, 541 insertions(+), 172 deletions(-)
 delete mode 100644 phylotype.cpp
 delete mode 100644 phylotype.h
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj
index 9b064e0..27a8b91 100644
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -170,7 +170,6 @@
 		A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; };
 		A7861478109F5FA00010AAAF /* classifyseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */; };
 		A787810310A06D5D0086103D /* classify.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787810210A06D5D0086103D /* classify.cpp */; };
-		A787811510A074330086103D /* phylotype.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787811410A074330086103D /* phylotype.cpp */; };
 		A78781BA10A0813E0086103D /* doTaxonomy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78781B810A0813D0086103D /* doTaxonomy.cpp */; };
 		A787821110A0AAE70086103D /* bayesian.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787821010A0AAE70086103D /* bayesian.cpp */; };
 		A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78782AA10A1B1CB0086103D /* alignmentdb.cpp */; };
@@ -546,8 +545,6 @@
 		A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classifyseqscommand.cpp; sourceTree = "<group>"; };
 		A78780FE10A06B8B0086103D /* classify.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classify.h; sourceTree = SOURCE_ROOT; };
 		A787810210A06D5D0086103D /* classify.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classify.cpp; sourceTree = SOURCE_ROOT; };
-		A787811310A074330086103D /* phylotype.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylotype.h; sourceTree = SOURCE_ROOT; };
-		A787811410A074330086103D /* phylotype.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylotype.cpp; sourceTree = SOURCE_ROOT; };
 		A78781B810A0813D0086103D /* doTaxonomy.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = doTaxonomy.cpp; sourceTree = "<group>"; };
 		A78781B910A0813D0086103D /* doTaxonomy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = doTaxonomy.h; sourceTree = "<group>"; };
 		A787820F10A0AAE70086103D /* bayesian.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bayesian.h; sourceTree = "<group>"; };
@@ -1042,8 +1039,6 @@
 				A78781B810A0813D0086103D /* doTaxonomy.cpp */,
 				A787844310A1EBDD0086103D /* knn.h */,
 				A787844410A1EBDD0086103D /* knn.cpp */,
-				A787811310A074330086103D /* phylotype.h */,
-				A787811410A074330086103D /* phylotype.cpp */,
 			);
 			name = classifiers;
 			sourceTree = "<group>";
@@ -1272,7 +1267,6 @@
 				A751ACEA1098B283003D0911 /* readcluster.cpp in Sources */,
 				A7861478109F5FA00010AAAF /* classifyseqscommand.cpp in Sources */,
 				A787810310A06D5D0086103D /* classify.cpp in Sources */,
-				A787811510A074330086103D /* phylotype.cpp in Sources */,
 				A78781BA10A0813E0086103D /* doTaxonomy.cpp in Sources */,
 				A787821110A0AAE70086103D /* bayesian.cpp in Sources */,
 				A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */,
diff --git a/bayesian.cpp b/bayesian.cpp
index caae212..3a2d3e0 100644
--- a/bayesian.cpp
+++ b/bayesian.cpp
@@ -8,15 +8,116 @@
  */
 
 #include "bayesian.h"
+#include "kmer.hpp"
 
 /**************************************************************************************************/
-Bayesian::Bayesian(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) : 
-Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch) {}
+Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff) : 
+Classify(tfile, tempFile, method, ksize, 0, 0, 0, 0), kmerSize(ksize), confidenceThreshold(cutoff)  {
+	try {
+					
+		numKmers = database->getMaxKmer() + 1;
+		
+		//initialze probabilities
+		wordGenusProb.resize(numKmers);
+		
+		genusNodes = phyloTree->getGenusNodes(); 
+		
+		for (int j = 0; j < wordGenusProb.size(); j++) {	wordGenusProb[j].resize(genusNodes.size());		}
+			
+		//reset counts because we are on a new word
+		for (int j = 0; j < genusNodes.size(); j++) {
+			TaxNode temp = phyloTree->get(genusNodes[j]);
+			genusTotals.push_back(temp.accessions.size());
+		}
+
+		
+		/************calculate the probablity that each word will be in a specific taxonomy*************/
+		ofstream out;
+		string probFileName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob";
+		ifstream probFileTest(probFileName.c_str());
+		
+		ofstream out2;
+		string probFileName2 = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero";
+		ifstream probFileTest2(probFileName2.c_str());
+		
+		int start = time(NULL);
+		
+		if(probFileTest && probFileTest2){	
+			mothurOut("Reading template probabilities...     "); cout.flush();
+			readProbFile(probFileTest, probFileTest2);	
+		}else{
+			mothurOut("Calculating template probabilities...     "); cout.flush();
+
+			ofstream out;
+			openOutputFile(probFileName, out);
+			
+			ofstream out2;
+			openOutputFile(probFileName2, out2);
+			
+			//for each word
+			for (int i = 0; i < numKmers; i++) {
+				
+				out << i << '\t';
+				
+				vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
+				
+				map<int, int> count;
+				for (int k = 0; k < genusNodes.size(); k++) {  count[genusNodes[k]] = 0;  }			
+						
+				//for each sequence with that word
+				for (int j = 0; j < seqsWithWordi.size(); j++) {
+					int temp = phyloTree->getIndex(names[seqsWithWordi[j]]);
+					count[temp]++;  //increment count of seq in this genus who have this word
+				}
+				
+				//probabilityInTemplate = (# of seqs with that word in template + 0.05) / (total number of seqs in template + 1);
+				float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1);
+				
+				int numNotZero = 0;
+				for (int k = 0; k < genusNodes.size(); k++) {
+					//probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
+					wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1));  
+					if (count[genusNodes[k]] != 0) {  out << k << '\t' << wordGenusProb[i][k] << '\t';  numNotZero++;  }
+				}
+				out << endl;
+				out2 << probabilityInTemplate << '\t' << numNotZero << endl;
+			}
+			
+			out.close();
+			out2.close();
+		}
+		
+		
+		mothurOut("DONE."); mothurOutEndLine();
+		mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); mothurOutEndLine();
+	}
+	catch(exception& e) {
+		errorOut(e, "Bayesian", "getTaxonomy");
+		exit(1);
+	}
+}
 /**************************************************************************************************/
 string Bayesian::getTaxonomy(Sequence* seq) {
 	try {
 		string tax;
+		Kmer kmer(kmerSize);
 		
+		//get words contained in query
+		//getKmerString returns a string where the index in the string is hte kmer number 
+		//and the character at that index can be converted to be the number of times that kmer was seen
+		string queryKmerString = kmer.getKmerString(seq->getUnaligned()); 
+		vector<int> queryKmers;
+		for (int i = 0; i < queryKmerString.length(); i++) {
+			if (queryKmerString[i] != '!') { //this kmer is in the query
+				queryKmers.push_back(i);
+			}
+		}
+	
+		int index = getMostProbableTaxonomy(queryKmers);
+					
+		//bootstrap - to set confidenceScore
+		int numToSelect = queryKmers.size() / 8;
+		tax = bootstrapResults(queryKmers, index, numToSelect);
 				
 		return tax;	
 	}
@@ -26,4 +127,178 @@ string Bayesian::getTaxonomy(Sequence* seq) {
 	}
 }
 /**************************************************************************************************/
+string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect) {
+	try {
+		
+		//taxConfidenceScore.clear(); //clear out previous seqs scores
+				
+		vector< map<string, int> > confidenceScores; //you need the added vector level of confusion to account for the level that name is seen since they can be the same
+										//map of classification to confidence for all areas seen
+									   //ie. Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
+									   //ie. Bacteria -> 100, Alphaproteobacteria -> 100, Rhizobiales -> 87, Azorhizobium_et_rel. -> 78, Methylobacterium_et_rel. -> 70, Bosea -> 50
+		confidenceScores.resize(100);  //if you have more than 100 levels of classification...
+		
+		map<string, int>::iterator itBoot;
+		map<string, int>::iterator itBoot2;
+		map<int, int>::iterator itConvert;
+		
+		for (int i = 0; i < 100; i++) {
+			vector<int> temp;
+						
+			for (int j = 0; j < numToSelect; j++) {
+				int index = int(rand() % kmers.size());
+				
+				//add word to temp
+				temp.push_back(kmers[index]);
+			}
+			
+			//get taxonomy
+			int newTax = getMostProbableTaxonomy(temp);
+			TaxNode taxonomy = phyloTree->get(newTax);
+			
+			//add to confidence results
+			while (taxonomy.level != 0) { //while you are not at the root
+				
+				itBoot2 = confidenceScores[taxonomy.level].find(taxonomy.name); //is this a classification we already have a count on
+				
+				if (itBoot2 == confidenceScores[taxonomy.level].end()) { //not already in confidence scores
+					confidenceScores[taxonomy.level][taxonomy.name] = 1;
+				}else{
+					confidenceScores[taxonomy.level][taxonomy.name]++;
+				}
+			
+				taxonomy = phyloTree->get(taxonomy.parent);
+			}
+		}
+		
+		string confidenceTax = "";
+		simpleTax = "";
+		TaxNode seqTax = phyloTree->get(tax);
+		
+		while (seqTax.level != 0) { //while you are not at the root
+				
+				itBoot2 = confidenceScores[seqTax.level].find(seqTax.name); //is this a classification we already have a count on
+				
+				int confidence = 0;
+				if (itBoot2 != confidenceScores[seqTax.level].end()) { //not already in confidence scores
+					confidence = confidenceScores[seqTax.level][seqTax.name];
+				}
+				
+				if (confidence >= confidenceThreshold) {
+					confidenceTax = seqTax.name + "(" + toString(confidence) + ");" + confidenceTax;
+					simpleTax = seqTax.name + ";" + simpleTax;
+				}
+				
+				seqTax = phyloTree->get(seqTax.parent);
+		}
+		
+		return confidenceTax;
+		
+	}
+	catch(exception& e) {
+		errorOut(e, "Bayesian", "bootstrapResults");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+int Bayesian::getMostProbableTaxonomy(vector<int> queryKmer) {
+	try {
+		int indexofGenus;
+		
+		double maxProbability = -1000000.0;
+		//find taxonomy with highest probability that this sequence is from it
+		for (int k = 0; k < genusNodes.size(); k++) {
+		
+			//for each taxonomy calc its probability
+			double prob = 1.0;
+			for (int i = 0; i < queryKmer.size(); i++) {
+				prob += wordGenusProb[queryKmer[i]][k];
+			}
+		
+			//is this the taxonomy with the greatest probability?
+			if (prob > maxProbability) { 
+				indexofGenus = genusNodes[k];
+				maxProbability = prob;
+			}
+		}
+
+		return indexofGenus;
+	}
+	catch(exception& e) {
+		errorOut(e, "Bayesian", "getMostProbableTaxonomy");
+		exit(1);
+	}
+}
+/*************************************************************************************************
+map<string, int> Bayesian::parseTaxMap(string newTax) {
+	try{
+	
+		map<string, int> parsed;
+		
+		newTax = newTax.substr(0, newTax.length()-1);  //get rid of last ';'
+	
+		//parse taxonomy
+		string individual;
+		while (newTax.find_first_of(';') != -1) {
+			individual = newTax.substr(0,newTax.find_first_of(';'));
+			newTax = newTax.substr(newTax.find_first_of(';')+1, newTax.length());
+			parsed[individual] = 1;
+		}
+		
+		//get last one
+		parsed[newTax] = 1;
+
+		return parsed;
+		
+	}
+	catch(exception& e) {
+		errorOut(e, "Bayesian", "parseTax");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+void Bayesian::readProbFile(ifstream& in, ifstream& inNum) {
+	try{
+		
+		int kmer, name, count;  count = 0;
+		vector<int> num; num.resize(numKmers);
+		float prob;
+		vector<float> zeroCountProb; zeroCountProb.resize(numKmers);		
+		
+		while (inNum) {
+			inNum >> zeroCountProb[count] >> num[count];  
+			count++;
+			gobble(inNum);
+		}
+		inNum.close();
+		
+		while(in) {
+			in >> kmer;
+			
+			//set them all to zero value
+			for (int i = 0; i < genusNodes.size(); i++) {
+				wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
+			}
+			
+			//get probs for nonzero values
+			for (int i = 0; i < num[kmer]; i++) {
+				in >> name >> prob;
+				wordGenusProb[kmer][name] = prob;
+			}
+			
+			gobble(in);
+		}
+		in.close();
+	}
+	catch(exception& e) {
+		errorOut(e, "Bayesian", "readProbFile");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
+
+
+
+
 
diff --git a/bayesian.h b/bayesian.h
index cbe949f..9b93c83 100644
--- a/bayesian.h
+++ b/bayesian.h
@@ -18,13 +18,29 @@
 class Bayesian : public Classify {
 	
 public:
-	Bayesian(string, string, string, int, int, int, int, int);
+	Bayesian(string, string, string, int, int);
 	~Bayesian() {};
 	
 	string getTaxonomy(Sequence*);
-	
+	//map<string, int> getConfidenceScores() { return taxConfidenceScore; }
 	
 private:
+	//map<string, vector<double> > taxonomyProbability;	//probability of a word being in a given taxonomy. 
+													//taxonomyProbability[bacteria;][0] = probabtility that a sequence within bacteria; would contain kmer 0;
+													//taxonomyProbability[bacteria;][1] = probabtility that a sequence within bacteria; would contain kmer 1...
+	
+	vector< vector<float> > wordGenusProb;	//vector of maps from genus to probability
+												//wordGenusProb[0][392] = probability that a sequence within genus that's index in the tree is 392 would contain kmer 0;
+	
+	vector<int> genusTotals;
+	vector<int> genusNodes;  //indexes in phyloTree where genus' are located
+	
+	int kmerSize, numKmers, confidenceThreshold;
+	
+	string bootstrapResults(vector<int>, int, int);
+	int getMostProbableTaxonomy(vector<int>);
+	void readProbFile(ifstream&, ifstream&);
+	//map<string, int> parseTaxMap(string);
 	
 };
 
diff --git a/classify.cpp b/classify.cpp
index 605c1f8..f6b7e80 100644
--- a/classify.cpp
+++ b/classify.cpp
@@ -17,9 +17,9 @@
 
 Classify::Classify(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) : taxFile(tfile), templateFile(tempFile) {		
 	try {											
+		readTaxonomy(taxFile);	
 		
-		readTaxonomy(taxFile);
-		
+		int start = time(NULL);
 		int numSeqs = 0;
 		//need to know number of template seqs for suffixdb
 		if (method == "suffix") {
@@ -51,12 +51,13 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, i
 		if (needToGenerate) {
 			ifstream fastaFile;
 			openInputFile(tempFile, fastaFile);
-		
+			
 			while (!fastaFile.eof()) {
 				Sequence temp(fastaFile);
 				gobble(fastaFile);
 			
 				names.push_back(temp.getName());
+								
 				database->addSequence(temp);	
 			}
 			fastaFile.close();
@@ -69,7 +70,7 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, i
 			
 			ifstream fastaFile;
 			openInputFile(tempFile, fastaFile);
-		
+			
 			while (!fastaFile.eof()) {
 				Sequence temp(fastaFile);
 				gobble(fastaFile);
@@ -82,6 +83,7 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, i
 		database->setNumSeqs(names.size());
 		
 		mothurOut("DONE."); mothurOutEndLine();
+		mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); mothurOutEndLine();
 
 	}
 	catch(exception& e) {
@@ -93,7 +95,9 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, i
 
 void Classify::readTaxonomy(string file) {
 	try {
-	
+		
+		phyloTree = new PhyloTree();
+		
 		ifstream inTax;
 		openInputFile(file, inTax);
 	
@@ -106,9 +110,17 @@ void Classify::readTaxonomy(string file) {
 			inTax >> name >> taxInfo;
 			
 			taxonomy[name] = taxInfo;
+			
+			//itTax = taxList.find(taxInfo);
+			//if (itTax == taxList.end()) { //this is new taxonomy
+				//taxList[taxInfo] = 1;
+			//}else { taxList[taxInfo]++;	}
+			phyloTree->addSeqToTree(name, taxInfo);
 		
 			gobble(inTax);
 		}
+		
+		phyloTree->assignHeirarchyIDs(0);
 		inTax.close();
 	
 		mothurOut("DONE.");
@@ -122,3 +134,29 @@ void Classify::readTaxonomy(string file) {
 }
 /**************************************************************************************************/
 
+vector<string> Classify::parseTax(string tax) {
+	try {
+		vector<string> taxons;
+		
+		tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
+	
+		//parse taxonomy
+		string individual;
+		while (tax.find_first_of(';') != -1) {
+			individual = tax.substr(0,tax.find_first_of(';'));
+			tax = tax.substr(tax.find_first_of(';')+1, tax.length());
+			taxons.push_back(individual);
+			
+		}
+		//get last one
+		taxons.push_back(tax);
+		
+		return taxons;
+	}
+	catch(exception& e) {
+		errorOut(e, "Classify", "parseTax");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
diff --git a/classify.h b/classify.h
index 2926800..0314235 100644
--- a/classify.h
+++ b/classify.h
@@ -15,33 +15,41 @@
 
 #include "mothur.h"
 #include "database.hpp"
-
+#include "doTaxonomy.h"
 
 
 class Sequence;
 
+
 /**************************************************************************************************/
 
 class Classify {
 
 public:
 	Classify(string, string, string, int, int, int, int, int);
-	Classify(){};
+	Classify(){	delete phyloTree;  }
 	
 	virtual ~Classify(){};
 	virtual string getTaxonomy(Sequence*) = 0;
+	//virtual map<string, int> getConfidenceScores() { return taxConfidenceScore; }
+	//virtual vector<string> parseTax(string);
+	virtual string getSimpleTax()  { return simpleTax;	}
 	
 protected:
 
 	map<string, string> taxonomy;  //name maps to taxonomy
+	//map<string, int> genusCount;  //maps genus to count - in essence a list of how many seqs are in each taxonomy
+	map<string, int>::iterator itTax;
 	map<string, string>::iterator it;
 	Database* database;
+	PhyloTree* phyloTree;
 	
-	string taxFile, templateFile;
+	string taxFile, templateFile, simpleTax;
 	vector<string> names;
+	//map<string, int> taxConfidenceScore;
 	
 	void readTaxonomy(string);
-		
+	vector<string> parseTax(string);
 };
 
 /**************************************************************************************************/
diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp
index 69ef712..2564293 100644
--- a/classifyseqscommand.cpp
+++ b/classifyseqscommand.cpp
@@ -9,7 +9,6 @@
 
 #include "classifyseqscommand.h"
 #include "sequence.hpp"
-#include "phylotype.h"
 #include "bayesian.h"
 #include "doTaxonomy.h"
 #include "knn.h"
@@ -27,7 +26,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){
 		else {
 			
 			//valid paramters for this command
-			string AlignArray[] =  {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted"};
+			string AlignArray[] =  {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff"};
 			vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
 			
 			OptionParser parser(option);
@@ -77,7 +76,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){
 			
 			search = validParameter.validFile(parameters, "search", false);		if (search == "not found"){	search = "kmer";		}
 			
-			method = validParameter.validFile(parameters, "method", false);		if (method == "not found"){	method = "phylotype";	}
+			method = validParameter.validFile(parameters, "method", false);		if (method == "not found"){	method = "bayesian";	}
 			
 			temp = validParameter.validFile(parameters, "match", false);		if (temp == "not found"){	temp = "1.0";			}
 			convert(temp, match);  
@@ -93,7 +92,15 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){
 			
 			temp = validParameter.validFile(parameters, "numwanted", false);	if (temp == "not found"){	temp = "10";			}
 			convert(temp, numWanted);
+			
+			temp = validParameter.validFile(parameters, "cutoff", false);		if (temp == "not found"){	temp = "0";				}
+			convert(temp, cutoff);
 
+			
+			if ((method == "bayesian") && (search != "kmer"))  { 
+				mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); mothurOutEndLine();
+				search = "kmer";
+			}
 		}
 		
 	}
@@ -120,7 +127,7 @@ void ClassifySeqsCommand::help(){
 		mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend and numwanted.\n");
 		mothurOut("The template, fasta and taxonomy parameters are required.\n");
 		mothurOut("The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer and blast. The default is kmer.\n");
-		mothurOut("The method parameter allows you to specify classification method to use.  Your options are: phylotype, bayesian and knn. The default is phylotype.\n");
+		mothurOut("The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n");
 		mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
 		mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
 		mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
@@ -128,9 +135,10 @@ void ClassifySeqsCommand::help(){
 		mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
 		mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -2.0.\n");
 		mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n");
+		mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n");
 		mothurOut("The classify.seqs command should be in the following format: \n");
 		mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
-		mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=phylotype, search=gotoh, ksize=8, processors=2)\n");
+		mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n");
 		mothurOut("The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n");
 		mothurOut("The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n");
 		mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
@@ -148,22 +156,23 @@ int ClassifySeqsCommand::execute(){
 	try {
 		if (abort == true) {	return 0;	}
 		
-		int start = time(NULL);
 		
-		if(method == "phylotype"){	classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);					}
-		//else if(method == "bayesian")		{	classify = new Bayesian(taxonomyFileName, templateFileName, search);		}
+		
+		if(method == "bayesian")			{	classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff);		}
 		else if(method == "knn")			{	classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted);				}
 		else {
-			mothurOut(search + " is not a valid method option. I will run the command using phylotype.");
+			mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
 			mothurOutEndLine();
-			classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);
+			classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff);	
 		}
 
 		int numFastaSeqs = 0;
 		
 		string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy";
+		string tempTaxonomyFile = getRootName(fastaFileName) + "taxonomy.temp";
 		string taxSummary = getRootName(fastaFileName) + "tax.summary";
 		
+		int start = time(NULL);
 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 		if(processors == 1){
 			ifstream inFASTA;
@@ -173,7 +182,7 @@ int ClassifySeqsCommand::execute(){
 			
 			lines.push_back(new linePair(0, numFastaSeqs));
 			
-			driver(lines[0], newTaxonomyFile);
+			driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
 			
 		}
 		else{
@@ -203,13 +212,16 @@ int ClassifySeqsCommand::execute(){
 				}
 				lines.push_back(new linePair(startPos, numSeqsPerProcessor));
 			}
-			createProcesses(newTaxonomyFile); 
+			createProcesses(newTaxonomyFile, tempTaxonomyFile); 
 			
 			rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str());
+			rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str());
 			
 			for(int i=1;i<processors;i++){
 				appendTaxFiles((newTaxonomyFile + toString(processIDS[i]) + ".temp"), newTaxonomyFile);
+				appendTaxFiles((tempTaxonomyFile + toString(processIDS[i]) + ".temp"), tempTaxonomyFile);
 				remove((newTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
+				remove((tempTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
 			}
 			
 		}
@@ -221,14 +233,14 @@ int ClassifySeqsCommand::execute(){
 		
 		lines.push_back(new linePair(0, numFastaSeqs));
 		
-		driver(lines[0], newTaxonomyFile);
+		driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
 #endif
 		
 		delete classify;
 		
 		//make taxonomy tree from new taxonomy file 
 		ifstream inTaxonomy;
-		openInputFile(newTaxonomyFile, inTaxonomy);
+		openInputFile(tempTaxonomyFile, inTaxonomy);
 		
 		string accession, taxaList;
 		PhyloTree taxaBrowser;
@@ -242,6 +254,7 @@ int ClassifySeqsCommand::execute(){
 			gobble(inTaxonomy);
 		}
 		inTaxonomy.close();
+		remove(tempTaxonomyFile.c_str());
 		
 		taxaBrowser.assignHeirarchyIDs(0);
 		
@@ -264,7 +277,7 @@ int ClassifySeqsCommand::execute(){
 }
 /**************************************************************************************************/
 
-void ClassifySeqsCommand::createProcesses(string taxFileName) {
+void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile) {
 	try {
 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 		int process = 0;
@@ -278,7 +291,7 @@ void ClassifySeqsCommand::createProcesses(string taxFileName) {
 				processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 				process++;
 			}else if (pid == 0){
-				driver(lines[process], taxFileName + toString(getpid()) + ".temp");
+				driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp");
 				exit(0);
 			}else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
 		}
@@ -321,10 +334,13 @@ void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
 
 //**********************************************************************************************************************
 
-int ClassifySeqsCommand::driver(linePair* line, string taxFName){
+int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName){
 	try {
 		ofstream outTax;
 		openOutputFile(taxFName, outTax);
+		
+		ofstream outTaxSimple;
+		openOutputFile(tempTFName, outTaxSimple);
 	
 		ifstream inFASTA;
 		openInputFile(fastaFileName, inFASTA);
@@ -340,14 +356,33 @@ int ClassifySeqsCommand::driver(linePair* line, string taxFName){
 			taxonomy = classify->getTaxonomy(candidateSeq);
 			
 			if (taxonomy != "bad seq") {
-				outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+				//if (method != "bayesian") {
+					outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+					outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
+				//}else{
+					//vector<string> pTax = classify->parseTax(taxonomy);
+					//map<string, int> confidence = classify->getConfidenceScores();
+					
+					//outTax << candidateSeq->getName() << '\t';
+					//for (int j = 0; j < pTax.size(); j++) {
+						//if (confidence[pTax[j]] > cutoff) {
+						//	outTax << pTax[j] << "(" << confidence[pTax[j]] << ");";
+						//}else{ break; }
+					//}
+					//outTax << endl;
+				//}
 			}
 							
 			delete candidateSeq;
+			
+			if(i % 100 == 0){
+				mothurOut("Classifying sequence " + toString(i)); mothurOutEndLine();
+			}
 		}
 		
 		inFASTA.close();
 		outTax.close();
+		outTaxSimple.close();
 		
 		return 1;
 	}
diff --git a/classifyseqscommand.h b/classifyseqscommand.h
index a77b7d1..a73aa0c 100644
--- a/classifyseqscommand.h
+++ b/classifyseqscommand.h
@@ -35,13 +35,13 @@ private:
 	Classify* classify;
 	
 	string fastaFileName, templateFileName, distanceFileName, search, method, taxonomyFileName;
-	int processors, kmerSize, numWanted;
+	int processors, kmerSize, numWanted, cutoff;
 	float match, misMatch, gapOpen, gapExtend;
 	bool abort;
 	
-	int driver(linePair*, string);
+	int driver(linePair*, string, string);
 	void appendTaxFiles(string, string);
-	void createProcesses(string); 
+	void createProcesses(string, string); 
 };
 
 #endif
diff --git a/collect.cpp b/collect.cpp
index 77d279d..c270ae1 100644
--- a/collect.cpp
+++ b/collect.cpp
@@ -88,7 +88,7 @@ try {
 				
                 for(int i=0;i<displays.size();i++){
                         ccd->registerDisplay(displays[i]); //adds a display[i] to cdd
-						if (displays[i]->isCalcMultiple() == true)  {   displays[i]->init(groupLabelAll); }
+						if ((displays[i]->isCalcMultiple() == true) && (displays[i]->getAll() == true)) {   displays[i]->init(groupLabelAll); }
 						else {  displays[i]->init(groupLabel);  }           
                 }
                 
diff --git a/collectdisplay.h b/collectdisplay.h
index 0f18da6..15cc5a7 100644
--- a/collectdisplay.h
+++ b/collectdisplay.h
@@ -15,6 +15,8 @@ class CollectDisplay : public Display {
 public:
 	CollectDisplay(Calculator* calc, FileOutput* file) : estimate(calc), output(file) {timesCalled = 0;};
 	~CollectDisplay()	{	delete estimate; delete output;		}
+	
+	
 	void update(SAbundVector* rank){
 		nSeqs=rank->getNumSeqs();
 		data = estimate->getValues(rank);
@@ -45,7 +47,7 @@ public:
 			n++;
 		}
 		
-		if (estimate->getMultiple() == true) { 
+		if ((estimate->getMultiple() == true) && all) { 
 			numGroupComb++; 
 			groupData.resize((numGroupComb*data.size()), 0);
 			//is this the time its called with all values
@@ -74,10 +76,14 @@ public:
 	void init(string s)		{	output->initFile(s);	};
 	void reset()			{	output->resetFile();	};
 	void close()			{	output->resetFile();	};
+	void setAll(bool a)		{	all = a;				}
+	bool getAll()			{	return all;				}
+	
 	bool isCalcMultiple() { return estimate->getMultiple(); }
 	
 	string getName()	{  return estimate->getName();  }
-
+	
+	
 private:
 	
 	Calculator* estimate;
@@ -85,6 +91,8 @@ private:
 	int nSeqs, timesCalled, numGroupComb;
 	vector<double> data;
 	vector<double> groupData;
+	bool all;
+	
 };
 
 /***********************************************************************/
diff --git a/collectorscurvedata.h b/collectorscurvedata.h
index d90e84a..9c70e7a 100644
--- a/collectorscurvedata.h
+++ b/collectorscurvedata.h
@@ -48,7 +48,7 @@ public:
 	void notifyDisplays(){	
 		for(set<Display*>::iterator pos=displays.begin();pos!=displays.end();pos++){
 //cout << (*pos)->getName() << endl;
-			if ( ((*pos)->isCalcMultiple() == true) || ( ((*pos)->isCalcMultiple() == false) && (shared.size() == 2) ) ) {
+			if ( ( ((*pos)->isCalcMultiple() == true) && ((*pos)->getAll() == true) ) || (shared.size() == 2)  ) {
 				(*pos)->update(shared, NumSeqs, NumGroupComb);
 			}
 		}	
diff --git a/collectsharedcommand.cpp b/collectsharedcommand.cpp
index 844b138..a772eb1 100644
--- a/collectsharedcommand.cpp
+++ b/collectsharedcommand.cpp
@@ -50,7 +50,7 @@ CollectSharedCommand::CollectSharedCommand(string option){
 		
 		else {
 			//valid paramters for this command
-			string Array[] =  {"freq","label","calc","groups"};
+			string Array[] =  {"freq","label","calc","groups","all"};
 			vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
 			
 			OptionParser parser(option);
@@ -102,6 +102,9 @@ CollectSharedCommand::CollectSharedCommand(string option){
 			string temp;
 			temp = validParameter.validFile(parameters, "freq", false);			if (temp == "not found") { temp = "100"; }
 			convert(temp, freq); 
+			
+			temp = validParameter.validFile(parameters, "all", false);				if (temp == "not found") { temp = "true"; }
+			all = isTrue(temp);
 						
 			if (abort == false) {
 			
@@ -181,6 +184,8 @@ void CollectSharedCommand::help(){
 		mothurOut("The default value for groups is all the groups in your groupfile.\n");
 		validCalculator->printCalc("shared", cout);
 		mothurOut("The label parameter is used to analyze specific labels in your input.\n");
+		mothurOut("The all parameter is used to specify if you want the estimate of all your groups together.  This estimate can only be made for sharedsobs and sharedchao calculators. The default is true.\n");
+		mothurOut("If you use sharedchao and run into memory issues, set all to false. \n");
 		mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed.  You must enter at least 2 valid groups.\n");
 		mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListfile).\n\n");
 		
@@ -212,6 +217,7 @@ int CollectSharedCommand::execute(){
 		
 		//if the users entered no valid calculators don't execute command
 		if (cDisplays.size() == 0) { return 0; }
+		for(int i=0;i<cDisplays.size();i++){	cDisplays[i]->setAll(all);	}	
 		
 		read = new ReadOTUFile(globaldata->inputFileName);	
 		read->read(&*globaldata); 
diff --git a/collectsharedcommand.h b/collectsharedcommand.h
index e8b7b96..139a0cf 100644
--- a/collectsharedcommand.h
+++ b/collectsharedcommand.h
@@ -45,7 +45,7 @@ private:
 	int freq;
 	string format;
 
-	bool abort, allLines;
+	bool abort, allLines, all;
 	set<string> labels; //holds labels to be used
 	string label, calc, groups;
 	vector<string>  Estimators, Groups;
diff --git a/database.hpp b/database.hpp
index 01916d1..f52cdd5 100644
--- a/database.hpp
+++ b/database.hpp
@@ -52,6 +52,9 @@ public:
 	virtual int getLongestBase(); 
 	virtual void readKmerDB(ifstream&){};
 	virtual void setNumSeqs(int i) {	numSeqs = i; 	}
+	virtual vector<int> getSequencesWithKmer(int){ vector<int> filler; return filler; };  
+	virtual int getMaxKmer(){	return 1;	};
+
 	
 protected:
 	int numSeqs, longest;
diff --git a/display.h b/display.h
index 398d684..5963975 100644
--- a/display.h
+++ b/display.h
@@ -18,6 +18,8 @@ public:
 	virtual void reset() = 0;
 	virtual void close() = 0;
 	virtual bool isCalcMultiple() = 0;
+	virtual void setAll(bool){}
+	virtual bool getAll()	{	bool a; return a;	}
 	virtual string getName() { return ""; };
 	virtual ~Display() {}
 	
diff --git a/doTaxonomy.cpp b/doTaxonomy.cpp
index e664764..06337b4 100644
--- a/doTaxonomy.cpp
+++ b/doTaxonomy.cpp
@@ -59,11 +59,13 @@ void PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
 		if(childPointer != tree[currentNode].children.end()){	//if the node already exists, move on
 			currentNode = childPointer->second;
 			tree[currentNode].accessions.push_back(seqName);
+			name2Taxonomy[seqName] = currentNode;
 		}
 		else{											//otherwise, create it
 			tree.push_back(TaxNode(taxon));
 			numNodes++;
 			tree[currentNode].children[taxon] = numNodes-1;
+			tree[numNodes-1].parent = currentNode;
 
 //			int numChildren = tree[currentNode].children.size();
 //			string heirarchyID = tree[currentNode].heirarchyID;
@@ -71,15 +73,30 @@ void PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
 			
 			currentNode = tree[currentNode].children[taxon];
 			tree[currentNode].accessions.push_back(seqName);
-
+			name2Taxonomy[seqName] = currentNode;
 //			tree[currentNode].level = level;
 //			tree[currentNode].childNumber = numChildren;
 //			tree[currentNode].heirarchyID = heirarchyID + '.' + toString(tree[currentNode].childNumber);
 		}
-
+		
+		if (seqTaxonomy == "") {   uniqueTaxonomies[currentNode] = currentNode;	}
+	}
+}
+/**************************************************************************************************/
+vector<int> PhyloTree::getGenusNodes()	{
+	try {
+		genusIndex.clear();
+		//generate genusIndexes
+		map<int, int>::iterator it2;
+		for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(it2->first);	}
+		
+		return genusIndex;
+	}
+	catch(exception& e) {
+		errorOut(e, "PhyloTree", "getGenusNodes");
+		exit(1);
 	}
 }
-
 /**************************************************************************************************/
 
 void PhyloTree::assignHeirarchyIDs(int index){
@@ -92,9 +109,7 @@ void PhyloTree::assignHeirarchyIDs(int index){
 		counter++;
 		tree[it->second].level = tree[index].level + 1;
 		assignHeirarchyIDs(it->second);
-
 	}
-	
 }
 
 /**************************************************************************************************/
diff --git a/doTaxonomy.h b/doTaxonomy.h
index c1ec3ad..6901d27 100644
--- a/doTaxonomy.h
+++ b/doTaxonomy.h
@@ -1,3 +1,6 @@
+#ifndef DOTAXONOMY_H
+#define DOTAXONOMY_H
+
 /*
  *  doTaxonomy.h
  *  
@@ -12,8 +15,8 @@
 /**************************************************************************************************/
 
 struct TaxNode {
-	vector<string> accessions;
-	map<string, int> children;
+	vector<string> accessions;	//names of seqs in this branch of tree
+	map<string, int> children;  //childs name to index in tree
 	int parent, childNumber, level;
 	string name, heirarchyID;
 	
@@ -30,9 +33,17 @@ public:
 	void addSeqToTree(string, string);
 	void assignHeirarchyIDs(int);
 	void print(ofstream&);
+	vector<int> getGenusNodes();	
+	TaxNode get(int i)				{	return tree[i];	}
+	TaxNode get(string seqName)		{	return tree[name2Taxonomy[seqName]];	}
+	int getIndex(string seqName)	{	return name2Taxonomy[seqName];	}
+	string getName(int i)			{	return tree[i].name;	}
 private:
 	string getNextTaxon(string&);
 	vector<TaxNode> tree;
+	vector<int> genusIndex; //holds the indexes in tree where the genus level taxonomies are stored
+	map<string, int> name2Taxonomy;  //maps name to index in tree
+	map<int, int> uniqueTaxonomies;  //map of unique taxonomies
 	void print(int, ofstream&);
 	int numNodes;
 	int numSeqs;
@@ -40,3 +51,6 @@ private:
 
 /**************************************************************************************************/
 
+#endif
+
+
diff --git a/hclustercommand.cpp b/hclustercommand.cpp
index e140f8c..2ad6333 100644
--- a/hclustercommand.cpp
+++ b/hclustercommand.cpp
@@ -53,8 +53,8 @@ HClusterCommand::HClusterCommand(string option){
 			if (namefile == "not open") { abort = true; }	
 			else if (namefile == "not found") { namefile = ""; }
 			
-			if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a cluster command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
-			else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a cluster command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
+			if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a hcluster command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
+			else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a hcluster command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
 		
 			if (columnfile != "") {
 				if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
@@ -114,11 +114,12 @@ HClusterCommand::HClusterCommand(string option){
 
 void HClusterCommand::help(){
 	try {
-		mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
-		mothurOut("The cluster command parameter options are method, cuttoff, precision, showabund and timing. No parameters are required.\n");
-		mothurOut("The cluster command should be in the following format: \n");
-		mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
-		mothurOut("The acceptable cluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");	
+		mothurOut("The hcluster command parameter options are cutoff, precision, method, showabund, timing, phylip, column, name and sorted. Phylip or column and name are required.\n");
+		mothurOut("The phylip and column parameter allow you to enter your distance file, and sorted indicates whether your column distance file is already sorted. \n");
+		mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
+		mothurOut("The hcluster command should be in the following format: \n");
+		mothurOut("hcluster(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
+		mothurOut("The acceptable hcluster methods is furthest, but we hope to add nearest and average in the future.\n\n");	
 	}
 	catch(exception& e) {
 		errorOut(e, "HClusterCommand", "help");
diff --git a/kmerdb.cpp b/kmerdb.cpp
index bf8679e..2560a44 100644
--- a/kmerdb.cpp
+++ b/kmerdb.cpp
@@ -176,3 +176,36 @@ void KmerDB::readKmerDB(ifstream& kmerDBFile){
 }
 
 /**************************************************************************************************/
+int KmerDB::getCount(int kmer) {
+	try {
+		if (kmer < 0) { return 0; }  //if user gives negative number
+		else if (kmer > maxKmer) {	return 0;	}  //or a kmer that is bigger than maxkmer
+		else {	return kmerLocations[kmer].size();	}  // kmer is in vector range
+	}
+	catch(exception& e) {
+		errorOut(e, "KmerDB", "getCount");
+		exit(1);
+	}	
+}
+/**************************************************************************************************/
+vector<int> KmerDB::getSequencesWithKmer(int kmer) {
+	try {
+		
+		vector<int> seqs;
+	
+		if (kmer < 0) { }  //if user gives negative number
+		else if (kmer > maxKmer) {	}  //or a kmer that is bigger than maxkmer
+		else {	seqs = kmerLocations[kmer];	}
+		
+		return seqs;
+	}
+	catch(exception& e) {
+		errorOut(e, "KmerDB", "getSequencesWithKmer");
+		exit(1);
+	}	
+}
+
+
+/**************************************************************************************************/
+
+
diff --git a/kmerdb.hpp b/kmerdb.hpp
index b64412d..bdd9ca5 100644
--- a/kmerdb.hpp
+++ b/kmerdb.hpp
@@ -32,6 +32,9 @@ public:
 	void addSequence(Sequence);
 	vector<int> findClosestSequences(Sequence*, int);
 	void readKmerDB(ifstream&);
+	int getCount(int);  //returns number of sequences with that kmer number
+	vector<int> getSequencesWithKmer(int);  //returns vector of sequences that contain kmer passed in
+	int getMaxKmer() { return maxKmer; }
 	
 private:
 	
diff --git a/knn.cpp b/knn.cpp
index 6d57fb1..0a62e74 100644
--- a/knn.cpp
+++ b/knn.cpp
@@ -60,18 +60,7 @@ string Knn::findCommonTaxonomy(vector<string> closest)  {
 		
 			string tax = taxonomy[closest[i]];  //we know its there since we checked in getTaxonomy
 		
-			tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
-	
-			//parse taxonomy
-			string individual;
-			while (tax.find_first_of(';') != -1) {
-				individual = tax.substr(0,tax.find_first_of(';'));
-				tax = tax.substr(tax.find_first_of(';')+1, tax.length());
-				taxons[i].push_back(individual);
-			
-			}
-			//get last one
-			taxons[i].push_back(tax);
+			taxons[i] = parseTax(tax);
 		
 			//figure out who has the shortest taxonomy info. so you can start comparing there
 			if (taxons[i].size() < smallest) {
diff --git a/mothur.h b/mothur.h
index 23b9f18..ba88f71 100644
--- a/mothur.h
+++ b/mothur.h
@@ -70,6 +70,9 @@ struct IntNode {
 	int rcoef;
 	IntNode* left;
 	IntNode* right;
+	
+	IntNode(int lv, int rv, IntNode* l, IntNode* r) : lvalue(lv), rvalue(rv), left(l), right(r) {};
+	IntNode() {};
 };
 
 struct ThreadNode {
diff --git a/phylotype.cpp b/phylotype.cpp
deleted file mode 100644
index f7af97a..0000000
--- a/phylotype.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- *  phylotype.cpp
- *  Mothur
- *
- *  Created by westcott on 11/3/09.
- *  Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-#include "phylotype.h"
-
-/**************************************************************************************************/
-PhyloType::PhyloType(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) 
-: Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch)  {}
-/**************************************************************************************************/
-string PhyloType::getTaxonomy(Sequence* seq) {
-	try {
-		string tax;
-		
-		//use database to find closest seq
-		vector<int> closest = database->findClosestSequences(seq, 1);
-		
-		//find that sequences taxonomy in map
-		it = taxonomy.find(names[closest[0]]);
-		
-		//is this sequence in the taxonomy file
-		if (it == taxonomy.end()) { //error not in file
-			mothurOut("Error: sequence " + names[closest[0]] + " is not in the taxonomy file.  It is the closest match to sequence " + seq->getName() + ". " + seq->getName() + " will be disregarded."); mothurOutEndLine();
-			tax = "bad seq";
-		}else {
-			tax = it->second;
-		}
-		
-		return tax;	
-	}
-	catch(exception& e) {
-		errorOut(e, "PhyloType", "getTaxonomy");
-		exit(1);
-	}
-}
-/**************************************************************************************************/
-
diff --git a/phylotype.h b/phylotype.h
deleted file mode 100644
index ee649e3..0000000
--- a/phylotype.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef PHYLOTYPE_H
-#define PHYLOTYPE_H
-
-/*
- *  phylotype.h
- *  Mothur
- *
- *  Created by westcott on 11/3/09.
- *  Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-#include "mothur.h"
-#include "classify.h"
-
-/**************************************************************************************************/
-
-class PhyloType : public Classify {
-	
-public:
-	PhyloType(string, string, string, int, int, int, int, int);
-	~PhyloType() {};
-	
-	string getTaxonomy(Sequence*);
-	
-	
-private:
-	
-};
-
-/**************************************************************************************************/
-
-#endif
-
diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp
index 82a1d24..4eafaf8 100644
--- a/seqsummarycommand.cpp
+++ b/seqsummarycommand.cpp
@@ -120,6 +120,10 @@ int SeqSummaryCommand::execute(){
 		int ptile97_5	= int(numSeqs * 0.975);
 		int ptile100	= numSeqs - 1;
 		
+		//to compensate for blank sequences that would result in startPosition and endPostion equalling -1
+		if (startPosition[0] == -1) {  startPosition[0] = 0;	}
+		if (endPosition[0] == -1)	{  endPosition[0] = 0;		}
+		
 		mothurOutEndLine();
 		mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer"); mothurOutEndLine();
 		mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0])); mothurOutEndLine();
diff --git a/sharedchao1.cpp b/sharedchao1.cpp
index a28f609..c19f032 100644
--- a/sharedchao1.cpp
+++ b/sharedchao1.cpp
@@ -27,8 +27,7 @@ EstOutput SharedChao1::getValues(vector<SharedRAbundVector*> shared){
 		// the coeffient is 2.  Note: we only set the coeffient in f2 values.
 		
 		//create and initialize trees to 0.
-		initialTree(numGroups);	
-		
+		initialTree(numGroups);
 		
 		for (int i = 0; i < shared[0]->size(); i++) {
 			//get bin values and calc shared 
@@ -46,7 +45,7 @@ EstOutput SharedChao1::getValues(vector<SharedRAbundVector*> shared){
 			}
 		}
 
-			
+		
 		//calculate chao1, (numleaves-1) because numleaves contains the ++ values.
 		bool bias = false;
 		for(int i=0;i<numLeaves;i++){
@@ -108,39 +107,30 @@ void SharedChao1::initialTree(int n) {
 		
 		f1leaves.resize(numNodes);
 		f2leaves.resize(numNodes);
-		
+	
 		//initialize leaf values
 		for (int i = 0; i < numLeaves; i++) {
-			f1leaves[i] = new IntNode;
-			f1leaves[i]->lvalue = 0;
-			f1leaves[i]->rvalue = 0;
-			f1leaves[i]->left = NULL;
-			f1leaves[i]->right = NULL;
-			
-			f2leaves[i] = new IntNode;
-			f2leaves[i]->lvalue = 0;
-			f2leaves[i]->rvalue = 0;
-			f2leaves[i]->left = NULL;
-			f2leaves[i]->right = NULL;
+			f1leaves[i] = new IntNode(0, 0, NULL, NULL);
+			f2leaves[i] = new IntNode(0, 0, NULL, NULL);
 		}
 		
 		//set pointers to children
 		for (int j = numLeaves; j < numNodes; j++) {
-			f1leaves[j] = new IntNode;
+			f1leaves[j] = new IntNode();
 			f1leaves[j]->left = f1leaves[countleft];
 			f1leaves[j]->right = f1leaves[countright];
 						
-			f2leaves[j] = new IntNode;
+			f2leaves[j] = new IntNode();
 			f2leaves[j]->left = f2leaves[countleft];
 			f2leaves[j]->right =f2leaves[countright];
-						
+			
 			countleft = countleft + 2;
 			countright = countright + 2;
 		}
 		
 		//point to root
 		f1root = f1leaves[numNodes-1];
-		
+	
 		//point to root
 		f2root = f2leaves[numNodes-1];
 		
@@ -148,6 +138,7 @@ void SharedChao1::initialTree(int n) {
 		setCoef(f2root, 0);
 	}
 	catch(exception& e) {
+		if ((toString(e.what()) == "vector::_M_fill_insert") || (toString(e.what()) == "St9bad_alloc")) { mothurOut("You are using " + toString(n) + " groups which creates 2^" + toString(n+1) + " nodes. Try reducing the number of groups you selected. "); mothurOutEndLine(); exit(1); }
 		errorOut(e, "SharedChao1", "initialTree");
 		exit(1);
 	}
diff --git a/summarysharedcommand.cpp b/summarysharedcommand.cpp
index 575c0de..65fcc9a 100644
--- a/summarysharedcommand.cpp
+++ b/summarysharedcommand.cpp
@@ -48,7 +48,7 @@ SummarySharedCommand::SummarySharedCommand(string option){
 		
 		else {
 			//valid paramters for this command
-			string Array[] =  {"label","calc","groups"};
+			string Array[] =  {"label","calc","groups","all"};
 			vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
 			
 			OptionParser parser(option);
@@ -95,6 +95,9 @@ SummarySharedCommand::SummarySharedCommand(string option){
 				globaldata->Groups = Groups;
 			}
 			
+			string temp = validParameter.validFile(parameters, "all", false);				if (temp == "not found") { temp = "true"; }
+			all = isTrue(temp);
+			
 			if (abort == false) {
 			
 				validCalculator = new ValidCalculators();
@@ -165,7 +168,7 @@ SummarySharedCommand::SummarySharedCommand(string option){
 void SummarySharedCommand::help(){
 	try {
 		mothurOut("The summary.shared command can only be executed after a successful read.otu command.\n");
-		mothurOut("The summary.shared command parameters are label and calc.  No parameters are required.\n");
+		mothurOut("The summary.shared command parameters are label, calc and all.  No parameters are required.\n");
 		mothurOut("The summary.shared command should be in the following format: \n");
 		mothurOut("summary.shared(label=yourLabel, calc=yourEstimators, groups=yourGroups).\n");
 		mothurOut("Example summary.shared(label=unique-.01-.03, groups=B-C, calc=sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan).\n");
@@ -173,6 +176,8 @@ void SummarySharedCommand::help(){
 		mothurOut("The default value for calc is sharedsobs-sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan\n");
 		mothurOut("The default value for groups is all the groups in your groupfile.\n");
 		mothurOut("The label parameter is used to analyze specific labels in your input.\n");
+		mothurOut("The all parameter is used to specify if you want the estimate of all your groups together.  This estimate can only be made for sharedsobs and sharedchao calculators. The default is true.\n");
+		mothurOut("If you use sharedchao and run into memory issues, set all to false. \n");
 		mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed.  You must enter at least 2 valid groups.\n");
 		mothurOut("Note: No spaces between parameter labels (i.e. label), '=' and parameters (i.e.yourLabel).\n\n");
 	}
@@ -202,8 +207,10 @@ int SummarySharedCommand::execute(){
 		if (sumCalculators.size() == 0) { return 0; }
 		//check if any calcs can do multiples
 		else{
-			for (int i = 0; i < sumCalculators.size(); i++) {
-				if (sumCalculators[i]->getMultiple() == true) { mult = true; }
+			if (all){ 
+				for (int i = 0; i < sumCalculators.size(); i++) {
+					if (sumCalculators[i]->getMultiple() == true) { mult = true; }
+				}
 			}
 		}
 		
diff --git a/summarysharedcommand.h b/summarysharedcommand.h
index 63197e1..090c9f4 100644
--- a/summarysharedcommand.h
+++ b/summarysharedcommand.h
@@ -34,7 +34,7 @@ private:
 	InputData* input;
 	ValidCalculators* validCalculator;
 	
-	bool abort, allLines, mult;
+	bool abort, allLines, mult, all;
 	set<string> labels; //holds labels to be used
 	string label, calc, groups;
 	vector<string>  Estimators, Groups;
-- 
2.39.5