From 6f4b9401f7deb8aaf0d87659298308f4138cc3b0 Mon Sep 17 00:00:00 2001
From: westcott <westcott>
Date: Mon, 26 Apr 2010 15:05:21 +0000
Subject: [PATCH] made classifier faster

---
 Mothur.xcodeproj/project.pbxproj |   4 +
 bayesian.cpp                     |  78 +++--
 bellerophon.cpp                  |  25 +-
 chimerabellerophoncommand.cpp    |   2 +-
 chimerapintailcommand.cpp        |   7 +-
 chimeraslayercommand.cpp         |   4 +-
 classify.cpp                     | 550 ++++++++++++++++---------------
 classify.h                       |   8 +-
 classifyseqscommand.cpp          |   8 +-
 getoturepcommand.cpp             |   4 +-
 knn.cpp                          |  12 +-
 makefile                         |  16 +-
 phylotree.cpp                    |  88 ++++-
 phylotree.h                      |   6 +
 rawtrainingdatamaker.cpp         | 188 +++++++++++
 rawtrainingdatamaker.h           |  54 +++
 readdistcommand.cpp              |   2 +-
 17 files changed, 728 insertions(+), 328 deletions(-)
 create mode 100644 rawtrainingdatamaker.cpp
 create mode 100644 rawtrainingdatamaker.h
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj
index 79c1e5b..805bde0 100644
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -12,6 +12,8 @@
 		A747E81C116365E000FB9042 /* chimeraslayercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayercommand.h; sourceTree = "<group>"; };
 		A747E81D116365E000FB9042 /* chimeraslayercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayercommand.cpp; sourceTree = "<group>"; };
 		A7639F8D1175DF35008F5578 /* makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = makefile; sourceTree = "<group>"; };
+		A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rawtrainingdatamaker.h; sourceTree = SOURCE_ROOT; };
+		A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rawtrainingdatamaker.cpp; sourceTree = SOURCE_ROOT; };
 		A78254461164D7790002E2DD /* chimerapintailcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerapintailcommand.h; sourceTree = "<group>"; };
 		A78254471164D7790002E2DD /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerapintailcommand.cpp; sourceTree = "<group>"; };
 		A7825502116519F70002E2DD /* chimerabellerophoncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerabellerophoncommand.h; sourceTree = "<group>"; };
@@ -829,6 +831,8 @@
 				A7DA208A113FECD400BF472F /* knn.h */,
 				A7DA20C4113FECD400BF472F /* phylotree.cpp */,
 				A7DA20C5113FECD400BF472F /* phylotree.h */,
+				A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */,
+				A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */,
 				A7DA215C113FECD400BF472F /* taxonomyequalizer.cpp */,
 				A7DA215D113FECD400BF472F /* taxonomyequalizer.h */,
 			);
diff --git a/bayesian.cpp b/bayesian.cpp
index b007c00..0929749 100644
--- a/bayesian.cpp
+++ b/bayesian.cpp
@@ -9,29 +9,17 @@
 
 #include "bayesian.h"
 #include "kmer.hpp"
+#include "rawtrainingdatamaker.h"
 
 /**************************************************************************************************/
 Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) : 
-Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
+Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
 	try {
 					
-		numKmers = database->getMaxKmer() + 1;
-		
-		//initialze probabilities
-		wordGenusProb.resize(numKmers);
-		
-		genusNodes = phyloTree->getGenusNodes(); 
-		
-		for (int j = 0; j < wordGenusProb.size(); j++) {	wordGenusProb[j].resize(genusNodes.size());		}
-			
-		//reset counts because we are on a new word
-		for (int j = 0; j < genusNodes.size(); j++) {
-			TaxNode temp = phyloTree->get(genusNodes[j]);
-			genusTotals.push_back(temp.accessions.size());
-		}
-
-		
 		/************calculate the probablity that each word will be in a specific taxonomy*************/
+		string phyloTreeName = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.train";
+		ifstream phyloTreeTest(phyloTreeName.c_str());
+		
 		ofstream out;
 		string probFileName = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob";
 		ifstream probFileTest(probFileName.c_str());
@@ -42,15 +30,47 @@ Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), c
 		
 		int start = time(NULL);
 		
-		if(probFileTest && probFileTest2){	
+		if(probFileTest && probFileTest2 && phyloTreeTest){	
+			m->mothurOut("Reading template taxonomy...     "); cout.flush();
+			
+			phyloTree = new PhyloTree(phyloTreeTest);
+	
+			m->mothurOut("DONE."); m->mothurOutEndLine();
+			
+			genusNodes = phyloTree->getGenusNodes(); 
+			genusTotals = phyloTree->getGenusTotals();
+		
 			m->mothurOut("Reading template probabilities...     "); cout.flush();
 			readProbFile(probFileTest, probFileTest2);	
+			
 		}else{
+		
+			//create search database and names vector
+			generateDatabaseAndNames(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0);
+			
+			genusNodes = phyloTree->getGenusNodes(); 
+			genusTotals = phyloTree->getGenusTotals();
+			
+			m->mothurOut("Calculating template taxonomy tree...     "); cout.flush();
+			
+			phyloTree->printTreeNodes(phyloTreeName);
+						
+			m->mothurOut("DONE."); m->mothurOutEndLine();
+			
 			m->mothurOut("Calculating template probabilities...     "); cout.flush();
+			
+			numKmers = database->getMaxKmer() + 1;
+		
+			//initialze probabilities
+			wordGenusProb.resize(numKmers);
+		
+			for (int j = 0; j < wordGenusProb.size(); j++) {	wordGenusProb[j].resize(genusNodes.size());		}
 
 			ofstream out;
 			openOutputFile(probFileName, out);
 			
+			out << numKmers << endl;
+			
 			ofstream out2;
 			openOutputFile(probFileName2, out2);
 			
@@ -86,9 +106,14 @@ Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), c
 			
 			out.close();
 			out2.close();
+			
+			//read in new phylotree with less info. - its faster
+			ifstream phyloTreeTest(phyloTreeName.c_str());
+			delete phyloTree;
+			
+			phyloTree = new PhyloTree(phyloTreeTest);
 		}
-		
-		
+	
 		m->mothurOut("DONE."); m->mothurOutEndLine();
 		m->mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); m->mothurOutEndLine();
 	}
@@ -113,14 +138,12 @@ string Bayesian::getTaxonomy(Sequence* seq) {
 		for (int i = 0; i < queryKmerString.length(); i++) {
 			if (queryKmerString[i] != '!') { //this kmer is in the query
 				queryKmers.push_back(i);
- 
 			}
 		}
 		
 		if (queryKmers.size() == 0) {  m->mothurOut(seq->getName() + "is bad."); m->mothurOutEndLine(); return "bad seq"; }
 		
 		int index = getMostProbableTaxonomy(queryKmers);
-
 		
 		if (m->control_pressed) { return tax; }
 					
@@ -272,18 +295,25 @@ map<string, int> Bayesian::parseTaxMap(string newTax) {
 void Bayesian::readProbFile(ifstream& in, ifstream& inNum) {
 	try{
 		
+		in >> numKmers; gobble(in);
+		
+		//initialze probabilities
+		wordGenusProb.resize(numKmers);
+		
+		for (int j = 0; j < wordGenusProb.size(); j++) {	wordGenusProb[j].resize(genusNodes.size());		}
+		
 		int kmer, name, count;  count = 0;
 		vector<int> num; num.resize(numKmers);
 		float prob;
 		vector<float> zeroCountProb; zeroCountProb.resize(numKmers);		
-		
+	
 		while (inNum) {
 			inNum >> zeroCountProb[count] >> num[count];  
 			count++;
 			gobble(inNum);
 		}
 		inNum.close();
-		
+	
 		while(in) {
 			in >> kmer;
 			
diff --git a/bellerophon.cpp b/bellerophon.cpp
index e33230e..54025eb 100644
--- a/bellerophon.cpp
+++ b/bellerophon.cpp
@@ -132,7 +132,7 @@ int Bellerophon::print(ostream& out, ostream& outAcc) {
 //***************************************************************************************************************
 int Bellerophon::print(MPI_File& out, MPI_File& outAcc) {
 	try {
-		
+	
 		int pid;
 		MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 		
@@ -148,17 +148,26 @@ int Bellerophon::print(MPI_File& out, MPI_File& outAcc) {
 
 			if (m->control_pressed) { return numSeqs; }
 			
-			outString += "Name\tScore\tLeft\tRight\n";
+			outString = "Name\tScore\tLeft\tRight\n";
+			MPI_Status status;
+			int olength = outString.length();
+			char* buf5 = new char[olength];
+			memcpy(buf5, outString.c_str(), olength);
+					
+			MPI_File_write_shared(out, buf5, olength, MPI_CHAR, &status);
+			
+			delete buf5;
+
 			//output prefenence structure to .chimeras file
 			for (int i = 0; i < best.size(); i++) {
 				
 				if (m->control_pressed) {  return numSeqs; }
 				
-				outString += best[i].name + "\t" +  toString(best[i].score) + "\t" + best[i].leftParent + "\t" + best[i].rightParent + "\n";
-				
+				outString = best[i].name + "\t" +  toString(best[i].score) + "\t" + best[i].leftParent + "\t" + best[i].rightParent + "\n";
+			
 				MPI_Status status;
 				int length = outString.length();
-				char* buf2 = new char[length];
+				char* buf2 = new char[length];
 				memcpy(buf2, outString.c_str(), length);
 					
 				MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
@@ -168,12 +177,12 @@ int Bellerophon::print(MPI_File& out, MPI_File& outAcc) {
 				//calc # of seqs with preference above 95%tile
 				if (best[i].score >= cutoffScore) { 
 					above1++; 
-					string outAccString;
+					string outAccString = "";
 					 outAccString += best[i].name + "\n";
 					
 					MPI_Status statusAcc;
 					length = outAccString.length();
-					char* buf = new char[length];
+					char* buf = new char[length];
 					memcpy(buf, outAccString.c_str(), length);
 					
 					MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
@@ -291,7 +300,7 @@ int Bellerophon::getChimeras() {
 				if (m->control_pressed) { return 0; }
 				
 				int bestLength = MPIBestSend[i].length();
-				char* buf = new char[bestLength];
+				char* buf = new char[bestLength];
 				memcpy(buf, MPIBestSend[i].c_str(), bestLength);
 				
 				MPI_Send(&bestLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD);
diff --git a/chimerabellerophoncommand.cpp b/chimerabellerophoncommand.cpp
index 67f6b57..668956f 100644
--- a/chimerabellerophoncommand.cpp
+++ b/chimerabellerophoncommand.cpp
@@ -90,7 +90,7 @@ void ChimeraBellerophonCommand::help(){
 		m->mothurOut("The chimera.bellerophon command reads a fastafile and creates list of potentially chimeric sequences.\n");
 		m->mothurOut("The chimera.bellerophon command parameters are fasta, filter, correction, processors, window, increment. The fasta parameter is required.\n");
 		m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter, default=false. \n");
-		m->mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.\n");
+		m->mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs, default=true.\n");
 		m->mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
 		#ifdef USE_MPI
 		m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
diff --git a/chimerapintailcommand.cpp b/chimerapintailcommand.cpp
index 384c885..62f1f2b 100644
--- a/chimerapintailcommand.cpp
+++ b/chimerapintailcommand.cpp
@@ -147,7 +147,7 @@ void ChimeraPintailCommand::help(){
 		#ifdef USE_MPI
 		m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
 		#endif
-		m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=1/4 sequence length. \n");
+		m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=300. \n");
 		m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n");
 		m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n");
 		m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences, if you use the filter the quantile file generated becomes unique to the fasta file you used.\n");
@@ -175,12 +175,11 @@ int ChimeraPintailCommand::execute(){
 		
 		int start = time(NULL);	
 		
-		chimera = new Pintail(fastafile, templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
-		
 		//set user options
 		if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine();  }
 		
-
+		chimera = new Pintail(fastafile, templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
+		
 		string outputFileName, accnosFileName;
 		if (maskfile != "") {
 			outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".pintail.chimeras";
diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp
index 8bb8c2c..58aecf5 100644
--- a/chimeraslayercommand.cpp
+++ b/chimeraslayercommand.cpp
@@ -137,14 +137,13 @@ void ChimeraSlayerCommand::help(){
 	
 		m->mothurOut("The chimera.slayer command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
 		m->mothurOut("This command was modeled after the chimeraSlayer written by the Broad Institute.\n");
-		m->mothurOut("The chimera.slayer command parameters are fasta, template, filter, mask, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign,
+		m->mothurOut("The chimera.slayer command parameters are fasta, template, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign,
 		m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
 		m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
 		m->mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
 		#ifdef USE_MPI
 		m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
 		#endif
-		m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences. \n");
 		m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=50. \n");
 		m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=5.\n");
 		m->mothurOut("The numwanted parameter allows you to specify how many sequences you would each query sequence compared with, default=15.\n");
@@ -160,7 +159,6 @@ void ChimeraSlayerCommand::help(){
 		m->mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n");
 		m->mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. \n");
 		//m->mothurOut("The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default false. Found to make results worse. \n");
-		m->mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n");
 		m->mothurOut("The chimera.slayer command should be in the following format: \n");
 		m->mothurOut("chimera.slayer(fasta=yourFastaFile, template=yourTemplate, search=yourSearch) \n");
 		m->mothurOut("Example: chimera.slayer(fasta=AD.align, template=core_set_aligned.imputed.fasta, search=kmer) \n");
diff --git a/classify.cpp b/classify.cpp
index c3e52ab..be287a0 100644
--- a/classify.cpp
+++ b/classify.cpp
@@ -1,272 +1,278 @@
-/*
- *  classify.cpp
- *  Mothur
- *
- *  Created by westcott on 11/3/09.
- *  Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-#include "classify.h"
-#include "sequence.hpp"
-#include "kmerdb.hpp"
-#include "suffixdb.hpp"
-#include "blastdb.hpp"
-#include "distancedb.hpp"
-
-/**************************************************************************************************/
-Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {		
-	try {	
-		m = MothurOut::getInstance();									
-		readTaxonomy(taxFile);	
-		
-		int start = time(NULL);
-		int numSeqs = 0;
-		
-		m->mothurOut("Generating search database...    "); cout.flush();
-#ifdef USE_MPI	
-			int pid;
-			vector<long> positions;
-		
-			MPI_Status status; 
-			MPI_File inMPI;
-			MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
-
-			//char* inFileName = new char[tempFile.length()];
-			//memcpy(inFileName, tempFile.c_str(), tempFile.length());
-			
-			char inFileName[1024];
-			strcpy(inFileName, tempFile.c_str());
-
-			MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
-			//delete inFileName;
-
-			if (pid == 0) { //only one process needs to scan file
-				positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
-
-				//send file positions to all processes
-				MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
-				MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos	
-			}else{
-				MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
-				positions.resize(numSeqs);
-				MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
-			}
-			
-			//create database
-			if(method == "kmer")			{	database = new KmerDB(tempFile, kmerSize);			}
-			else if(method == "suffix")		{	database = new SuffixDB(numSeqs);								}
-			else if(method == "blast")		{	database = new BlastDB(gapOpen, gapExtend, match, misMatch);	}
-			else if(method == "distance")	{	database = new DistanceDB();	}
-			else {
-				m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
-				database = new KmerDB(tempFile, 8);
-			}
-
-			//read file 
-			for(int i=0;i<numSeqs;i++){
-				//read next sequence
-				int length = positions[i+1] - positions[i];
-				char* buf4 = new char[length];
-				MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
-				
-				string tempBuf = buf4;
-				if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
-				delete buf4;
-				istringstream iss (tempBuf,istringstream::in);
-				
-				Sequence temp(iss);  
-				if (temp.getName() != "") {
-					names.push_back(temp.getName());
-					database->addSequence(temp);	
-				}
-			}
-			
-			database->generateDB();
-			MPI_File_close(&inMPI);
-	#else
-		
-		//need to know number of template seqs for suffixdb
-		if (method == "suffix") {
-			ifstream inFASTA;
-			openInputFile(tempFile, inFASTA);
-			numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
-			inFASTA.close();
-		}
-
-		bool needToGenerate = true;
-		string kmerDBName;
-		if(method == "kmer")			{	
-			database = new KmerDB(tempFile, kmerSize);			
-			
-			kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
-			ifstream kmerFileTest(kmerDBName.c_str());
-			if(kmerFileTest){	needToGenerate = false;		}
-		}
-		else if(method == "suffix")		{	database = new SuffixDB(numSeqs);								}
-		else if(method == "blast")		{	database = new BlastDB(gapOpen, gapExtend, match, misMatch);	}
-		else if(method == "distance")	{	database = new DistanceDB();	}
-		else {
-			m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
-			m->mothurOutEndLine();
-			database = new KmerDB(tempFile, 8);
-		}
-		
-		if (needToGenerate) {
-			ifstream fastaFile;
-			openInputFile(tempFile, fastaFile);
-			
-			while (!fastaFile.eof()) {
-				Sequence temp(fastaFile);
-				gobble(fastaFile);
-			
-				names.push_back(temp.getName());
-								
-				database->addSequence(temp);	
-			}
-			fastaFile.close();
-
-			database->generateDB();
-			
-		}else if ((method == "kmer") && (!needToGenerate)) {	
-			ifstream kmerFileTest(kmerDBName.c_str());
-			database->readKmerDB(kmerFileTest);	
-			
-			ifstream fastaFile;
-			openInputFile(tempFile, fastaFile);
-			
-			while (!fastaFile.eof()) {
-				Sequence temp(fastaFile);
-				gobble(fastaFile);
-			
-				names.push_back(temp.getName());
-			}
-			fastaFile.close();
-		}
-#endif		
-		database->setNumSeqs(names.size());
-		
-		m->mothurOut("DONE."); m->mothurOutEndLine();
-		m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
-
-	}
-	catch(exception& e) {
-		m->errorOut(e, "Classify", "Classify");
-		exit(1);
-	}
-}
-/**************************************************************************************************/
-
-void Classify::readTaxonomy(string file) {
-	try {
-		
-		phyloTree = new PhyloTree();
-		string name, taxInfo;
-		
-		m->mothurOutEndLine();
-		m->mothurOut("Reading in the " + file + " taxonomy...\t");	cout.flush();
-
-#ifdef USE_MPI	
-		int pid, num;
-		vector<long> positions;
-		
-		MPI_Status status; 
-		MPI_File inMPI;
-		MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
-
-		//char* inFileName = new char[file.length()];
-		//memcpy(inFileName, file.c_str(), file.length());
-		
-		char inFileName[1024];
-		strcpy(inFileName, file.c_str());
-
-		MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
-		//delete inFileName;
-
-		if (pid == 0) {
-			positions = setFilePosEachLine(file, num);
-			
-			//send file positions to all processes
-			MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
-			MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos	
-		}else{
-			MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
-			positions.resize(num);
-			MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
-		}
-	
-		//read file 
-		for(int i=0;i<num;i++){
-			//read next sequence
-			int length = positions[i+1] - positions[i];
-			char* buf4 = new char[length];
-
-			MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
-
-			string tempBuf = buf4;
-			if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
-			delete buf4;
-
-			istringstream iss (tempBuf,istringstream::in);
-			iss >> name >> taxInfo;
-			taxonomy[name] = taxInfo;
-			phyloTree->addSeqToTree(name, taxInfo);
-		}
-		
-		MPI_File_close(&inMPI);
-#else				
-		ifstream inTax;
-		openInputFile(file, inTax);
-	
-		//read template seqs and save
-		while (!inTax.eof()) {
-			inTax >> name >> taxInfo;
-			
-			taxonomy[name] = taxInfo;
-			
-			phyloTree->addSeqToTree(name, taxInfo);
-		
-			gobble(inTax);
-		}
-		inTax.close();
-#endif	
-	
-		phyloTree->assignHeirarchyIDs(0);
-		
-		m->mothurOut("DONE.");
-		m->mothurOutEndLine();	cout.flush();
-	
-	}
-	catch(exception& e) {
-		m->errorOut(e, "Classify", "readTaxonomy");
-		exit(1);
-	}
-}
-/**************************************************************************************************/
-
-vector<string> Classify::parseTax(string tax) {
-	try {
-		vector<string> taxons;
-		
-		tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
-	
-		//parse taxonomy
-		string individual;
-		while (tax.find_first_of(';') != -1) {
-			individual = tax.substr(0,tax.find_first_of(';'));
-			tax = tax.substr(tax.find_first_of(';')+1, tax.length());
-			taxons.push_back(individual);
-			
-		}
-		//get last one
-		taxons.push_back(tax);
-		
-		return taxons;
-	}
-	catch(exception& e) {
-		m->errorOut(e, "Classify", "parseTax");
-		exit(1);
-	}
-}
-/**************************************************************************************************/
-
+/*
+ *  classify.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 11/3/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "classify.h"
+#include "sequence.hpp"
+#include "kmerdb.hpp"
+#include "suffixdb.hpp"
+#include "blastdb.hpp"
+#include "distancedb.hpp"
+
+/**************************************************************************************************/
+void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch)  {		
+	try {	
+		taxFile = tfile;
+		readTaxonomy(taxFile);	
+		
+		templateFile = tempFile;	
+		
+		int start = time(NULL);
+		int numSeqs = 0;
+		
+		m->mothurOut("Generating search database...    "); cout.flush();
+#ifdef USE_MPI	
+			int pid;
+			vector<long> positions;
+		
+			MPI_Status status; 
+			MPI_File inMPI;
+			MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+			//char* inFileName = new char[tempFile.length()];
+			//memcpy(inFileName, tempFile.c_str(), tempFile.length());
+			
+			char inFileName[1024];
+			strcpy(inFileName, tempFile.c_str());
+
+			MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+			//delete inFileName;
+
+			if (pid == 0) { //only one process needs to scan file
+				positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
+
+				//send file positions to all processes
+				MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
+				MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos	
+			}else{
+				MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+				positions.resize(numSeqs);
+				MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+			}
+			
+			//create database
+			if(method == "kmer")			{	database = new KmerDB(tempFile, kmerSize);			}
+			else if(method == "suffix")		{	database = new SuffixDB(numSeqs);								}
+			else if(method == "blast")		{	database = new BlastDB(gapOpen, gapExtend, match, misMatch);	}
+			else if(method == "distance")	{	database = new DistanceDB();	}
+			else {
+				m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
+				database = new KmerDB(tempFile, 8);
+			}
+
+			//read file 
+			for(int i=0;i<numSeqs;i++){
+				//read next sequence
+				int length = positions[i+1] - positions[i];
+				char* buf4 = new char[length];
+				MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+				
+				string tempBuf = buf4;
+				if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+				delete buf4;
+				istringstream iss (tempBuf,istringstream::in);
+				
+				Sequence temp(iss);  
+				if (temp.getName() != "") {
+					names.push_back(temp.getName());
+					database->addSequence(temp);	
+				}
+			}
+			
+			database->generateDB();
+			MPI_File_close(&inMPI);
+	#else
+		
+		//need to know number of template seqs for suffixdb
+		if (method == "suffix") {
+			ifstream inFASTA;
+			openInputFile(tempFile, inFASTA);
+			numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+			inFASTA.close();
+		}
+
+		bool needToGenerate = true;
+		string kmerDBName;
+		if(method == "kmer")			{	
+			database = new KmerDB(tempFile, kmerSize);			
+			
+			kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+			ifstream kmerFileTest(kmerDBName.c_str());
+			if(kmerFileTest){	needToGenerate = false;		}
+		}
+		else if(method == "suffix")		{	database = new SuffixDB(numSeqs);								}
+		else if(method == "blast")		{	database = new BlastDB(gapOpen, gapExtend, match, misMatch);	}
+		else if(method == "distance")	{	database = new DistanceDB();	}
+		else {
+			m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
+			m->mothurOutEndLine();
+			database = new KmerDB(tempFile, 8);
+		}
+		
+		if (needToGenerate) {
+			ifstream fastaFile;
+			openInputFile(tempFile, fastaFile);
+			
+			while (!fastaFile.eof()) {
+				Sequence temp(fastaFile);
+				gobble(fastaFile);
+			
+				names.push_back(temp.getName());
+								
+				database->addSequence(temp);	
+			}
+			fastaFile.close();
+
+			database->generateDB();
+			
+		}else if ((method == "kmer") && (!needToGenerate)) {	
+			ifstream kmerFileTest(kmerDBName.c_str());
+			database->readKmerDB(kmerFileTest);	
+			
+			ifstream fastaFile;
+			openInputFile(tempFile, fastaFile);
+			
+			while (!fastaFile.eof()) {
+				Sequence temp(fastaFile);
+				gobble(fastaFile);
+
+				names.push_back(temp.getName());
+			}
+			fastaFile.close();
+		}
+#endif		
+		database->setNumSeqs(names.size());
+		
+		m->mothurOut("DONE."); m->mothurOutEndLine();
+		m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
+
+	}
+	catch(exception& e) {
+		m->errorOut(e, "Classify", "generateDatabaseAndNames");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+Classify::Classify() {		m = MothurOut::getInstance();	database = NULL;	}
+/**************************************************************************************************/
+
+int Classify::readTaxonomy(string file) {
+	try {
+		
+		phyloTree = new PhyloTree();
+		string name, taxInfo;
+		
+		m->mothurOutEndLine();
+		m->mothurOut("Reading in the " + file + " taxonomy...\t");	cout.flush();
+
+#ifdef USE_MPI	
+		int pid, num;
+		vector<long> positions;
+		
+		MPI_Status status; 
+		MPI_File inMPI;
+		MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+		//char* inFileName = new char[file.length()];
+		//memcpy(inFileName, file.c_str(), file.length());
+		
+		char inFileName[1024];
+		strcpy(inFileName, file.c_str());
+
+		MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+		//delete inFileName;
+
+		if (pid == 0) {
+			positions = setFilePosEachLine(file, num);
+			
+			//send file positions to all processes
+			MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
+			MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos	
+		}else{
+			MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+			positions.resize(num);
+			MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+		}
+	
+		//read file 
+		for(int i=0;i<num;i++){
+			//read next sequence
+			int length = positions[i+1] - positions[i];
+			char* buf4 = new char[length];
+
+			MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+
+			string tempBuf = buf4;
+			if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+			delete buf4;
+
+			istringstream iss (tempBuf,istringstream::in);
+			iss >> name >> taxInfo;
+			taxonomy[name] = taxInfo;
+			phyloTree->addSeqToTree(name, taxInfo);
+		}
+		
+		MPI_File_close(&inMPI);
+#else				
+		ifstream inTax;
+		openInputFile(file, inTax);
+	
+		//read template seqs and save
+		while (!inTax.eof()) {
+			inTax >> name >> taxInfo;
+			
+			taxonomy[name] = taxInfo;
+			
+			phyloTree->addSeqToTree(name, taxInfo);
+		
+			gobble(inTax);
+		}
+		inTax.close();
+#endif	
+	
+		phyloTree->assignHeirarchyIDs(0);
+		
+		m->mothurOut("DONE.");
+		m->mothurOutEndLine();	cout.flush();
+		
+		return phyloTree->getNumSeqs();
+	
+	}
+	catch(exception& e) {
+		m->errorOut(e, "Classify", "readTaxonomy");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
+vector<string> Classify::parseTax(string tax) {
+	try {
+		vector<string> taxons;
+		
+		tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
+	
+		//parse taxonomy
+		string individual;
+		while (tax.find_first_of(';') != -1) {
+			individual = tax.substr(0,tax.find_first_of(';'));
+			tax = tax.substr(tax.find_first_of(';')+1, tax.length());
+			taxons.push_back(individual);
+			
+		}
+		//get last one
+		taxons.push_back(tax);
+		
+		return taxons;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "Classify", "parseTax");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
diff --git a/classify.h b/classify.h
index 1a34362..a2401dd 100644
--- a/classify.h
+++ b/classify.h
@@ -26,13 +26,14 @@ class Sequence;
 class Classify {
 
 public:
-	Classify(string, string, string, int, float, float, float, float);
+	Classify();
 	
-	virtual ~Classify(){  delete phyloTree; delete database;  };
+	virtual ~Classify(){  delete phyloTree; if (database != NULL) {  delete database; } };
 	virtual string getTaxonomy(Sequence*) = 0;
 	//virtual map<string, int> getConfidenceScores() { return taxConfidenceScore; }
 	//virtual vector<string> parseTax(string);
 	virtual string getSimpleTax()  { return simpleTax;	}
+	virtual void generateDatabaseAndNames(string, string, string, int, float, float, float, float);
 	
 protected:
 
@@ -46,9 +47,10 @@ protected:
 	string taxFile, templateFile, simpleTax;
 	vector<string> names;
 	
-	void readTaxonomy(string);
+	int readTaxonomy(string);
 	vector<string> parseTax(string);
 	MothurOut* m;
+	
 };
 
 /**************************************************************************************************/
diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp
index bed6580..a38bf8f 100644
--- a/classifyseqscommand.cpp
+++ b/classifyseqscommand.cpp
@@ -487,6 +487,10 @@ int ClassifySeqsCommand::execute(){
 			if (pid == 0) {  //this part does not need to be paralellized
 		#endif
 
+			m->mothurOutEndLine();
+			m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
+			start = time(NULL);
+			
 			//make taxonomy tree from new taxonomy file 
 			PhyloTree taxaBrowser;
 			
@@ -570,8 +574,8 @@ int ClassifySeqsCommand::execute(){
 			m->mothurOutEndLine();
 
 			
-			m->mothurOutEndLine();
-			m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
+			//m->mothurOutEndLine();
+			//m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for  " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
 		}
 		
 		delete classify;
diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp
index e9373a9..2c62efd 100644
--- a/getoturepcommand.cpp
+++ b/getoturepcommand.cpp
@@ -327,7 +327,7 @@ int GetOTURepCommand::execute(){
 		
 		if (m->control_pressed) { 
 			if (large) {  inRow.close(); remove(distFile.c_str());  }
-			delete groupMap; delete fasta; return 0; 
+			delete fasta; return 0; 
 		}
 				
 		//if user gave a namesfile then use it
@@ -350,7 +350,7 @@ int GetOTURepCommand::execute(){
 		
 		if (m->control_pressed) { 
 			if (large) {  inRow.close(); remove(distFile.c_str());  }
-			delete groupMap; delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
+			delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
 		}
 	
 		while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
diff --git a/knn.cpp b/knn.cpp
index 93a7aa3..64b5b3a 100644
--- a/knn.cpp
+++ b/knn.cpp
@@ -11,14 +11,22 @@
 
 /**************************************************************************************************/
 Knn::Knn(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch, int n) 
-: Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch), num(n)  {}
+: Classify(), num(n)  {
+	try {
+		//create search database and names vector
+		generateDatabaseAndNames(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch);
+	}
+	catch(exception& e) {
+		m->errorOut(e, "Knn", "Knn");
+		exit(1);
+	}
+}
 /**************************************************************************************************/
 string Knn::getTaxonomy(Sequence* seq) {
 	try {
 		string tax;
 		
 		//use database to find closest seq
-
 		vector<int> closest = database->findClosestSequences(seq, num);
 		
 		if (m->control_pressed) { return tax; }
diff --git a/makefile b/makefile
index d62f029..5812d45 100644
--- a/makefile
+++ b/makefile
@@ -26,7 +26,7 @@ ifeq  ($(strip $(USEREADLINE)),yes)
       -L../readline-6.0
 endif
 
-USEMPI ?= yes
+USEMPI ?= no
 
 ifeq  ($(strip $(USEMPI)),yes)
     CC_OPTIONS += -DUSE_MPI
@@ -219,7 +219,8 @@ mothur : \
 		./parsesffcommand.o\
 		./classify.o\
 		./phylotree.o\
-		./bayesian.o\
+		./bayesian.o\
+		./rawtrainingdatamaker.o\
 		./alignmentdb.o\
 		./knn.o\
 		./distancedb.o\
@@ -416,7 +417,8 @@ mothur : \
 		./parsesffcommand.o\
 		./classify.o\
 		./phylotree.o\
-		./bayesian.o\
+		./bayesian.o\
+		./rawtrainingdatamaker.o\
 		./alignmentdb.o\
 		./knn.o\
 		./distancedb.o\
@@ -616,7 +618,8 @@ clean :
 		./parsesffcommand.o\
 		./classify.o\
 		./phylotree.o\
-		./bayesian.o\
+		./bayesian.o\
+		./rawtrainingdatamaker.o\
 		./alignmentdb.o\
 		./knn.o\
 		./distancedb.o\
@@ -1619,6 +1622,11 @@ install : mothur
 # Item # 196 -- chimerabellerophoncommand --
 ./chimerabellerophoncommand.o : chimerabellerophoncommand.cpp
 	$(CC) $(CC_OPTIONS) chimerabellerophoncommand.cpp -c $(INCLUDE) -o ./chimerabellerophoncommand.o
+
+# Item # 171 -- rawtrainingdatamaker --
+./rawtrainingdatamaker.o : rawtrainingdatamaker.cpp
+	$(CC) $(CC_OPTIONS) rawtrainingdatamaker.cpp -c $(INCLUDE) -o ./rawtrainingdatamaker.o
+
 
 
 ##### END RUN ####
diff --git a/phylotree.cpp b/phylotree.cpp
index 3cdd6ac..33aedc6 100644
--- a/phylotree.cpp
+++ b/phylotree.cpp
@@ -19,6 +19,44 @@ PhyloTree::PhyloTree(){
 		tree.push_back(TaxNode("Root"));
 		tree[0].heirarchyID = "0";
 		maxLevel = 0;
+		calcTotals = true;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "PhyloTree", "PhyloTree");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
+PhyloTree::PhyloTree(ifstream& in){
+	try {
+		m = MothurOut::getInstance();
+		calcTotals = false;
+		
+		in >> numNodes; gobble(in);
+		
+		tree.resize(numNodes);
+		
+		for (int i = 0; i < tree.size(); i++) {
+			in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
+ 
+		}
+		
+		//read genus nodes
+		int numGenus = 0;
+		in >> numGenus; gobble(in);
+	
+		int gnode, gsize;
+		totals.clear();
+		for (int i = 0; i < numGenus; i++) {
+			in >> gnode >> gsize; gobble(in);
+			
+			uniqueTaxonomies[gnode] = gnode;
+			totals.push_back(gsize);
+		}
+			
+		in.close();
+		
 	}
 	catch(exception& e) {
 		m->errorOut(e, "PhyloTree", "PhyloTree");
@@ -35,6 +73,7 @@ PhyloTree::PhyloTree(string tfile){
 		tree.push_back(TaxNode("Root"));
 		tree[0].heirarchyID = "0";
 		maxLevel = 0;
+		calcTotals = true;
 		
 		ifstream in;
 		openInputFile(tfile, in);
@@ -149,6 +188,27 @@ vector<int> PhyloTree::getGenusNodes()	{
 	}
 }
 /**************************************************************************************************/
+vector<int> PhyloTree::getGenusTotals()	{
+	try {
+	
+		if (calcTotals) {
+			totals.clear();
+			//reset counts because we are on a new word
+			for (int j = 0; j < genusIndex.size(); j++) {
+				totals.push_back(tree[genusIndex[j]].accessions.size());
+			}
+			return totals;
+		}else{
+			return totals;
+		}
+		
+	}
+	catch(exception& e) {
+		m->errorOut(e, "PhyloTree", "getGenusNodes");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
 
 void PhyloTree::assignHeirarchyIDs(int index){
 	try {
@@ -266,15 +326,39 @@ void PhyloTree::print(int i, ofstream& out){
 			out <<tree[it->second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl;
 			print(it->second, out);
 		}
+	}
+	catch(exception& e) {
+		m->errorOut(e, "PhyloTree", "print");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+void PhyloTree::printTreeNodes(string treefilename) {
+	try {
+		ofstream outTree;
+		openOutputFile(treefilename, outTree);
+		
+		//print treenodes
+		outTree << tree.size() << endl;
+		for (int i = 0; i < tree.size(); i++) {
+			outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
+		}
+		
+		//print genus nodes
+		outTree << endl << uniqueTaxonomies.size() << endl;
+		map<int, int>::iterator it2;
+		for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;	}
+		outTree << endl;
 		
 		
+		outTree.close();
+		
 	}
 	catch(exception& e) {
-		m->errorOut(e, "PhyloTree", "print");
+		m->errorOut(e, "PhyloTree", "printTreeNodes");
 		exit(1);
 	}
 }
-
 /**************************************************************************************************/
 
 
diff --git a/phylotree.h b/phylotree.h
index 75abffc..b65a651 100644
--- a/phylotree.h
+++ b/phylotree.h
@@ -32,11 +32,14 @@ class PhyloTree {
 public:
 	PhyloTree();
 	PhyloTree(string);  //pass it a taxonomy file and it makes the tree
+	PhyloTree(ifstream&);  //pass it a taxonomy file and it makes the train.tree
 	~PhyloTree() {};
 	int addSeqToTree(string, string);
 	void assignHeirarchyIDs(int);
 	void print(ofstream&);
+	void printTreeNodes(string); //used by bayesian to save time
 	vector<int> getGenusNodes();
+	vector<int> getGenusTotals();	
 	void binUnclassified();
 		
 	TaxNode get(int i)				{	return tree[i];							}
@@ -45,17 +48,20 @@ public:
 	string getName(int i)			{	return tree[i].name;					}
 	string getFullTaxonomy(string);	 //pass a sequence name return taxonomy
 	int getMaxLevel()				{	return maxLevel;						}
+	int getNumSeqs()  {  return numSeqs;  }
 	
 private:
 	string getNextTaxon(string&);
 	vector<TaxNode> tree;
 	vector<int> genusIndex; //holds the indexes in tree where the genus level taxonomies are stored
+	vector<int> totals; //holds the numSeqs at each genus level taxonomy
 	map<string, int> name2Taxonomy;  //maps name to index in tree
 	map<int, int> uniqueTaxonomies;  //map of unique taxonomies
 	void print(int, ofstream&);
 	int numNodes;
 	int numSeqs;
 	int maxLevel;
+	bool calcTotals;
 	MothurOut* m;
 };
 
diff --git a/rawtrainingdatamaker.cpp b/rawtrainingdatamaker.cpp
new file mode 100644
index 0000000..362cdb7
--- /dev/null
+++ b/rawtrainingdatamaker.cpp
@@ -0,0 +1,188 @@
+/*
+ *  rawTrainingDataMaker.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 4/21/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "rawtrainingdatamaker.h"
+
+/**************************************************************************************************/
+
+RawTrainingDataMaker::RawTrainingDataMaker(){
+	try {
+		m = MothurOut::getInstance();
+		numNodes = 1;
+		numSeqs = 0;
+		tree.push_back(rawTaxNode("Root"));
+		tree[0].rank = "Root";
+		maxLevel = 0;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
+RawTrainingDataMaker::RawTrainingDataMaker(string tfile){
+	try {
+		m = MothurOut::getInstance();
+		numNodes = 1;
+		numSeqs = 0;
+		tree.push_back(rawTaxNode("Root"));
+		tree[0].rank = "Root";
+		maxLevel = 0;
+		
+		ifstream in;
+		openInputFile(tfile, in);
+		
+		//read in users taxonomy file and add sequences to tree
+		string name, tax;
+		while(!in.eof()){
+			in >> name >> tax; gobble(in);
+			
+			addSeqToTree(name, tax);
+		}
+		in.close();
+	
+		assignRank(0);
+	}
+	catch(exception& e) {
+		m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
+		exit(1);
+	}
+}
+
+/**************************************************************************************************/
+
+string RawTrainingDataMaker::getNextTaxon(string& heirarchy){
+	try {
+		string currentLevel = "";
+		if(heirarchy != ""){
+			int pos = heirarchy.find_first_of(';');
+			currentLevel=heirarchy.substr(0,pos);
+			if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
+			else { heirarchy = ""; }
+		}
+		return currentLevel;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "RawTrainingDataMaker", "getNextTaxon");
+		exit(1);
+	}
+}
+
+/**************************************************************************************************/
+
+int RawTrainingDataMaker::addSeqToTree(string seqName, string seqTaxonomy){
+	try {
+		numSeqs++;
+		
+		map<string, int>::iterator childPointer;
+		
+		int currentNode = 0;
+		string taxon;
+		
+		while(seqTaxonomy != ""){
+			
+			if (m->control_pressed) { return 0; }
+			
+			//somehow the parent is getting one too many accnos
+			//use print to reassign the taxa id
+			taxon = getNextTaxon(seqTaxonomy);
+			
+			childPointer = tree[currentNode].children.find(taxon);
+			
+			if(childPointer != tree[currentNode].children.end()){	//if the node already exists, move on
+				currentNode = childPointer->second;
+			}else{											//otherwise, create it
+				tree.push_back(rawTaxNode(taxon));
+				numNodes++;
+				tree[currentNode].children[taxon] = numNodes-1;
+				tree[numNodes-1].parent = currentNode;
+				
+				currentNode = tree[currentNode].children[taxon];
+			}
+		}
+
+	}
+	catch(exception& e) {
+		m->errorOut(e, "RawTrainingDataMaker", "addSeqToTree");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
+void RawTrainingDataMaker::assignRank(int index){
+	try {
+		map<string,int>::iterator it;
+				
+		string ranks[9] = { "Root","Domain","Kingdom","Phylum","Class","Order","Family","Genus","Species" };
+		
+		for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
+			tree[it->second].level = tree[index].level + 1;
+			
+			if (tree[it->second].level > 8) { 
+				tree[it->second].rank = ("unknown" + toString(tree[it->second].level));
+			}else {
+				tree[it->second].rank = ranks[tree[it->second].level];
+			}
+						
+			//save maxLevel for binning the unclassified seqs
+			if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } 
+			
+			assignRank(it->second);
+		}
+	}
+	catch(exception& e) {
+		m->errorOut(e, "RawTrainingDataMaker", "assignRank");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
+void RawTrainingDataMaker::print(ofstream& out){
+	try {
+		//string temp = tree[0].name +" " + tree[0].rank;
+		//sanityCheck[temp] = temp;
+		
+		out << "0" << "*" << tree[0].name << "*" << tree[0].parent << "*" << tree[0].level << "*" << tree[0].rank << endl;
+		print(0, out);
+		
+	}
+	catch(exception& e) {
+		m->errorOut(e, "RawTrainingDataMaker", "print");
+		exit(1);
+	}
+}
+
+/**************************************************************************************************/
+
+void RawTrainingDataMaker::print(int i, ofstream& out){
+	try {
+		map<string,int>::iterator it;
+		for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
+			//string temp = tree[it->second].name + " " + tree[it->second].rank;
+			
+			//map<string, string>::iterator itSan;
+			//itSan = sanityCheck.find(temp);
+			
+			//if (itSan == sanityCheck.end()) {
+				out << it->second << "*" << tree[it->second].name << "*" << tree[it->second].parent << "*" << tree[it->second].level << "*" << tree[it->second].rank << endl;
+				//sanityCheck[temp] = temp;
+			//}
+			print(it->second, out);
+		}
+	}
+	catch(exception& e) {
+		m->errorOut(e, "RawTrainingDataMaker", "print");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
+
+	
diff --git a/rawtrainingdatamaker.h b/rawtrainingdatamaker.h
new file mode 100644
index 0000000..912d781
--- /dev/null
+++ b/rawtrainingdatamaker.h
@@ -0,0 +1,54 @@
+#ifndef RAWTRAININGDATAMAKER_H
+#define RAWTRAININGDATAMAKER_H
+
+/*
+ *  rawTrainingDataMaker.h
+ *  Mothur
+ *
+ *  Created by westcott on 4/21/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "mothurout.h"
+
+/**************************************************************************************************/
+
+struct rawTaxNode {
+	map<string, int> children;  //childs name to index in tree
+	int parent, level;
+	string name, rank;
+	
+	rawTaxNode(string n) : name(n), level(0), parent(-1) {		}
+	rawTaxNode(){}
+};
+
+/**************************************************************************************************/
+
+class RawTrainingDataMaker {
+
+public:
+	RawTrainingDataMaker();
+	RawTrainingDataMaker(string);  //pass it a taxonomy file and it makes the tree
+	~RawTrainingDataMaker() {};
+	int addSeqToTree(string, string);
+	void assignRank(int);
+	void print(ofstream&);
+	
+private:
+	string getNextTaxon(string&);
+	vector<rawTaxNode> tree;
+	void print(int, ofstream&);
+	int numNodes;
+	int numSeqs;
+	int maxLevel;
+	//map<string, string> sanityCheck;
+	MothurOut* m;
+};
+
+/**************************************************************************************************/
+
+#endif
+
+
diff --git a/readdistcommand.cpp b/readdistcommand.cpp
index bcecb78..7f60d71 100644
--- a/readdistcommand.cpp
+++ b/readdistcommand.cpp
@@ -175,7 +175,7 @@ void ReadDistCommand::help(){
 		m->mothurOut("For this use the read.dist command should be in the following format: \n");
 		m->mothurOut("read.dist(phylip=yourDistFile, name=yourNameFile, cutoff=yourCutoff, precision=yourPrecision) \n");
 		m->mothurOut("The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n");
-		m->mothurOut("The sim parameter is used to indicate that your distance file contains similiarity values instead of distance values. The default is false, if sim=true then mothur will convert the similairity values to distances. \n");
+		m->mothurOut("The sim parameter is used to indicate that your distance file contains similarity values instead of distance values. The default is false, if sim=true then mothur will convert the similarity values to distances. \n");
 		m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
 		m->mothurOut("The second way to use the read.dist command is to read a phylip or column and a group, so you can use the libshuff command.\n");
 		m->mothurOut("For this use the read.dist command should be in the following format: \n");
-- 
2.39.2