From cd9dbd8b53bbe32af3e9c6bead4aa6d796a278a5 Mon Sep 17 00:00:00 2001
From: westcott <westcott>
Date: Tue, 27 Apr 2010 17:54:59 +0000
Subject: [PATCH] reworked the classifiers summary file added groupfile option
 to classify.seqs, added mpi code to classifiers modification

---
 Mothur.xcodeproj/project.pbxproj         |   8 +-
 bayesian.cpp                             | 256 +++++++++++++++----
 bayesian.h                               |   2 +-
 classify.cpp                             |   2 +
 classifyseqscommand.cpp                  |  63 ++---
 classifyseqscommand.h                    |   2 +-
 distancecommand.cpp                      |   3 +-
 makefile                                 |  12 +-
 phylosummary.cpp                         | 274 ++++++++++++++++++++
 rawtrainingdatamaker.h => phylosummary.h |  23 +-
 phylotree.cpp                            | 306 ++++++++++++++++-------
 phylotree.h                              |  10 +-
 rawtrainingdatamaker.cpp                 | 188 --------------
 13 files changed, 778 insertions(+), 371 deletions(-)
 create mode 100644 phylosummary.cpp
 rename rawtrainingdatamaker.h => phylosummary.h (65%)
 delete mode 100644 rawtrainingdatamaker.cpp
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj
index 805bde0..cc3b4b6 100644
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -12,8 +12,8 @@
 		A747E81C116365E000FB9042 /* chimeraslayercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayercommand.h; sourceTree = "<group>"; };
 		A747E81D116365E000FB9042 /* chimeraslayercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayercommand.cpp; sourceTree = "<group>"; };
 		A7639F8D1175DF35008F5578 /* makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = makefile; sourceTree = "<group>"; };
-		A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rawtrainingdatamaker.h; sourceTree = SOURCE_ROOT; };
-		A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rawtrainingdatamaker.cpp; sourceTree = SOURCE_ROOT; };
+		A76AAD02117F322B003D8DA1 /* phylosummary.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylosummary.h; sourceTree = "<group>"; };
+		A76AAD03117F322B003D8DA1 /* phylosummary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylosummary.cpp; sourceTree = "<group>"; };
 		A78254461164D7790002E2DD /* chimerapintailcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerapintailcommand.h; sourceTree = "<group>"; };
 		A78254471164D7790002E2DD /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerapintailcommand.cpp; sourceTree = "<group>"; };
 		A7825502116519F70002E2DD /* chimerabellerophoncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerabellerophoncommand.h; sourceTree = "<group>"; };
@@ -831,8 +831,8 @@
 				A7DA208A113FECD400BF472F /* knn.h */,
 				A7DA20C4113FECD400BF472F /* phylotree.cpp */,
 				A7DA20C5113FECD400BF472F /* phylotree.h */,
-				A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */,
-				A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */,
+				A76AAD02117F322B003D8DA1 /* phylosummary.h */,
+				A76AAD03117F322B003D8DA1 /* phylosummary.cpp */,
 				A7DA215C113FECD400BF472F /* taxonomyequalizer.cpp */,
 				A7DA215D113FECD400BF472F /* taxonomyequalizer.h */,
 			);
diff --git a/bayesian.cpp b/bayesian.cpp
index 0929749..e5543cd 100644
--- a/bayesian.cpp
+++ b/bayesian.cpp
@@ -9,7 +9,7 @@
 
 #include "bayesian.h"
 #include "kmer.hpp"
-#include "rawtrainingdatamaker.h"
+#include "phylosummary.h"
 
 /**************************************************************************************************/
 Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) : 
@@ -18,30 +18,30 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
 					
 		/************calculate the probablity that each word will be in a specific taxonomy*************/
 		string phyloTreeName = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.train";
-		ifstream phyloTreeTest(phyloTreeName.c_str());
-		
-		ofstream out;
 		string probFileName = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob";
-		ifstream probFileTest(probFileName.c_str());
+		string probFileName2 = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero";
 		
+		ofstream out;
 		ofstream out2;
-		string probFileName2 = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero";
+		
+		ifstream phyloTreeTest(phyloTreeName.c_str());
 		ifstream probFileTest2(probFileName2.c_str());
+		ifstream probFileTest(probFileName.c_str());
 		
 		int start = time(NULL);
 		
 		if(probFileTest && probFileTest2 && phyloTreeTest){	
 			m->mothurOut("Reading template taxonomy...     "); cout.flush();
 			
-			phyloTree = new PhyloTree(phyloTreeTest);
-	
+			phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
+			
 			m->mothurOut("DONE."); m->mothurOutEndLine();
 			
 			genusNodes = phyloTree->getGenusNodes(); 
 			genusTotals = phyloTree->getGenusTotals();
 		
 			m->mothurOut("Reading template probabilities...     "); cout.flush();
-			readProbFile(probFileTest, probFileTest2);	
+			readProbFile(probFileTest, probFileTest2, probFileName, probFileName2);	
 			
 		}else{
 		
@@ -65,6 +65,14 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
 			wordGenusProb.resize(numKmers);
 		
 			for (int j = 0; j < wordGenusProb.size(); j++) {	wordGenusProb[j].resize(genusNodes.size());		}
+			
+			
+			#ifdef USE_MPI
+				int pid;
+				MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+				if (pid == 0) {  
+			#endif
 
 			ofstream out;
 			openOutputFile(probFileName, out);
@@ -74,12 +82,27 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
 			ofstream out2;
 			openOutputFile(probFileName2, out2);
 			
+			#ifdef USE_MPI
+				}
+			#endif
+
+			
 			//for each word
 			for (int i = 0; i < numKmers; i++) {
 				if (m->control_pressed) { break; }
 				
+				#ifdef USE_MPI
+					MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+					if (pid == 0) {  
+				#endif
+
 				out << i << '\t';
 				
+				#ifdef USE_MPI
+					}
+				#endif
+				
 				vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
 				
 				map<int, int> count;
@@ -98,20 +121,52 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
 				for (int k = 0; k < genusNodes.size(); k++) {
 					//probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
 					wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1));  
-					if (count[genusNodes[k]] != 0) {  out << k << '\t' << wordGenusProb[i][k] << '\t';  numNotZero++;  }
+					if (count[genusNodes[k]] != 0) { 
+						#ifdef USE_MPI
+							MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+							if (pid == 0) {  
+						#endif
+
+						out << k << '\t' << wordGenusProb[i][k] << '\t'; 
+						
+						#ifdef USE_MPI
+							}
+						#endif
+
+						numNotZero++;  
+					}
 				}
+				
+				#ifdef USE_MPI
+					MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+					if (pid == 0) {  
+				#endif
+				
 				out << endl;
 				out2 << probabilityInTemplate << '\t' << numNotZero << endl;
+				
+				#ifdef USE_MPI
+					}
+				#endif
 			}
 			
+			#ifdef USE_MPI
+				MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+				if (pid == 0) {  
+			#endif
+			
 			out.close();
 			out2.close();
 			
+			#ifdef USE_MPI
+				}
+			#endif
+			
 			//read in new phylotree with less info. - its faster
 			ifstream phyloTreeTest(phyloTreeName.c_str());
 			delete phyloTree;
 			
-			phyloTree = new PhyloTree(phyloTreeTest);
+			phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
 		}
 	
 		m->mothurOut("DONE."); m->mothurOutEndLine();
@@ -292,45 +347,162 @@ map<string, int> Bayesian::parseTaxMap(string newTax) {
 	}
 }
 /**************************************************************************************************/
-void Bayesian::readProbFile(ifstream& in, ifstream& inNum) {
+void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string inNumName) {
 	try{
 		
-		in >> numKmers; gobble(in);
-		
-		//initialze probabilities
-		wordGenusProb.resize(numKmers);
-		
-		for (int j = 0; j < wordGenusProb.size(); j++) {	wordGenusProb[j].resize(genusNodes.size());		}
+		#ifdef USE_MPI
+			
+			int pid, num, num2;
+			vector<long> positions;
+			vector<long> positions2;
+			
+			MPI_Status status; 
+			MPI_File inMPI;
+			MPI_File inMPI2;
+			MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+			char inFileName[1024];
+			strcpy(inFileName, inNumName.c_str());
+			
+			char inFileName2[1024];
+			strcpy(inFileName2, inName.c_str());
+
+			MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+			MPI_File_open(MPI_COMM_WORLD, inFileName2, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI2);  //comm, filename, mode, info, filepointer
+
+			if (pid == 0) {
+				positions = setFilePosEachLine(inNumName, num);
+				
+				//send file positions to all processes
+				MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
+				MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos	
+				
+				positions2 = setFilePosEachLine(inName, num2);
+				
+				//send file positions to all processes
+				MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
+				MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos	
+
+			}else{
+				MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+				positions.resize(num);
+				MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+				
+				MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+				positions2.resize(num2);
+				MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+
+			}
 		
-		int kmer, name, count;  count = 0;
-		vector<int> num; num.resize(numKmers);
-		float prob;
-		vector<float> zeroCountProb; zeroCountProb.resize(numKmers);		
-	
-		while (inNum) {
-			inNum >> zeroCountProb[count] >> num[count];  
-			count++;
-			gobble(inNum);
-		}
-		inNum.close();
-	
-		while(in) {
-			in >> kmer;
+			//read numKmers
+			int length = positions2[1] - positions2[0];
+			char* buf = new char[length];
+
+			MPI_File_read_at(inMPI2, positions2[0], buf, length, MPI_CHAR, &status);
+
+			string tempBuf = buf;
+			if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+			delete buf;
+
+			istringstream iss (tempBuf,istringstream::in);
+			iss >> numKmers;  
+			
+			//initialze probabilities
+			wordGenusProb.resize(numKmers);
+			
+			for (int j = 0; j < wordGenusProb.size(); j++) {	wordGenusProb[j].resize(genusNodes.size());		}
 			
-			//set them all to zero value
-			for (int i = 0; i < genusNodes.size(); i++) {
-				wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
+			int kmer, name;  
+			vector<int> numbers; numbers.resize(numKmers);
+			float prob;
+			vector<float> zeroCountProb; zeroCountProb.resize(numKmers);		
+
+			//read file 
+			for(int i=0;i<num;i++){
+				//read next sequence
+				length = positions[i+1] - positions[i];
+				char* buf4 = new char[length];
+
+				MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+
+				tempBuf = buf4;
+				if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+				delete buf4;
+
+				istringstream iss (tempBuf,istringstream::in);
+				iss >> zeroCountProb[i] >> numbers[i];  
 			}
 			
-			//get probs for nonzero values
-			for (int i = 0; i < num[kmer]; i++) {
-				in >> name >> prob;
-				wordGenusProb[kmer][name] = prob;
+			MPI_File_close(&inMPI);
+			
+			for(int i=1;i<num2;i++){
+				//read next sequence
+				length = positions2[i+1] - positions2[i];
+				char* buf4 = new char[length];
+
+				MPI_File_read_at(inMPI2, positions2[i], buf4, length, MPI_CHAR, &status);
+
+				tempBuf = buf4;
+				if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+				delete buf4;
+
+				istringstream iss (tempBuf,istringstream::in);
+				
+				iss >> kmer;
+				
+				//set them all to zero value
+				for (int i = 0; i < genusNodes.size(); i++) {
+					wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
+				}
+				
+				//get probs for nonzero values
+				for (int i = 0; i < numbers[kmer]; i++) {
+					iss >> name >> prob;
+					wordGenusProb[kmer][name] = prob;
+				}
+				
 			}
+			MPI_File_close(&inMPI2);
+		#else
+		
+			in >> numKmers; gobble(in);
 			
-			gobble(in);
-		}
-		in.close();
+			//initialze probabilities
+			wordGenusProb.resize(numKmers);
+			
+			for (int j = 0; j < wordGenusProb.size(); j++) {	wordGenusProb[j].resize(genusNodes.size());		}
+			
+			int kmer, name, count;  count = 0;
+			vector<int> num; num.resize(numKmers);
+			float prob;
+			vector<float> zeroCountProb; zeroCountProb.resize(numKmers);		
+		
+			while (inNum) {
+				inNum >> zeroCountProb[count] >> num[count];  
+				count++;
+				gobble(inNum);
+			}
+			inNum.close();
+		
+			while(in) {
+				in >> kmer;
+				
+				//set them all to zero value
+				for (int i = 0; i < genusNodes.size(); i++) {
+					wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
+				}
+				
+				//get probs for nonzero values
+				for (int i = 0; i < num[kmer]; i++) {
+					in >> name >> prob;
+					wordGenusProb[kmer][name] = prob;
+				}
+				
+				gobble(in);
+			}
+			in.close();
+			
+		#endif
 	}
 	catch(exception& e) {
 		m->errorOut(e, "Bayesian", "readProbFile");
diff --git a/bayesian.h b/bayesian.h
index 6e35e9f..70757be 100644
--- a/bayesian.h
+++ b/bayesian.h
@@ -34,7 +34,7 @@ private:
 	
 	string bootstrapResults(vector<int>, int, int);
 	int getMostProbableTaxonomy(vector<int>);
-	void readProbFile(ifstream&, ifstream&);
+	void readProbFile(ifstream&, ifstream&, string, string);
 	
 };
 
diff --git a/classify.cpp b/classify.cpp
index be287a0..147f499 100644
--- a/classify.cpp
+++ b/classify.cpp
@@ -237,6 +237,8 @@ int Classify::readTaxonomy(string file) {
 	
 		phyloTree->assignHeirarchyIDs(0);
 		
+		phyloTree->setUp(file);
+		
 		m->mothurOut("DONE.");
 		m->mothurOutEndLine();	cout.flush();
 		
diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp
index a38bf8f..3837849 100644
--- a/classifyseqscommand.cpp
+++ b/classifyseqscommand.cpp
@@ -11,6 +11,7 @@
 #include "sequence.hpp"
 #include "bayesian.h"
 #include "phylotree.h"
+#include "phylosummary.h"
 #include "knn.h"
 
 //**********************************************************************************************************************
@@ -25,7 +26,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
 		else {
 			
 			//valid paramters for this command
-			string AlignArray[] =  {"template","fasta","name","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"};
+			string AlignArray[] =  {"template","fasta","name","group","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"};
 			vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
 			
 			OptionParser parser(option);
@@ -62,6 +63,14 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
 					//if the user has not given a path then, add inputdir. else leave path alone.
 					if (path == "") {	parameters["taxonomy"] = inputDir + it->second;		}
 				}
+				
+				it = parameters.find("group");
+				//user has given a template file
+				if(it != parameters.end()){ 
+					path = hasPath(it->second);
+					//if the user has not given a path then, add inputdir. else leave path alone.
+					if (path == "") {	parameters["group"] = inputDir + it->second;		}
+				}
 			}
 
 			//check for required parameters
@@ -73,6 +82,10 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
 			}
 			else if (templateFileName == "not open") { abort = true; }	
 			
+			groupfile = validParameter.validFile(parameters, "group", true);
+			if (groupfile == "not open") { abort = true; }	
+			else if (groupfile == "not found") { groupfile = ""; }
+			
 			fastaFileName = validParameter.validFile(parameters, "fasta", false);
 			if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the classify.seqs command."); m->mothurOutEndLine(); abort = true;  }
 			else { 
@@ -250,6 +263,7 @@ void ClassifySeqsCommand::help(){
 		m->mothurOut("The template, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
 		m->mothurOut("The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer, blast and distance. The default is kmer.\n");
 		m->mothurOut("The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n");
+		m->mothurOut("The group parameter allows you add a group file so you can have the summary totals broken up by group.\n");
 		m->mothurOut("The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n");
 		m->mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
 		m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
@@ -491,50 +505,40 @@ int ClassifySeqsCommand::execute(){
 			m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
 			start = time(NULL);
 			
-			//make taxonomy tree from new taxonomy file 
-			PhyloTree taxaBrowser;
+			PhyloSummary taxaSum(taxonomyFileName, groupfile);
 			
 			if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {	remove(outputNames[i].c_str());	} delete classify; return 0; }
 			
-			ifstream in;
-			openInputFile(tempTaxonomyFile, in);
-		
-			//read in users taxonomy file and add sequences to tree
-			string name, taxon;
-			while(!in.eof()){
-				in >> name >> taxon; gobble(in);
-				
-				if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {	remove(outputNames[i].c_str());	} remove(tempTaxonomyFile.c_str()); delete classify; return 0; }
+			if (namefile == "") {  taxaSum.summarize(tempTaxonomyFile);  }
+			else {
+				ifstream in;
+				openInputFile(tempTaxonomyFile, in);
 				
-				if (namefile != "") {
+				//read in users taxonomy file and add sequences to tree
+				string name, taxon;
+				while(!in.eof()){
+					in >> name >> taxon; gobble(in);
+					
 					itNames = nameMap.find(name);
 		
 					if (itNames == nameMap.end()) { 
 						m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
 					}else{
 						for (int i = 0; i < itNames->second; i++) { 
-							taxaBrowser.addSeqToTree(name+toString(i), taxon);  //add it as many times as there are identical seqs
+							taxaSum.addSeqToTree(name+toString(i), taxon);  //add it as many times as there are identical seqs
 						}
 					}
-				}else {  taxaBrowser.addSeqToTree(name, taxon);  } //add it once
+				}
+				in.close();
 			}
-			in.close();
-	
-			taxaBrowser.assignHeirarchyIDs(0);
-			
-			if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {	remove(outputNames[i].c_str());	} remove(tempTaxonomyFile.c_str()); delete classify; return 0; }
-
-			taxaBrowser.binUnclassified();
-			
 			remove(tempTaxonomyFile.c_str());
 			
 			if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {	remove(outputNames[i].c_str());	} delete classify; return 0; }
-
 			
 			//print summary file
 			ofstream outTaxTree;
 			openOutputFile(taxSummary, outTaxTree);
-			taxaBrowser.print(outTaxTree);
+			taxaSum.print(outTaxTree);
 			outTaxTree.close();
 			
 			//output taxonomy with the unclassified bins added
@@ -546,9 +550,10 @@ int ClassifySeqsCommand::execute(){
 			openOutputFile(unclass, outTax);
 			
 			//get maxLevel from phylotree so you know how many 'unclassified's to add
-			int maxLevel = taxaBrowser.getMaxLevel();
+			int maxLevel = taxaSum.getMaxLevel();
 			
 			//read taxfile - this reading and rewriting is done to preserve the confidence scores.
+			string name, taxon;
 			while (!inTax.eof()) {
 				if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {	remove(outputNames[i].c_str());	} remove(unclass.c_str()); delete classify; return 0; }
 
@@ -564,6 +569,9 @@ int ClassifySeqsCommand::execute(){
 			remove(newTaxonomyFile.c_str());
 			rename(unclass.c_str(), newTaxonomyFile.c_str());
 			
+			m->mothurOutEndLine();
+			m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for  " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
+			
 			#ifdef USE_MPI	
 				}
 			#endif
@@ -572,10 +580,7 @@ int ClassifySeqsCommand::execute(){
 			m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 			for (int i = 0; i < outputNames.size(); i++) {	m->mothurOut(outputNames[i]); m->mothurOutEndLine();	}
 			m->mothurOutEndLine();
-
 			
-			//m->mothurOutEndLine();
-			//m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for  " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
 		}
 		
 		delete classify;
diff --git a/classifyseqscommand.h b/classifyseqscommand.h
index 0ffd18c..d19f862 100644
--- a/classifyseqscommand.h
+++ b/classifyseqscommand.h
@@ -48,7 +48,7 @@ private:
 	
 	Classify* classify;
 	
-	string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir;
+	string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile;
 	int processors, kmerSize, numWanted, cutoff, iters;
 	float match, misMatch, gapOpen, gapExtend;
 	bool abort, probs;
diff --git a/distancecommand.cpp b/distancecommand.cpp
index 10a99dd..da5d0f1 100644
--- a/distancecommand.cpp
+++ b/distancecommand.cpp
@@ -278,8 +278,7 @@ int DistanceCommand::execute(){
 					delete buf;
 
 					int count = 0;
-					while (count < fileSize) { //read 1000 characters at a time
-						//send freqs
+					while (count < fileSize) { 
 						char buf2[1];
 						MPI_File_read(inMPI, buf2, 1, MPI_CHAR, &status);
 						MPI_File_write(outMPI, buf2, 1, MPI_CHAR, &status);
diff --git a/makefile b/makefile
index 5812d45..d4aac7f 100644
--- a/makefile
+++ b/makefile
@@ -220,7 +220,7 @@ mothur : \
 		./classify.o\
 		./phylotree.o\
 		./bayesian.o\
-		./rawtrainingdatamaker.o\
+		./phylosummary.o\
 		./alignmentdb.o\
 		./knn.o\
 		./distancedb.o\
@@ -418,7 +418,7 @@ mothur : \
 		./classify.o\
 		./phylotree.o\
 		./bayesian.o\
-		./rawtrainingdatamaker.o\
+		./phylosummary.o\
 		./alignmentdb.o\
 		./knn.o\
 		./distancedb.o\
@@ -619,7 +619,7 @@ clean :
 		./classify.o\
 		./phylotree.o\
 		./bayesian.o\
-		./rawtrainingdatamaker.o\
+		./phylosummary.o\
 		./alignmentdb.o\
 		./knn.o\
 		./distancedb.o\
@@ -1623,9 +1623,9 @@ install : mothur
 ./chimerabellerophoncommand.o : chimerabellerophoncommand.cpp
 	$(CC) $(CC_OPTIONS) chimerabellerophoncommand.cpp -c $(INCLUDE) -o ./chimerabellerophoncommand.o
 
-# Item # 171 -- rawtrainingdatamaker --
-./rawtrainingdatamaker.o : rawtrainingdatamaker.cpp
-	$(CC) $(CC_OPTIONS) rawtrainingdatamaker.cpp -c $(INCLUDE) -o ./rawtrainingdatamaker.o
+# Item # 171 -- phylosummary --
+./phylosummary.o : phylosummary.cpp
+	$(CC) $(CC_OPTIONS) phylosummary.cpp -c $(INCLUDE) -o ./phylosummary.o
 
 
 
diff --git a/phylosummary.cpp b/phylosummary.cpp
new file mode 100644
index 0000000..1e8d1bc
--- /dev/null
+++ b/phylosummary.cpp
@@ -0,0 +1,274 @@
+/*
+ *  rawTrainingDataMaker.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 4/21/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "phylosummary.h"
+
+/**************************************************************************************************/
+
+PhyloSummary::PhyloSummary(string refTfile, string groupFile){
+	try {
+		m = MothurOut::getInstance();
+		maxLevel = 0;
+		
+		if (groupFile != "") {
+			groupmap = new GroupMap(groupFile);
+			groupmap->readMap();
+		}else{
+			groupmap = NULL;
+		}
+				
+		//check for necessary files
+		string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum";
+		ifstream FileTest(taxFileNameTest.c_str());
+		
+		if (!FileTest) { 
+			m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
+		}else{
+			readTreeStruct(FileTest);
+		}
+		
+		tree[0].rank = "0";
+		assignRank(0);
+
+	}
+	catch(exception& e) {
+		m->errorOut(e, "PhyloSummary", "PhyloSummary");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
+void PhyloSummary::summarize(string userTfile){
+	try {
+		
+		ifstream in;
+		openInputFile(userTfile, in);
+		
+		//read in users taxonomy file and add sequences to tree
+		string name, tax;
+		while(!in.eof()){
+			in >> name >> tax; gobble(in);
+			
+			addSeqToTree(name, tax);
+			
+			if (m->control_pressed) { break;  }
+		}
+		in.close();
+	}
+	catch(exception& e) {
+		m->errorOut(e, "PhyloSummary", "summarize");
+		exit(1);
+	}
+}
+
+/**************************************************************************************************/
+
+string PhyloSummary::getNextTaxon(string& heirarchy){
+	try {
+		string currentLevel = "";
+		if(heirarchy != ""){
+			int pos = heirarchy.find_first_of(';');
+			currentLevel=heirarchy.substr(0,pos);
+			if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
+			else { heirarchy = ""; }
+		}
+		return currentLevel;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "PhyloSummary", "getNextTaxon");
+		exit(1);
+	}
+}
+
+/**************************************************************************************************/
+
+int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
+	try {
+		numSeqs++;
+		
+		map<string, int>::iterator childPointer;
+		
+		int currentNode = 0;
+		string taxon;
+		
+		int level = 0;
+		
+		while (seqTaxonomy != "") {
+			
+			if (m->control_pressed) { return 0; }
+			
+			//somehow the parent is getting one too many accnos
+			//use print to reassign the taxa id
+			taxon = getNextTaxon(seqTaxonomy);
+			
+			childPointer = tree[currentNode].children.find(taxon);
+			
+			if(childPointer != tree[currentNode].children.end()){	//if the node already exists, update count and move on
+				if (groupmap != NULL) {
+					//find out the sequences group
+					string group = groupmap->getGroup(seqName);
+					
+					//do you have a count for this group?
+					map<string, int>::iterator itGroup = tree[currentNode].groupCount.find(group);
+					
+					//if yes, increment it - there should not be a case where we can't find it since we load group in read
+					if (itGroup != tree[currentNode].groupCount.end()) {
+						tree[currentNode].groupCount[group]++;
+					}
+				}
+				
+				tree[currentNode].total++;
+
+				currentNode = childPointer->second;
+			}else{											//otherwise, create it
+				m->mothurOut("Error: cannot find taxonomy in tree for " + seqName + "."); m->mothurOutEndLine();
+				seqTaxonomy = "";
+			}
+			
+			level++;
+			
+			if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
+				for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
+			}
+		}
+
+	}
+	catch(exception& e) {
+		m->errorOut(e, "PhyloSummary", "addSeqToTree");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
+void PhyloSummary::assignRank(int index){
+	try {
+		map<string,int>::iterator it;
+		int counter = 1;
+		
+		for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
+			tree[it->second].rank = tree[index].rank + '.' + toString(counter);
+			counter++;
+									
+			assignRank(it->second);
+		}
+	}
+	catch(exception& e) {
+		m->errorOut(e, "PhyloSummary", "assignRank");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
+void PhyloSummary::print(ofstream& out){
+	try {
+		//print labels
+		out << "taxlevel\t rank ID\t label\t daughterlevels\t total\t";
+		if (groupmap != NULL) {
+			for (int i = 0; i < groupmap->namesOfGroups.size(); i++) {
+				out << groupmap->namesOfGroups[i] << '\t';
+			}
+		}
+		
+		out << endl;
+		
+		//print root
+		out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << tree[0].children.size() << "\t" << tree[0].total << "\t";
+		
+		map<string, int>::iterator itGroup;
+		if (groupmap != NULL) {
+			for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
+				out << itGroup->second << '\t';
+			}
+		}
+		out << endl;
+		
+		//print rest
+		print(0, out);
+		
+	}
+	catch(exception& e) {
+		m->errorOut(e, "PhyloSummary", "print");
+		exit(1);
+	}
+}
+
+/**************************************************************************************************/
+
+void PhyloSummary::print(int i, ofstream& out){
+	try {
+		map<string,int>::iterator it;
+		for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
+			
+			if (tree[it->second].total != 0)  {
+				out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << tree[it->second].children.size() << "\t" << tree[it->second].total << "\t";
+				
+				map<string, int>::iterator itGroup;
+				if (groupmap != NULL) {
+					for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
+						out << itGroup->second << '\t';
+					}
+				}
+				out << endl;
+			}
+			
+			print(it->second, out);
+		}
+	}
+	catch(exception& e) {
+		m->errorOut(e, "PhyloSummary", "print");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+void PhyloSummary::readTreeStruct(ifstream& in){
+	try {
+		int num;
+		
+		in >> num; gobble(in);
+		
+		tree.resize(num);
+	
+		//read the tree file
+		for (int i = 0; i < tree.size(); i++) {
+	
+			in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
+			
+			//set children
+			string childName;
+			int childIndex;
+			for (int j = 0; j < num; j++) {
+				in >> childName >> childIndex;
+				tree[i].children[childName] = childIndex;
+			}
+			
+			//initialize groupcounts
+			if (groupmap != NULL) {
+				for (int j = 0; j < groupmap->namesOfGroups.size(); j++) {
+					tree[i].groupCount[groupmap->namesOfGroups[j]] = 0;
+				}
+			}
+			
+			tree[i].total = 0;
+			
+			gobble(in);
+			
+			if (tree[i].level > maxLevel) {  maxLevel = tree[i].level;  }
+		}
+
+	}
+	catch(exception& e) {
+		m->errorOut(e, "PhyloSummary", "print");
+		exit(1);
+	}
+}
+
+/**************************************************************************************************/
+
+
+	
diff --git a/rawtrainingdatamaker.h b/phylosummary.h
similarity index 65%
rename from rawtrainingdatamaker.h
rename to phylosummary.h
index 912d781..9ebdf81 100644
--- a/rawtrainingdatamaker.h
+++ b/phylosummary.h
@@ -12,6 +12,7 @@
 
 #include "mothur.h"
 #include "mothurout.h"
+#include "groupmap.h"
 
 /**************************************************************************************************/
 
@@ -19,31 +20,37 @@ struct rawTaxNode {
 	map<string, int> children;  //childs name to index in tree
 	int parent, level;
 	string name, rank;
+	map<string, int> groupCount;
+	int total;
 	
-	rawTaxNode(string n) : name(n), level(0), parent(-1) {		}
+	rawTaxNode(string n) : name(n), level(0), parent(-1), total(0) {}
 	rawTaxNode(){}
 };
 
 /**************************************************************************************************/
-
-class RawTrainingDataMaker {
+//doesn't use MPI ifdefs since only pid 0 uses this class
+class PhyloSummary {
 
 public:
-	RawTrainingDataMaker();
-	RawTrainingDataMaker(string);  //pass it a taxonomy file and it makes the tree
-	~RawTrainingDataMaker() {};
+	PhyloSummary(string, string);
+	~PhyloSummary() { if (groupmap != NULL)  {  delete groupmap;  }  }
+	
+	void summarize(string);  //pass it a taxonomy file and a group file and it makes the tree
 	int addSeqToTree(string, string);
-	void assignRank(int);
 	void print(ofstream&);
+	int getMaxLevel() { return maxLevel; }
 	
 private:
 	string getNextTaxon(string&);
 	vector<rawTaxNode> tree;
 	void print(int, ofstream&);
+	void assignRank(int);
+	void readTreeStruct(ifstream&);
+	GroupMap* groupmap;
+	
 	int numNodes;
 	int numSeqs;
 	int maxLevel;
-	//map<string, string> sanityCheck;
 	MothurOut* m;
 };
 
diff --git a/phylotree.cpp b/phylotree.cpp
index 33aedc6..8c48e4f 100644
--- a/phylotree.cpp
+++ b/phylotree.cpp
@@ -28,34 +28,78 @@ PhyloTree::PhyloTree(){
 }
 /**************************************************************************************************/
 
-PhyloTree::PhyloTree(ifstream& in){
+PhyloTree::PhyloTree(ifstream& in, string filename){
 	try {
 		m = MothurOut::getInstance();
 		calcTotals = false;
 		
-		in >> numNodes; gobble(in);
-		
-		tree.resize(numNodes);
-		
-		for (int i = 0; i < tree.size(); i++) {
-			in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
- 
-		}
-		
-		//read genus nodes
-		int numGenus = 0;
-		in >> numGenus; gobble(in);
-	
-		int gnode, gsize;
-		totals.clear();
-		for (int i = 0; i < numGenus; i++) {
-			in >> gnode >> gsize; gobble(in);
+		#ifdef USE_MPI
+			MPI_File inMPI;
+			MPI_Offset size;
+			MPI_Status status;
+
+			char inFileName[1024];
+			strcpy(inFileName, filename.c_str());
+
+			MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
+			MPI_File_get_size(inMPI, &size);
 			
-			uniqueTaxonomies[gnode] = gnode;
-			totals.push_back(gsize);
-		}
+			char* buffer = new char[size];
+			MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
+
+			string tempBuf = buffer;
+			if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
+			istringstream iss (tempBuf,istringstream::in);
+			delete buffer;
+			
+			iss >> numNodes; gobble(iss);
+			
+			tree.resize(numNodes);
+			
+			for (int i = 0; i < tree.size(); i++) {
+				iss >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(iss);
+			}
 			
-		in.close();
+			//read genus nodes
+			int numGenus = 0;
+			iss >> numGenus; gobble(iss);
+			
+			int gnode, gsize;
+			totals.clear();
+			for (int i = 0; i < numGenus; i++) {
+				iss >> gnode >> gsize; gobble(iss);
+				
+				uniqueTaxonomies[gnode] = gnode;
+				totals.push_back(gsize);
+			}
+			
+			MPI_File_close(&inMPI);
+			
+		#else
+			in >> numNodes; gobble(in);
+			
+			tree.resize(numNodes);
+			
+			for (int i = 0; i < tree.size(); i++) {
+				in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
+			}
+			
+			//read genus nodes
+			int numGenus = 0;
+			in >> numGenus; gobble(in);
+			
+			int gnode, gsize;
+			totals.clear();
+			for (int i = 0; i < numGenus; i++) {
+				in >> gnode >> gsize; gobble(in);
+				
+				uniqueTaxonomies[gnode] = gnode;
+				totals.push_back(gsize);
+			}
+			
+			in.close();
+			
+		#endif
 		
 	}
 	catch(exception& e) {
@@ -74,20 +118,70 @@ PhyloTree::PhyloTree(string tfile){
 		tree[0].heirarchyID = "0";
 		maxLevel = 0;
 		calcTotals = true;
+		string name, tax;
+
 		
-		ifstream in;
-		openInputFile(tfile, in);
+		#ifdef USE_MPI
+			int pid, num;
+			vector<long> positions;
+			
+			MPI_Status status; 
+			MPI_File inMPI;
+			MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+			char inFileName[1024];
+			strcpy(inFileName, tfile.c_str());
+
+			MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+
+			if (pid == 0) {
+				positions = setFilePosEachLine(tfile, num);
+				
+				//send file positions to all processes
+				MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
+				MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos	
+			}else{
+				MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+				positions.resize(num);
+				MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+			}
 		
-		//read in users taxonomy file and add sequences to tree
-		string name, tax;
-		while(!in.eof()){
-			in >> name >> tax; gobble(in);
+			//read file 
+			for(int i=0;i<num;i++){
+				//read next sequence
+				int length = positions[i+1] - positions[i];
+				char* buf4 = new char[length];
+
+				MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+
+				string tempBuf = buf4;
+				if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+				delete buf4;
+
+				istringstream iss (tempBuf,istringstream::in);
+				iss >> name >> tax;
+				addSeqToTree(name, tax);
+			}
 			
-			addSeqToTree(name, tax);
-		}
-		in.close();
-	
+			MPI_File_close(&inMPI);
+		
+		#else
+			ifstream in;
+			openInputFile(tfile, in);
+			
+			//read in users taxonomy file and add sequences to tree
+			while(!in.eof()){
+				in >> name >> tax; gobble(in);
+				
+				addSeqToTree(name, tax);
+			}
+			in.close();
+		#endif
+		
 		assignHeirarchyIDs(0);
+		
+		//create file for summary if needed
+		setUp(tfile);
 	}
 	catch(exception& e) {
 		m->errorOut(e, "PhyloTree", "PhyloTree");
@@ -232,15 +326,49 @@ void PhyloTree::assignHeirarchyIDs(int index){
 	}
 }
 /**************************************************************************************************/
-void PhyloTree::binUnclassified(){
+void PhyloTree::setUp(string tfile){
+	try{
+		string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
+		
+		#ifdef USE_MPI
+			int pid;
+			MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+			if (pid == 0) {  binUnclassified(taxFileNameTest);  }
+		
+		#else
+			//create file needed for summary if it doesn't exist
+			ifstream FileTest(taxFileNameTest.c_str());
+			
+			if (!FileTest) { 
+				binUnclassified(taxFileNameTest); 
+			}
+		#endif
+	}
+	catch(exception& e) {
+		m->errorOut(e, "PhyloTree", "setUp");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+void PhyloTree::binUnclassified(string file){
 	try {
+	
+		ofstream out;
+		openOutputFile(file, out);
+		
 		map<string, int>::iterator itBin;
 		map<string, int>::iterator childPointer;
 		
+		vector<TaxNode> copy = tree;
+		int copyNodes = numNodes;
+		
 		//go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
 		for (itBin = name2Taxonomy.begin(); itBin != name2Taxonomy.end(); itBin++) {
-		
-			int level = tree[itBin->second].level;
+			
+			if (m->control_pressed) {  out.close(); break;  }
+			
+			int level = copy[itBin->second].level;
 			int currentNode = itBin->second;
 			
 			//this sequence is unclassified at some levels
@@ -251,34 +379,30 @@ void PhyloTree::binUnclassified(){
 				string taxon = "unclassified";	
 				
 				//does the parent have a child names 'unclassified'?
-				childPointer = tree[currentNode].children.find(taxon);
+				childPointer = copy[currentNode].children.find(taxon);
 				
-				if(childPointer != tree[currentNode].children.end()){	//if the node already exists, move on
+				if(childPointer != copy[currentNode].children.end()){	//if the node already exists, move on
 					currentNode = childPointer->second; //currentNode becomes 'unclassified'
-					tree[currentNode].accessions.push_back(itBin->first);  //add this seq
-					name2Taxonomy[itBin->first] = currentNode;
+					copy[currentNode].accessions.push_back(itBin->first);  //add this seq
 				}
 				else{											//otherwise, create it
-					tree.push_back(TaxNode(taxon));
-					numNodes++;
-					tree[currentNode].children[taxon] = numNodes-1;
-					tree[numNodes-1].parent = currentNode;
+					copy.push_back(TaxNode(taxon));
+					copyNodes++;
+					copy[currentNode].children[taxon] = copyNodes-1;
+					copy[copyNodes-1].parent = currentNode;
+					copy[copyNodes-1].level = copy[currentNode].level + 1;
 									
-					currentNode = tree[currentNode].children[taxon];
-					tree[currentNode].accessions.push_back(itBin->first);
-					name2Taxonomy[itBin->first] = currentNode;
+					currentNode = copy[currentNode].children[taxon];
+					copy[currentNode].accessions.push_back(itBin->first);
 				}
-				
-				if (level == maxLevel) {   uniqueTaxonomies[currentNode] = currentNode;	}
 			}
 		}
 		
-		//clear HeirarchyIDs and reset them
-		for (int i = 1; i < tree.size(); i++) {
-			tree[i].heirarchyID = "";
+		if (!m->control_pressed) {
+			//print copy tree
+			print(out, copy);
 		}
-		assignHeirarchyIDs(0);
-		
+				
 	}
 	catch(exception& e) {
 		m->errorOut(e, "PhyloTree", "binUnclassified");
@@ -306,26 +430,22 @@ string PhyloTree::getFullTaxonomy(string seqName) {
 }
 /**************************************************************************************************/
 
-void PhyloTree::print(ofstream& out){
-	try {
-		out << tree[0].level << '\t'<< tree[0].heirarchyID << '\t' << tree[0].name << '\t' << tree[0].children.size() << '\t' << tree[0].accessions.size() << endl;
-		print(0, out);
-	}
-	catch(exception& e) {
-		m->errorOut(e, "PhyloTree", "print");
-		exit(1);
-	}
-}
-
-/**************************************************************************************************/
-
-void PhyloTree::print(int i, ofstream& out){
+void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
 	try {
-		map<string,int>::iterator it;
-		for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
-			out <<tree[it->second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl;
-			print(it->second, out);
+		out << copy.size() << endl;
+		
+		for (int i = 0; i < copy.size(); i++) {
+	
+			out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
+			
+			map<string,int>::iterator it;
+			for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
+				out << it->first << '\t' << it->second << '\t';
+			}
+			out << endl;
 		}
+		
+		out.close();
 	}
 	catch(exception& e) {
 		m->errorOut(e, "PhyloTree", "print");
@@ -335,23 +455,37 @@ void PhyloTree::print(int i, ofstream& out){
 /**************************************************************************************************/
 void PhyloTree::printTreeNodes(string treefilename) {
 	try {
-		ofstream outTree;
-		openOutputFile(treefilename, outTree);
-		
-		//print treenodes
-		outTree << tree.size() << endl;
-		for (int i = 0; i < tree.size(); i++) {
-			outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
-		}
-		
-		//print genus nodes
-		outTree << endl << uniqueTaxonomies.size() << endl;
-		map<int, int>::iterator it2;
-		for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;	}
-		outTree << endl;
+	
+		#ifdef USE_MPI
+			int pid;
+			MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+			if (pid == 0) {  
 		
+		#endif
+
+			ofstream outTree;
+			openOutputFile(treefilename, outTree);
+			
+			//print treenodes
+			outTree << tree.size() << endl;
+			for (int i = 0; i < tree.size(); i++) {
+				outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
+			}
+			
+			//print genus nodes
+			outTree << endl << uniqueTaxonomies.size() << endl;
+			map<int, int>::iterator it2;
+			for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;	}
+			outTree << endl;
+			
+			
+			outTree.close();
 		
-		outTree.close();
+		#ifdef USE_MPI
+			}
+		#endif
+
 		
 	}
 	catch(exception& e) {
diff --git a/phylotree.h b/phylotree.h
index b65a651..a57cf25 100644
--- a/phylotree.h
+++ b/phylotree.h
@@ -32,15 +32,14 @@ class PhyloTree {
 public:
 	PhyloTree();
 	PhyloTree(string);  //pass it a taxonomy file and it makes the tree
-	PhyloTree(ifstream&);  //pass it a taxonomy file and it makes the train.tree
+	PhyloTree(ifstream&, string);  //pass it a taxonomy file and it makes the train.tree
 	~PhyloTree() {};
 	int addSeqToTree(string, string);
 	void assignHeirarchyIDs(int);
-	void print(ofstream&);
 	void printTreeNodes(string); //used by bayesian to save time
 	vector<int> getGenusNodes();
 	vector<int> getGenusTotals();	
-	void binUnclassified();
+	void setUp(string);  //used to create file needed for summary file if you use () constructor and add seqs manually instead of passing taxonomyfile
 		
 	TaxNode get(int i)				{	return tree[i];							}
 	TaxNode get(string seqName)		{	return tree[name2Taxonomy[seqName]];	}
@@ -52,12 +51,15 @@ public:
 	
 private:
 	string getNextTaxon(string&);
+	void print(ofstream&, vector<TaxNode>&);
+	void binUnclassified(string);
+	
 	vector<TaxNode> tree;
 	vector<int> genusIndex; //holds the indexes in tree where the genus level taxonomies are stored
 	vector<int> totals; //holds the numSeqs at each genus level taxonomy
 	map<string, int> name2Taxonomy;  //maps name to index in tree
 	map<int, int> uniqueTaxonomies;  //map of unique taxonomies
-	void print(int, ofstream&);
+	//void print(int, ofstream&);
 	int numNodes;
 	int numSeqs;
 	int maxLevel;
diff --git a/rawtrainingdatamaker.cpp b/rawtrainingdatamaker.cpp
deleted file mode 100644
index 362cdb7..0000000
--- a/rawtrainingdatamaker.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- *  rawTrainingDataMaker.cpp
- *  Mothur
- *
- *  Created by westcott on 4/21/10.
- *  Copyright 2010 Schloss Lab. All rights reserved.
- *
- */
-
-#include "rawtrainingdatamaker.h"
-
-/**************************************************************************************************/
-
-RawTrainingDataMaker::RawTrainingDataMaker(){
-	try {
-		m = MothurOut::getInstance();
-		numNodes = 1;
-		numSeqs = 0;
-		tree.push_back(rawTaxNode("Root"));
-		tree[0].rank = "Root";
-		maxLevel = 0;
-	}
-	catch(exception& e) {
-		m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
-		exit(1);
-	}
-}
-/**************************************************************************************************/
-
-RawTrainingDataMaker::RawTrainingDataMaker(string tfile){
-	try {
-		m = MothurOut::getInstance();
-		numNodes = 1;
-		numSeqs = 0;
-		tree.push_back(rawTaxNode("Root"));
-		tree[0].rank = "Root";
-		maxLevel = 0;
-		
-		ifstream in;
-		openInputFile(tfile, in);
-		
-		//read in users taxonomy file and add sequences to tree
-		string name, tax;
-		while(!in.eof()){
-			in >> name >> tax; gobble(in);
-			
-			addSeqToTree(name, tax);
-		}
-		in.close();
-	
-		assignRank(0);
-	}
-	catch(exception& e) {
-		m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
-		exit(1);
-	}
-}
-
-/**************************************************************************************************/
-
-string RawTrainingDataMaker::getNextTaxon(string& heirarchy){
-	try {
-		string currentLevel = "";
-		if(heirarchy != ""){
-			int pos = heirarchy.find_first_of(';');
-			currentLevel=heirarchy.substr(0,pos);
-			if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
-			else { heirarchy = ""; }
-		}
-		return currentLevel;
-	}
-	catch(exception& e) {
-		m->errorOut(e, "RawTrainingDataMaker", "getNextTaxon");
-		exit(1);
-	}
-}
-
-/**************************************************************************************************/
-
-int RawTrainingDataMaker::addSeqToTree(string seqName, string seqTaxonomy){
-	try {
-		numSeqs++;
-		
-		map<string, int>::iterator childPointer;
-		
-		int currentNode = 0;
-		string taxon;
-		
-		while(seqTaxonomy != ""){
-			
-			if (m->control_pressed) { return 0; }
-			
-			//somehow the parent is getting one too many accnos
-			//use print to reassign the taxa id
-			taxon = getNextTaxon(seqTaxonomy);
-			
-			childPointer = tree[currentNode].children.find(taxon);
-			
-			if(childPointer != tree[currentNode].children.end()){	//if the node already exists, move on
-				currentNode = childPointer->second;
-			}else{											//otherwise, create it
-				tree.push_back(rawTaxNode(taxon));
-				numNodes++;
-				tree[currentNode].children[taxon] = numNodes-1;
-				tree[numNodes-1].parent = currentNode;
-				
-				currentNode = tree[currentNode].children[taxon];
-			}
-		}
-
-	}
-	catch(exception& e) {
-		m->errorOut(e, "RawTrainingDataMaker", "addSeqToTree");
-		exit(1);
-	}
-}
-/**************************************************************************************************/
-
-void RawTrainingDataMaker::assignRank(int index){
-	try {
-		map<string,int>::iterator it;
-				
-		string ranks[9] = { "Root","Domain","Kingdom","Phylum","Class","Order","Family","Genus","Species" };
-		
-		for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
-			tree[it->second].level = tree[index].level + 1;
-			
-			if (tree[it->second].level > 8) { 
-				tree[it->second].rank = ("unknown" + toString(tree[it->second].level));
-			}else {
-				tree[it->second].rank = ranks[tree[it->second].level];
-			}
-						
-			//save maxLevel for binning the unclassified seqs
-			if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } 
-			
-			assignRank(it->second);
-		}
-	}
-	catch(exception& e) {
-		m->errorOut(e, "RawTrainingDataMaker", "assignRank");
-		exit(1);
-	}
-}
-/**************************************************************************************************/
-
-void RawTrainingDataMaker::print(ofstream& out){
-	try {
-		//string temp = tree[0].name +" " + tree[0].rank;
-		//sanityCheck[temp] = temp;
-		
-		out << "0" << "*" << tree[0].name << "*" << tree[0].parent << "*" << tree[0].level << "*" << tree[0].rank << endl;
-		print(0, out);
-		
-	}
-	catch(exception& e) {
-		m->errorOut(e, "RawTrainingDataMaker", "print");
-		exit(1);
-	}
-}
-
-/**************************************************************************************************/
-
-void RawTrainingDataMaker::print(int i, ofstream& out){
-	try {
-		map<string,int>::iterator it;
-		for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
-			//string temp = tree[it->second].name + " " + tree[it->second].rank;
-			
-			//map<string, string>::iterator itSan;
-			//itSan = sanityCheck.find(temp);
-			
-			//if (itSan == sanityCheck.end()) {
-				out << it->second << "*" << tree[it->second].name << "*" << tree[it->second].parent << "*" << tree[it->second].level << "*" << tree[it->second].rank << endl;
-				//sanityCheck[temp] = temp;
-			//}
-			print(it->second, out);
-		}
-	}
-	catch(exception& e) {
-		m->errorOut(e, "RawTrainingDataMaker", "print");
-		exit(1);
-	}
-}
-/**************************************************************************************************/
-
-
-	
-- 
2.39.2