From b5a791c81d432082bf38755a08b33863f255341d Mon Sep 17 00:00:00 2001
From: westcott <westcott>
Date: Wed, 23 Dec 2009 18:47:52 +0000
Subject: [PATCH] precluster command finished

---
 Mothur.xcodeproj/project.pbxproj |   6 +
 commandfactory.cpp               |   3 +
 distancecommand.cpp              |   2 +-
 distancecommand.h                |   2 +-
 hcluster.cpp                     |  97 +++++++++--
 hcluster.h                       |   8 +-
 hclustercommand.cpp              | 105 ++----------
 hclustercommand.h                |  18 +--
 mgclustercommand.cpp             | 106 ++----------
 mgclustercommand.h               |   8 +-
 mothur.h                         |  96 ++++++++++-
 preclustercommand.cpp            | 269 +++++++++++++++++++++++++++++++
 preclustercommand.h              |  59 +++++++
 readblast.cpp                    |   8 +-
 readblast.h                      |  13 +-
 readcluster.cpp                  |  67 +-------
 readcluster.h                    |   3 -
 17 files changed, 563 insertions(+), 307 deletions(-)
 create mode 100644 preclustercommand.cpp
 create mode 100644 preclustercommand.h
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj
index 5ae6f33..1fbb5b7 100644
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -158,6 +158,7 @@
 		7EC3D4550FA0FFF900338DA5 /* coverage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7EC3D4500FA0FFF900338DA5 /* coverage.cpp */; };
 		7EC3D4560FA0FFF900338DA5 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7EC3D4530FA0FFF900338DA5 /* whittaker.cpp */; };
 		8DD76F6A0486A84900D96B5E /* Mothur.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = C6859E8B029090EE04C91782 /* Mothur.1 */; };
+		A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */; };
 		A70B53AA0F4CD7AD0064797E /* getgroupcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */; };
 		A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; };
 		A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; };
@@ -526,6 +527,8 @@
 		7EC3D4530FA0FFF900338DA5 /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = SOURCE_ROOT; };
 		7EC3D4540FA0FFF900338DA5 /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = SOURCE_ROOT; };
 		8DD76F6C0486A84900D96B5E /* mothur */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = mothur; sourceTree = BUILT_PRODUCTS_DIR; };
+		A7027F2610DFF86D00BF45FE /* preclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = preclustercommand.h; sourceTree = SOURCE_ROOT; };
+		A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = preclustercommand.cpp; sourceTree = SOURCE_ROOT; };
 		A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getgroupcommand.cpp; sourceTree = SOURCE_ROOT; };
 		A70B53A50F4CD7AD0064797E /* getgroupcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getgroupcommand.h; sourceTree = SOURCE_ROOT; };
 		A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlabelcommand.cpp; sourceTree = SOURCE_ROOT; };
@@ -931,6 +934,8 @@
 				3792946F0F2E191800B9034A /* parsimonycommand.cpp */,
 				A773808E10B6EFD1002A6A61 /* phylotypecommand.h */,
 				A773808F10B6EFD1002A6A61 /* phylotypecommand.cpp */,
+				A7027F2610DFF86D00BF45FE /* preclustercommand.h */,
+				A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */,
 				37D927FE0F21331F001D4494 /* quitcommand.h */,
 				37D927FD0F21331F001D4494 /* quitcommand.cpp */,
 				37D928080F21331F001D4494 /* rarefactcommand.h */,
@@ -1299,6 +1304,7 @@
 				A773809010B6EFD1002A6A61 /* phylotypecommand.cpp in Sources */,
 				A727E84A10D14568001A8432 /* readblast.cpp in Sources */,
 				A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */,
+				A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
diff --git a/commandfactory.cpp b/commandfactory.cpp
index b361b66..d2b22d5 100644
--- a/commandfactory.cpp
+++ b/commandfactory.cpp
@@ -61,6 +61,7 @@
 #include "classifyseqscommand.h"
 #include "phylotypecommand.h"
 #include "mgclustercommand.h"
+#include "preclustercommand.h"
 
 /*******************************************************/
 
@@ -131,6 +132,7 @@ CommandFactory::CommandFactory(){
 	commands["classify.seqs"]		= "classify.seqs"; 
 	commands["phylotype"]			= "phylotype";
 	commands["mgcluster"]			= "mgcluster";
+	commands["pre.cluster"]			= "pre.cluster";
 
 }
 /***********************************************************/
@@ -199,6 +201,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){
 		else if(commandName == "classify.seqs")			{	command = new ClassifySeqsCommand(optionString);		}
 		else if(commandName == "phylotype")				{	command = new PhylotypeCommand(optionString);			}
 		else if(commandName == "mgcluster")				{	command = new MGClusterCommand(optionString);			}
+		else if(commandName == "pre.cluster")			{	command = new PreClusterCommand(optionString);			}
 		else											{	command = new NoCommand(optionString);					}
 
 		return command;
diff --git a/distancecommand.cpp b/distancecommand.cpp
index f8a2a75..3ac3969 100644
--- a/distancecommand.cpp
+++ b/distancecommand.cpp
@@ -278,7 +278,7 @@ int DistanceCommand::driver(int startLine, int endLine, string dFileName, float
 	}
 }
 
-/**************************************************************************************************/
+/**************************************************************************************************
 void DistanceCommand::appendFiles(string temp, string filename) {
 	try{
 		ofstream output;
diff --git a/distancecommand.h b/distancecommand.h
index 6a7ac93..542f428 100644
--- a/distancecommand.h
+++ b/distancecommand.h
@@ -43,7 +43,7 @@ private:
 	bool abort;
 	vector<string>  Estimators; //holds estimators to be used
 	
-	void appendFiles(string, string);
+	//void appendFiles(string, string);
 	void createProcesses(string);
 	int driver(/*Dist*, SequenceDB, */int, int, string, float);
 
diff --git a/hcluster.cpp b/hcluster.cpp
index 2d88533..4e8c994 100644
--- a/hcluster.cpp
+++ b/hcluster.cpp
@@ -12,11 +12,13 @@
 #include "listvector.hpp"
 #include "sparsematrix.hpp"
 
+
 /***********************************************************************/
 
-HCluster::HCluster(RAbundVector* rav, ListVector* lv) :  rabund(rav), list(lv){
+HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m) :  rabund(rav), list(lv), method(m){
 	try {
 		mapWanted = false;
+		exitedBreak = false; 
 		numSeqs = list->getNumSeqs();
 		
 		//initialize cluster array
@@ -263,27 +265,32 @@ void HCluster::update(int row, int col, float distance){
 		smallRow = row;
 		smallCol = col;
 		smallDist = distance;
-		bool clustered = false;
 		
 		//find upmost parent of row and col
 		smallRow = getUpmostParent(smallRow);
 		smallCol = getUpmostParent(smallCol);
 	//cout << "row = " << row << " smallRow = " << smallRow <<  " col = " << col << " smallCol = " << smallCol << " dist = " << distance << endl;
-		
-		//are they active in the link table
-		int linkValue = makeActive(); //after this point this nodes info is active in linkTable
-	//printInfo();			
-//cout << "linkValue = " << linkValue << " times = " << (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq) << endl;
-		//can we cluster???
-		if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {
-			//printInfo();
-			updateArrayandLinkTable();
-			clusterBins();
-			clusterNames();
-			clustered = true;
-			//printInfo();
+		//you don't want to cluster with yourself
+		if (smallRow != smallCol) {
+			//are they active in the link table
+			int linkValue = makeActive(); //after this point this nodes info is active in linkTable
+			//printInfo();			
+			//cout << "linkValue = " << linkValue << " times = " << (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq) << endl;
+			//can we cluster???
+			bool cluster = false;
+			
+			if (method == "nearest") { cluster = true;  }
+			else if (method == "average") { cout << "still working on this... " << endl; //got to figure this out 
+			}else{ //assume furthest
+				if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) { cluster = true; }
+			}
+			
+			if (cluster) { 
+				updateArrayandLinkTable();
+				clusterBins();
+				clusterNames();
+			}
 		}
-		
 		//printInfo();
 	}
 	catch(exception& e) {
@@ -340,6 +347,64 @@ try {
 		exit(1);
 	}
 }
+//**********************************************************************************************************************
+vector<seqDist> HCluster::getSeqs(ifstream& filehandle, NameAssignment* nameMap, float cutoff){
+	try {
+		string firstName, secondName;
+		float distance, prevDistance;
+		vector<seqDist> sameSeqs;
+		prevDistance = -1;
+	
+		//if you are not at the beginning of the file
+		if (exitedBreak) { 
+			sameSeqs.push_back(next);
+			prevDistance = next.dist;
+			exitedBreak = false;
+		}
+	
+		//get entry
+		while (!filehandle.eof()) {
+			
+			filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
+	
+			//save first one
+			if (prevDistance == -1) { prevDistance = distance; }
+			
+			map<string,int>::iterator itA = nameMap->find(firstName);
+			map<string,int>::iterator itB = nameMap->find(secondName);
+			if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
+			if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+		
+			//using cutoff
+			if (distance > cutoff) { break; }
+		
+			if (distance != -1) { //-1 means skip me
+			
+				//are the distances the same
+				if (distance == prevDistance) { //save in vector
+					seqDist temp(itA->second, itB->second, distance);
+					sameSeqs.push_back(temp);
+					exitedBreak = false;
+				}else{ 
+					next.seq1 = itA->second;
+					next.seq2 = itB->second;
+					next.dist = distance;
+					exitedBreak = true;
+					break;
+				}
+			}
+		}
+		
+		//rndomize matching dists
+		random_shuffle(sameSeqs.begin(), sameSeqs.end());
+		
+		return sameSeqs;
+	}
+	catch(exception& e) {
+		errorOut(e, "HCluster", "getSeqs");
+		exit(1);
+	}
+}
 /***********************************************************************/
 
 
diff --git a/hcluster.h b/hcluster.h
index dd4c679..065d559 100644
--- a/hcluster.h
+++ b/hcluster.h
@@ -12,6 +12,7 @@
 
 
 #include "mothur.h"
+#include "nameassignment.hpp"
 
 class RAbundVector;
 class ListVector;
@@ -20,11 +21,12 @@ class ListVector;
 class HCluster {
 	
 public:
-	HCluster(RAbundVector*, ListVector*);
+	HCluster(RAbundVector*, ListVector*, string);
 	~HCluster(){};
     void update(int, int, float);
 	void setMapWanted(bool m); 
 	map<string, int> getSeqtoBin()  {  return seq2Bin;	}
+	vector<seqDist> getSeqs(ifstream&, NameAssignment*, float);
 
 protected:	
 	void clusterBins();
@@ -49,7 +51,9 @@ protected:
 	int smallCol;
 	float smallDist;
 	map<string, int> seq2Bin;
-	bool mapWanted;
+	bool mapWanted, exitedBreak;
+	seqDist next;
+	string method;
 	
 };
 
diff --git a/hclustercommand.cpp b/hclustercommand.cpp
index 3c79f64..506ba54 100644
--- a/hclustercommand.cpp
+++ b/hclustercommand.cpp
@@ -76,7 +76,7 @@ HClusterCommand::HClusterCommand(string option){
 			cutoff += (5 / (precision * 10.0));
 			
 			method = validParameter.validFile(parameters, "method", false);
-			if (method == "not found") { method = "nearest"; }
+			if (method == "not found") { method = "furthest"; }
 			
 			if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
 			else { mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
@@ -96,7 +96,9 @@ HClusterCommand::HClusterCommand(string option){
 											
 				fileroot = getRootName(distfile);
 				
-				tag = "fn";  //until we figure out average and nearest methods
+				if (method == "furthest")		{ tag = "fn";  }
+				else if (method == "nearest")	{ tag = "nn";  }
+				else							{ tag = "an";  }
 			
 				openOutputFile(fileroot+ tag + ".sabund",	sabundFile);
 				openOutputFile(fileroot+ tag + ".rabund",	rabundFile);
@@ -119,7 +121,7 @@ void HClusterCommand::help(){
 		mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
 		mothurOut("The hcluster command should be in the following format: \n");
 		mothurOut("hcluster(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
-		mothurOut("The acceptable hcluster methods is furthest, but we hope to add nearest and average in the future.\n\n");	
+		mothurOut("The acceptable hcluster methods are furthest and nearest, but we hope to add average in the future.\n\n");	
 	}
 	catch(exception& e) {
 		errorOut(e, "HClusterCommand", "help");
@@ -182,17 +184,13 @@ int HClusterCommand::execute(){
 		string firstName, secondName;
 		float distance;
 		
-		cluster = new HCluster(rabund, list);
+		cluster = new HCluster(rabund, list, method);
 		vector<seqDist> seqs; seqs.resize(1); // to start loop
-		exitedBreak = false;  //lets you know if there is a distance stored in next
-	
-		while (seqs.size() != 0){
 		
-			seqs = getSeqs(in);
-			random_shuffle(seqs.begin(), seqs.end());
-			
-			if (seqs.size() == 0) { break; } //there are no more distances
+		while (seqs.size() != 0){
 		
+			seqs = cluster->getSeqs(in, globaldata->nameMap, cutoff);
+				
 			for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
 
 				if (print_start && isTrue(timing)) {
@@ -203,14 +201,12 @@ int HClusterCommand::execute(){
 					print_start = false;
 				}
 				
-	//cout << "before cluster update" << endl;
+	
 				if (seqs[i].seq1 != seqs[i].seq2) {
 					cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
 					
 					float rndDist = roundDist(seqs[i].dist, precision);
-		//cout << "after cluster update clusterSomething = " << clusteredSomething << " rndDist = " << rndDist << " rndPreviousDist = " << rndPreviousDist << endl;			
-					
-					
+							
 					if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
 						printData("unique");
 					}
@@ -256,11 +252,10 @@ int HClusterCommand::execute(){
 		sabundFile.close();
 		rabundFile.close();
 		listFile.close();
-		
 		delete cluster;
-		//if (isTrue(timing)) {
-			mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster. "); mothurOutEndLine();
-		//}
+	
+		mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster. "); mothurOutEndLine();
+		
 		return 0;
 	}
 	catch(exception& e) {
@@ -299,76 +294,4 @@ void HClusterCommand::printData(string label){
 
 }
 //**********************************************************************************************************************
-vector<seqDist> HClusterCommand::getSeqs(ifstream& filehandle){
-	try {
-		string firstName, secondName;
-		float distance, prevDistance;
-		vector<seqDist> sameSeqs;
-		prevDistance = -1;
-		
-		//if you are not at the beginning of the file
-		if (exitedBreak) { 
-			sameSeqs.push_back(next);
-			prevDistance = next.dist;
-			exitedBreak = false;
-		}
-	
-		//get entry
-		while (filehandle) {
-			
-			filehandle >> firstName >> secondName >> distance;  
-//cout << firstName << '\t' << secondName << '\t' << distance << endl;
-			gobble(filehandle);
-			
-			//save first one
-			if (prevDistance == -1) { prevDistance = distance; }
-	//cout << prevDistance << endl;	
-//if (globaldata->nameMap == NULL) { cout << "null" << endl; }
-			map<string,int>::iterator itA = globaldata->nameMap->find(firstName);
-			map<string,int>::iterator itB = globaldata->nameMap->find(secondName);
-			
-			if(itA == globaldata->nameMap->end()){
-				cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);
-			}
-			if(itB == globaldata->nameMap->end()){
-				cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);
-			}
-	//cout << "here" << endl;		
-			//using cutoff
-			if (distance > cutoff) { break; }
-			
-			if (distance != -1) { //-1 means skip me
-				
-				//are the distances the same
-				if (distance == prevDistance) { //save in vector
-					seqDist temp;
-					temp.seq1 = itA->second;
-					temp.seq2 = itB->second;
-					temp.dist = distance;
-					sameSeqs.push_back(temp);
-					exitedBreak = false;
-					//what about precision??
-					
-				}else{ 
-					next.seq1 = itA->second;
-					next.seq2 = itB->second;
-					next.dist = distance;
-					exitedBreak = true;
-					break;
-				}
-				
-			}
-		}
-
-		return sameSeqs;
-	}
-	catch(exception& e) {
-		errorOut(e, "HClusterCommand", "getSeqs");
-		exit(1);
-	}
-
-
-}
-
-//**********************************************************************************************************************
 
diff --git a/hclustercommand.h b/hclustercommand.h
index 1dd46b9..4c8fa9c 100644
--- a/hclustercommand.h
+++ b/hclustercommand.h
@@ -29,12 +29,6 @@
 //Applications Center, 1030 S. Highway A1A, Patrick AFB, FL 32925-3002, USA 
 //Received January 28, 2009; Revised April 14, 2009; Accepted April 15, 2009 
 /************************************************************/
-struct seqDist {
-	int seq1;
-	int seq2;
-	float dist;
-};
-/************************************************************/
 class HClusterCommand : public Command {
 	
 public:
@@ -52,23 +46,15 @@ private:
 	ListVector oldList;
 	ReadCluster* read;
 	
-	bool abort;
-
-	string method, fileroot, tag, distfile, format, phylipfile, columnfile, namefile, sort;
+	bool abort, sorted, print_start;
+	string method, fileroot, tag, distfile, format, phylipfile, columnfile, namefile, sort, showabund, timing;
 	double cutoff;
-	string showabund, timing;
 	int precision, length;
 	ofstream sabundFile, rabundFile, listFile;
-	//ifstream in;
-	seqDist next;
-	bool exitedBreak, sorted;
-
-	bool print_start;
 	time_t start;
 	unsigned long loops;
 	
 	void printData(string label);
-	vector<seqDist> getSeqs(ifstream&);
 };
 
 /************************************************************/
diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp
index d5b84a8..07e3776 100644
--- a/mgclustercommand.cpp
+++ b/mgclustercommand.cpp
@@ -8,7 +8,6 @@
  */
 
 #include "mgclustercommand.h"
-#include "readcluster.h"
 
 //**********************************************************************************************************************
 MGClusterCommand::MGClusterCommand(string option){
@@ -138,6 +137,15 @@ int MGClusterCommand::execute(){
 		start = time(NULL);
 		oldList = *list;
 		
+		if (method == "furthest")		{ tag = "fn";  }
+		else if (method == "nearest")	{ tag = "nn";  }
+		else							{ tag = "an";  }
+		
+		//open output files
+		openOutputFile(fileroot+ tag + ".list",  listFile);
+		openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
+		openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
+		
 		if (!hclusterWanted) {
 			//get distmatrix and overlap
 			SparseMatrix* distMatrix = read->getDistMatrix();
@@ -148,14 +156,8 @@ int MGClusterCommand::execute(){
 			if (method == "furthest")	{	cluster = new CompleteLinkage(rabund, list, distMatrix); }
 			else if(method == "nearest"){	cluster = new SingleLinkage(rabund, list, distMatrix); }
 			else if(method == "average"){	cluster = new AverageLinkage(rabund, list, distMatrix);	}
-			tag = cluster->getTag();
 			cluster->setMapWanted(true);
 			
-			//open output files
-			openOutputFile(fileroot+ tag + ".list",  listFile);
-			openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
-			openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
-			
 			//cluster using cluster classes
 			while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
 				
@@ -218,24 +220,16 @@ int MGClusterCommand::execute(){
 			sortHclusterFiles(distFile, overlapFile);
 		
 			//create cluster
-			hcluster = new HCluster(rabund, list);
+			hcluster = new HCluster(rabund, list, method);
 			hcluster->setMapWanted(true);
 			
-			//open output files
-			openOutputFile(fileroot+ tag + ".list",  listFile);
-			openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
-			openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
-			
-			vector<DistNode> seqs; seqs.resize(1); // to start loop
-			exitedBreak = false;  //lets you know if there is a distance stored in next
-			
+			vector<seqDist> seqs; seqs.resize(1); // to start loop
 			ifstream inHcluster;
 			openInputFile(distFile, inHcluster);
 
 			while (seqs.size() != 0){
 		
-				seqs = getSeqs(inHcluster);
-				random_shuffle(seqs.begin(), seqs.end());
+				seqs = hcluster->getSeqs(inHcluster, nameMap, cutoff);
 				
 				for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
 					
@@ -345,7 +339,7 @@ ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
 		while (!done) {
 			
 			//get next overlap
-			DistNode overlapNode;
+			seqDist overlapNode;
 			if (!hclusterWanted) {  
 				if (count < overlapMatrix.size()) { //do we have another node in the matrix
 					overlapNode = overlapMatrix[count];
@@ -415,89 +409,21 @@ ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
 	try {
 		//sort distFile
-		ReadCluster* readCluster = new ReadCluster(unsortedDist, cutoff); 	
-		readCluster->setFormat("column");
-		readCluster->read(nameMap);
-		string sortedDistFile = readCluster->getOutputFile();
-		ListVector* extralist = readCluster->getListVector();  //we get this so we can delete it and not have a memory leak
-		
+		string sortedDistFile = sortFile(unsortedDist);
 		remove(unsortedDist.c_str());  //delete unsorted file
 		distFile = sortedDistFile;
-		delete extralist;
-		delete readCluster;
 		
 		//sort overlap file
-		readCluster = new ReadCluster(unsortedOverlap, cutoff); 	
-		readCluster->setFormat("column");
-		readCluster->read(nameMap);
-		string sortedOverlapFile = readCluster->getOutputFile();
-		extralist = readCluster->getListVector();  //we get this so we can delete it and not have a memory leak
-		
+		string sortedOverlapFile = sortFile(unsortedOverlap);
 		remove(unsortedOverlap.c_str());  //delete unsorted file
 		overlapFile = sortedOverlapFile;
-		delete extralist;
-		delete readCluster;
 	}
 	catch(exception& e) {
 		errorOut(e, "MGClusterCommand", "sortHclusterFiles");
 		exit(1);
 	}
 }
-//**********************************************************************************************************************
-vector<DistNode> MGClusterCommand::getSeqs(ifstream& filehandle){
-	try {
-		string firstName, secondName;
-		float distance, prevDistance;
-		vector<DistNode> sameSeqs;
-		prevDistance = -1;
-	
-		//if you are not at the beginning of the file
-		if (exitedBreak) { 
-			sameSeqs.push_back(next);
-			prevDistance = next.dist;
-			exitedBreak = false;
-		}
-	
-		//get entry
-		while (!filehandle.eof()) {
-			
-			filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
-	
-			//save first one
-			if (prevDistance == -1) { prevDistance = distance; }
-			
-			map<string,int>::iterator itA = nameMap->find(firstName);
-			map<string,int>::iterator itB = nameMap->find(secondName);
-			if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
-			if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
-		
-			//using cutoff
-			if (distance > cutoff) { break; }
-		
-			if (distance != -1) { //-1 means skip me
-			
-				//are the distances the same
-				if (distance == prevDistance) { //save in vector
-					DistNode temp(itA->second, itB->second, distance);
-					sameSeqs.push_back(temp);
-					exitedBreak = false;
-				}else{ 
-					next.seq1 = itA->second;
-					next.seq2 = itB->second;
-					next.dist = distance;
-					exitedBreak = true;
-					break;
-				}
-			}
-		}
-		
-		return sameSeqs;
-	}
-	catch(exception& e) {
-		errorOut(e, "MGClusterCommand", "getSeqs");
-		exit(1);
-	}
-}
+
 //**********************************************************************************************************************
 
 
diff --git a/mgclustercommand.h b/mgclustercommand.h
index 9aa86d4..2ac80c9 100644
--- a/mgclustercommand.h
+++ b/mgclustercommand.h
@@ -36,20 +36,18 @@ private:
 	HCluster* hcluster;
 	ListVector* list;
 	ListVector oldList;
-	vector<DistNode> overlapMatrix;
-	DistNode next;
-
+	vector<seqDist> overlapMatrix;
 	
 	string blastfile, method, namefile, overlapFile, distFile;
 	ofstream sabundFile, rabundFile, listFile;
 	float cutoff, penalty;
 	int precision, length, precisionLength;
-	bool abort, minWanted, hclusterWanted, exitedBreak, merge;
+	bool abort, minWanted, hclusterWanted, merge;
 	
 	void printData(ListVector*);
 	ListVector* mergeOPFs(map<string, int>, float);
 	void sortHclusterFiles(string, string);
-	vector<DistNode> getSeqs(ifstream&);
+	vector<seqDist> getSeqs(ifstream&);
 
 };
 
diff --git a/mothur.h b/mothur.h
index f839328..cf6ae69 100644
--- a/mothur.h
+++ b/mothur.h
@@ -100,7 +100,15 @@ struct clusterNode {
 	int smallChild; //used to make linkTable work with list and rabund. represents bin number of this cluster node
 	clusterNode(int num, int par, int kid) : numSeq(num), parent(par), smallChild(kid) {};
 };
-
+/************************************************************/
+struct seqDist {
+	int seq1;
+	int seq2;
+	float dist;
+	seqDist() {}
+	seqDist(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {}
+	~seqDist() {}
+};
 /***********************************************************************/
 
 // snagged from http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.2
@@ -775,6 +783,92 @@ inline bool anyLabelsToProcess(string label, set<string>& userLabels, string err
 	}
 }
 
+/**************************************************************************************************/
+inline void appendFiles(string temp, string filename) {
+	try{
+		ofstream output;
+		ifstream input;
+	
+		//open output file in append mode
+		openOutputFileAppend(filename, output);
+		openInputFile(temp, input);
+		
+		while(char c = input.get()){
+			if(input.eof())		{	break;			}
+			else				{	output << c;	}
+		}
+		
+		input.close();
+		output.close();
+	}
+	catch(exception& e) {
+		errorOut(e, "mothur", "appendFiles");
+		exit(1);
+	}
+}
+
+/**************************************************************************************************/
+inline string sortFile(string distFile){
+	try {	
+		string outfile = getRootName(distFile) + "sorted.dist";
+		
+		//if you can, use the unix sort since its been optimized for years
+		#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+			string command = "sort -n -k +3 " + distFile + " -o " + outfile;
+			system(command.c_str());
+		#else //you are stuck with my best attempt...
+			//windows sort does not have a way to specify a column, only a character in the line
+			//since we cannot assume that the distance will always be at the the same character location on each line
+			//due to variable sequence name lengths, I chose to force the distance into first position, then sort and then put it back.
+		
+			//read in file line by file and put distance first
+			string tempDistFile = distFile + ".temp";
+			ifstream input;
+			ofstream output;
+			openInputFile(distFile, input);
+			openOutputFile(tempDistFile, output);
+
+			string firstName, secondName;
+			float dist;
+			while (input) {
+				input >> firstName >> secondName >> dist;
+				output << dist << '\t' << firstName << '\t' << secondName << endl;
+				gobble(input);
+			}
+			input.close();
+			output.close();
+		
+	
+			//sort using windows sort
+			string tempOutfile = outfile + ".temp";
+			string command = "sort " + tempDistFile + " /O " + tempOutfile;
+			system(command.c_str());
+		
+			//read in sorted file and put distance at end again
+			ifstream input2;
+			openInputFile(tempOutfile, input2);
+			openOutputFile(outfile, output);
+		
+			while (input2) {
+				input2 >> dist >> firstName >> secondName;
+				output << firstName << '\t' << secondName << '\t' << dist << endl;
+				gobble(input2);
+			}
+			input2.close();
+			output.close();
+		
+			//remove temp files
+			remove(tempDistFile.c_str());
+			remove(tempOutfile.c_str());
+		#endif
+		
+		return outfile;
+	}
+	catch(exception& e) {
+		errorOut(e, "mothur", "sortFile");
+		exit(1);
+	}
+}
 /**************************************************************************************************/
 #endif
 
diff --git a/preclustercommand.cpp b/preclustercommand.cpp
new file mode 100644
index 0000000..479a862
--- /dev/null
+++ b/preclustercommand.cpp
@@ -0,0 +1,269 @@
+/*
+ *  preclustercommand.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 12/21/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "preclustercommand.h"
+
+//**********************************************************************************************************************
+inline bool comparePriority(seqPNode first, seqPNode second) {  return (first.numIdentical > second.numIdentical); }
+//**********************************************************************************************************************
+
+PreClusterCommand::PreClusterCommand(string option){
+	try {
+		abort = false;
+		
+		//allow user to run help
+		if(option == "help") { help(); abort = true; }
+		
+		else {
+			//valid paramters for this command
+			string Array[] =  {"fasta", "name", "diffs"};
+			vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+			
+			OptionParser parser(option);
+			map<string, string> parameters = parser.getParameters();
+			
+			ValidParameters validParameter;
+		
+			//check to make sure all parameters are valid for command
+			for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
+				if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
+			}
+			
+			//check for required parameters
+			fastafile = validParameter.validFile(parameters, "fasta", true);
+			if (fastafile == "not found") { mothurOut("fasta is a required parameter for the pre.cluster command."); mothurOutEndLine(); abort = true; }
+			else if (fastafile == "not open") { abort = true; }	
+			
+			//check for optional parameter and set defaults
+			// ...at some point should added some additional type checking...
+			namefile = validParameter.validFile(parameters, "name", true);
+			if (namefile == "not found") { namefile =  "";  }
+			else if (namefile == "not open") { abort = true; }	
+			else {  readNameFile();  }
+			
+			string temp	= validParameter.validFile(parameters, "diffs", false);				if(temp == "not found"){	temp = "1"; }
+			convert(temp, diffs); 
+		}
+				
+	}
+	catch(exception& e) {
+		errorOut(e, "PreClusterCommand", "PreClusterCommand");
+		exit(1);
+	}
+}
+
+//**********************************************************************************************************************
+PreClusterCommand::~PreClusterCommand(){}	
+//**********************************************************************************************************************
+
+void PreClusterCommand::help(){
+	try {
+		mothurOut("The pre.cluster command groups sequences that are within a given number of base mismatches.\n");
+		mothurOut("The pre.cluster command outputs a new fasta and name file.\n");
+		mothurOut("The pre.cluster command parameters are fasta, names and diffs. The fasta parameter is required. \n");
+		mothurOut("The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n");
+		mothurOut("The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n");
+		mothurOut("The pre.cluster command should be in the following format: \n");
+		mothurOut("pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n");
+		mothurOut("Example pre.cluster(fasta=amazon.fasta, diffs=2).\n");
+		mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
+	}
+	catch(exception& e) {
+		errorOut(e, "PreClusterCommand", "help");
+		exit(1);
+	}
+}
+//**********************************************************************************************************************
+
+int PreClusterCommand::execute(){
+	try {
+		
+		if (abort == true) { return 0; }
+		
+		//reads fasta file and return number of seqs
+		int numSeqs = readSeqs(); //fills alignSeqs and makes all seqs active
+	
+		if (numSeqs == 0) { mothurOut("Error reading fasta file...please correct."); mothurOutEndLine(); return 0;  }
+		if (diffs > length) { mothurOut("Error: diffs is greater than your sequence length."); mothurOutEndLine(); return 0;  }
+		
+		//clear sizes since you only needed this info to build the alignSeqs seqPNode structs
+		sizes.clear();
+	
+		//sort seqs by number of identical seqs
+		sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
+	
+		//go through active list and cluster everthing you can, until all nodes are inactive
+		//taking advantage of the fact that maps are already sorted
+		map<string, bool>::iterator itActive;
+		map<string, bool>::iterator it2Active;
+		int count = 0;
+		
+		for (int i = 0; i < alignSeqs.size(); i++) {
+		
+			//are you active
+			itActive = active.find(alignSeqs[i].seq.getName());
+			
+			if (itActive != active.end()) {  //this sequence has not been merged yet
+			
+				//try to merge it with all smaller seqs
+				for (int j = i; j < alignSeqs.size(); j++) {
+					
+					if (i != j) {
+						//are you active
+						it2Active = active.find(alignSeqs[j].seq.getName());
+						if (it2Active != active.end()) {  //this sequence has not been merged yet
+							//are you within "diff" bases
+							int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
+							
+							if (mismatch <= diffs) {
+								//merge
+								names[alignSeqs[i].seq.getName()] += "," + names[alignSeqs[j].seq.getName()];
+							
+								//remove from active list
+								active.erase(it2Active);
+								
+								//set numIdentical to 0, so you only print the representative seqs in the fasta file
+								alignSeqs[j].numIdentical = 0;
+								count++;
+							}
+						}//end if j active
+					}//end if i != j
+				}//end for loop
+				
+				//remove from active list 
+				active.erase(itActive);
+			}//end if active i
+		}
+	
+		string newFastaFile = getRootName(fastafile) + "precluster" + getExtension(fastafile);
+		string newNamesFile = getRootName(fastafile) + "precluster.names";
+		
+		printData(newFastaFile, newNamesFile);
+		
+		mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); mothurOutEndLine();
+		mothurOut("pre.cluster removed " + toString(count) + " sequences."); mothurOutEndLine(); 
+		
+		return 0;
+		
+	}
+	catch(exception& e) {
+		errorOut(e, "PreClusterCommand", "execute");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+int PreClusterCommand::readSeqs(){
+	try {
+		ifstream inFasta;
+		openInputFile(fastafile, inFasta);
+		length = 0;
+		map<string, string>::iterator it;
+				
+		while (!inFasta.eof()) {
+			Sequence temp(inFasta);  //read seq
+			
+			if (temp.getName() != "") {  
+				if (namefile != "") {
+					//make sure fasta and name files match
+					it = names.find(temp.getName());
+					if (it == names.end()) { mothurOut(temp.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); }
+				}else { sizes[temp.getName()] = 1; }
+				
+				seqPNode tempNode(sizes[temp.getName()], temp);
+				alignSeqs.push_back(tempNode); 
+				active[temp.getName()] = true;
+			}
+			gobble(inFasta);
+		}
+		inFasta.close();
+		
+		if (alignSeqs.size() != 0) {  length = alignSeqs[0].seq.getAligned().length();  }
+		
+		return alignSeqs.size();
+	}
+	catch(exception& e) {
+		errorOut(e, "PreClusterCommand", "readSeqs");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+void PreClusterCommand::readNameFile(){
+	try {
+		ifstream in;
+		openInputFile(namefile, in);
+		string firstCol, secondCol;
+				
+		while (!in.eof()) {
+			in >> firstCol >> secondCol; gobble(in);
+			names[firstCol] = secondCol;
+			int size = 1;
+			while (secondCol.find_first_of(',') != -1) { 
+				size++;
+				secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length());
+			}
+			sizes[firstCol] = size;
+		}
+		in.close();
+	}
+	catch(exception& e) {
+		errorOut(e, "PreClusterCommand", "readNameFile");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+int PreClusterCommand::calcMisMatches(string seq1, string seq2){
+	try {
+		int numBad = 0;
+		
+		for (int i = 0; i < seq1.length(); i++) {
+			//do they match
+			if (seq1[i] != seq2[i]) { numBad++; }
+			if (numBad > diffs) { return length;  } //to far to cluster
+		}
+		
+		return numBad;
+	}
+	catch(exception& e) {
+		errorOut(e, "PreClusterCommand", "calcMisMatches");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+void PreClusterCommand::printData(string newfasta, string newname){
+	try {
+		ofstream outFasta;
+		ofstream outNames;
+		openOutputFile(newfasta, outFasta);
+		openOutputFile(newname, outNames);
+				
+		map<string, string>::iterator itNames;
+		
+		for (int i = 0; i < alignSeqs.size(); i++) {
+			if (alignSeqs[i].numIdentical != 0) {
+				alignSeqs[i].seq.printSequence(outFasta); 
+				
+				itNames = names.find(alignSeqs[i].seq.getName());
+				
+				outNames << itNames->first << '\t' << itNames->second << endl;
+			}
+		}
+		
+		outFasta.close();
+		outNames.close();
+		
+	}
+	catch(exception& e) {
+		errorOut(e, "PreClusterCommand", "printData");
+		exit(1);
+	}
+}
+
+/**************************************************************************************************/
+
+
diff --git a/preclustercommand.h b/preclustercommand.h
new file mode 100644
index 0000000..bb666cd
--- /dev/null
+++ b/preclustercommand.h
@@ -0,0 +1,59 @@
+#ifndef PRECLUSTERCOMMAND_H
+#define PRECLUSTERCOMMAND_H
+
+
+/*
+ *  preclustercommand.h
+ *  Mothur
+ *
+ *  Created by westcott on 12/21/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "command.hpp"
+#include "sequence.hpp"
+
+/************************************************************/
+struct seqPNode {
+	int numIdentical;
+	Sequence seq;
+	seqPNode() {}
+	seqPNode(int s, Sequence q) : numIdentical(s), seq(q) {}
+	~seqPNode() {}
+};
+/************************************************************/
+
+class PreClusterCommand : public Command {
+	
+public:
+	PreClusterCommand(string);	
+	~PreClusterCommand();
+	int execute();	
+	void help();
+	
+private:
+	int diffs, length;
+	bool abort;
+	string fastafile, namefile;
+	vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
+	map<string, string> names; //represents the names file first column maps to second column
+	map<string, int> sizes;  //this map a seq name to the number of identical seqs in the names file
+	map<string, bool> active; //maps sequence name to whether it has already been merged or not.
+	
+	int readSeqs();
+	int calcMisMatches(string, string);
+	void readNameFile();
+	void printData(string, string); //fasta filename, names file name
+};
+
+/************************************************************/
+
+
+
+
+
+#endif
+
+
diff --git a/readblast.cpp b/readblast.cpp
index 7a092f9..a6d23e9 100644
--- a/readblast.cpp
+++ b/readblast.cpp
@@ -12,7 +12,7 @@
 
 //********************************************************************************************************************
 //sorts lowest to highest
-inline bool compareOverlap(DistNode left, DistNode right){
+inline bool compareOverlap(seqDist left, seqDist right){
 	return (left.dist < right.dist);	
 } 
 /*********************************************************************************************/
@@ -91,7 +91,7 @@ void ReadBlast::read(NameAssignment* nameMap) {
 				//if there is a valid overlap, add it
 				if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
 					if (!hclusterWanted) {
-						DistNode overlapValue(itA->second, itB->second, thisoverlap);
+						seqDist overlapValue(itA->second, itB->second, thisoverlap);
 						overlap.push_back(overlapValue);
 					}else {
 						outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
@@ -137,7 +137,7 @@ void ReadBlast::read(NameAssignment* nameMap) {
 						//if there is a valid overlap, add it
 						if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
 							if (!hclusterWanted) {
-								DistNode overlapValue(itA->second, itB->second, thisoverlap);
+								seqDist overlapValue(itA->second, itB->second, thisoverlap);
 								//cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
 								overlap.push_back(overlapValue);
 							}else {
@@ -201,7 +201,7 @@ void ReadBlast::read(NameAssignment* nameMap) {
 						//if there is a valid overlap, add it
 						if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
 							if (!hclusterWanted) {
-								DistNode overlapValue(itA->second, itB->second, thisoverlap);
+								seqDist overlapValue(itA->second, itB->second, thisoverlap);
 								overlap.push_back(overlapValue);
 							}else {
 								outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
diff --git a/readblast.h b/readblast.h
index 0b47b59..128c144 100644
--- a/readblast.h
+++ b/readblast.h
@@ -13,15 +13,6 @@
 #include "sparsematrix.hpp"
 #include "nameassignment.hpp"
 
-/****************************************************************************************/
-struct DistNode {
-	int seq1;
-	int seq2;
-	float dist;
-	DistNode(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {}
-	DistNode() {}
-	~DistNode() {}
-};
 /****************************************************************************************/
 
 //Note: this class creates a sparsematrix and list if the read is executed, but does not delete them on deconstruction.
@@ -36,7 +27,7 @@ public:
 	
 	void read(NameAssignment*);
 	SparseMatrix* getDistMatrix()		{	return matrix;		}
-	vector<DistNode> getOverlapMatrix()	{	return overlap;		}
+	vector<seqDist> getOverlapMatrix()	{	return overlap;		}
 	string getOverlapFile()				{	return overlapFile;	}
 	string getDistFile()				{	return distFile;	}
 	
@@ -48,7 +39,7 @@ private:
 	bool hclusterWanted;
 	
 	SparseMatrix* matrix;
-	vector<DistNode> overlap;
+	vector<seqDist> overlap;
 	
 	void readNames(NameAssignment*);
 };
diff --git a/readcluster.cpp b/readcluster.cpp
index 233b8e1..11b8d01 100644
--- a/readcluster.cpp
+++ b/readcluster.cpp
@@ -25,7 +25,7 @@ void ReadCluster::read(NameAssignment* nameMap){
 		if (format == "phylip") { convertPhylip2Column(nameMap); }
 		else { list = new ListVector(nameMap->getListVector());  }
 		
-		createHClusterFile();
+		OutPutFile = sortFile(distFile);
 			
 	}
 	catch(exception& e) {
@@ -35,71 +35,6 @@ void ReadCluster::read(NameAssignment* nameMap){
 }
 /***********************************************************************/
 
-void ReadCluster::createHClusterFile(){
-	try {	
-		string outfile = getRootName(distFile) + "sorted.dist";
-		
-		//if you can, use the unix sort since its been optimized for years
-		#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-			string command = "sort -n -k +3 " + distFile + " -o " + outfile;
-			system(command.c_str());
-		#else //you are stuck with my best attempt...
-			//windows sort does not have a way to specify a column, only a character in the line
-			//since we cannot assume that the distance will always be at the the same character location on each line
-			//due to variable sequence name lengths, I chose to force the distance into first position, then sort and then put it back.
-		
-			//read in file line by file and put distance first
-			string tempDistFile = distFile + ".temp";
-			ifstream input;
-			ofstream output;
-			openInputFile(distFile, input);
-			openOutputFile(tempDistFile, output);
-
-			string firstName, secondName;
-			float dist;
-			while (input) {
-				input >> firstName >> secondName >> dist;
-				output << dist << '\t' << firstName << '\t' << secondName << endl;
-				gobble(input);
-			}
-			input.close();
-			output.close();
-		
-	
-			//sort using windows sort
-			string tempOutfile = outfile + ".temp";
-			string command = "sort " + tempDistFile + " /O " + tempOutfile;
-			system(command.c_str());
-		
-			//read in sorted file and put distance at end again
-			ifstream input2;
-			openInputFile(tempOutfile, input2);
-			openOutputFile(outfile, output);
-		
-			while (input2) {
-				input2 >> dist >> firstName >> secondName;
-				output << firstName << '\t' << secondName << '\t' << dist << endl;
-				gobble(input2);
-			}
-			input2.close();
-			output.close();
-		
-			//remove temp files
-			remove(tempDistFile.c_str());
-			remove(tempOutfile.c_str());
-		#endif
-		
-		OutPutFile = outfile;
-	}
-	catch(exception& e) {
-		errorOut(e, "ReadCluster", "createHClusterFile");
-		exit(1);
-	}
-}
-
-
-/***********************************************************************/
-
 void ReadCluster::convertPhylip2Column(NameAssignment* nameMap){
 	try {	
 		//convert phylip file to column file
diff --git a/readcluster.h b/readcluster.h
index 9e62e92..bcd910c 100644
--- a/readcluster.h
+++ b/readcluster.h
@@ -27,7 +27,6 @@ public:
 	string getOutputFile() { return OutPutFile; }
 	void setFormat(string f) { format = f;	}
 	ListVector* getListVector()		{	return list;	}
-	//NameAssignment* getNameMap()	{	return nameMap;	}
 	
 private:
 	GlobalData* globaldata;
@@ -35,9 +34,7 @@ private:
 	string OutPutFile, format;
 	ListVector* list;
 	float cutoff;
-	//NameAssignment* nameMap;
 	
-	void createHClusterFile();
 	void convertPhylip2Column(NameAssignment*);
 };
 
-- 
2.39.2