From: westcott <westcott>
Date: Thu, 30 Jul 2009 12:03:32 +0000 (+0000)
Subject: Thallinger changes to cluster command.
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=1b9d0a66e4737f31d16824fe93944880b9edc530;p=mothur.git

Thallinger changes to cluster command.
---

diff --git a/averagelinkage.cpp b/averagelinkage.cpp
index 26283ff..7a4cb88 100644
--- a/averagelinkage.cpp
+++ b/averagelinkage.cpp
@@ -12,68 +12,44 @@
 /***********************************************************************/
 
 AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) :
-Cluster(rav, lv, dm)
-{}
+	Cluster(rav, lv, dm)
+{
+	saveRow = -1;
+	saveCol = -1;
+}
+
+
+/***********************************************************************/
+//This function returns the tag of the method.
+string AverageLinkage::getTag() {
+	return("an");
+}
+
 
 /***********************************************************************/
-//THis function clusters based on the average method
-void AverageLinkage::update(){
-	try{
-		getRowColCells();		
-	
-		vector<int> found(nColCells, 0);
-	
-		int rowBin = rabund->get(smallRow);
-		int colBin = rabund->get(smallCol);
-		int totalBin = rowBin + colBin;
-	
-		for(int i=1;i<nRowCells;i++){
-		
-			int search;
-		
-			if(rowCells[i]->row == smallRow){
-				search = rowCells[i]->column;
-			}
-			else{
-				search = rowCells[i]->row;
-			}
-		
-			for(int j=1;j<nColCells;j++){
-			
-				if(colCells[j]->row == search || colCells[j]->column == search){
-					colCells[j]->dist = (colBin * colCells[j]->dist + rowBin * rowCells[i]->dist) / totalBin;		
-				
-					found[j] = 1;
-				
-					if(colCells[j]->vectorMap != NULL){
-						*(colCells[j]->vectorMap) = NULL;
-						colCells[j]->vectorMap = NULL;
-					}
-				
-					break;
-				}
-			
-			}			
-			dMatrix->rmCell(rowCells[i]);
-		}	
-	
-		clusterBins();
-		clusterNames();
-	
-		for(int i=0;i<nColCells;i++){
-			if(found[i] == 0){
-				dMatrix->rmCell(colCells[i]);
-			}
+//This function updates the distance based on the average linkage method.
+bool AverageLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
+	try {
+		if ((saveRow != smallRow) || (saveCol != smallCol)) {
+			rowBin = rabund->get(smallRow);
+			colBin = rabund->get(smallCol);
+			totalBin = rowBin + colBin;
+			saveRow = smallRow;
+			saveCol = smallCol;
 		}
+
+		colCell->dist = (colBin * colCell->dist + rowBin * rowCell->dist) / totalBin;
+		return(true);
 	}
 	catch(exception& e) {
-		errorOut(e, "AverageLinkage", "update");
+		errorOut(e, "AverageLinkage", "updateDistance");
 		exit(1);
 	}
 }
 
 /***********************************************************************/
 
-#endif
 
+/***********************************************************************/
 
+#endif
diff --git a/cluster.cpp b/cluster.cpp
index b9754be..6fd4631 100644
--- a/cluster.cpp
+++ b/cluster.cpp
@@ -17,41 +17,58 @@
 Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) :
 rabund(rav), list(lv), dMatrix(dm)
 {
+/*
+	cout << "sizeof(MatData): " << sizeof(MatData) << endl;
+	cout << "sizeof(PCell*): " << sizeof(PCell*) << endl;
+
+	int nCells = dMatrix->getNNodes();
+	time_t start = time(NULL);
+
+	MatVec matvec = MatVec(nCells); 
+	int i = 0;
+	for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
+		matvec[i++] = currentCell;
+	}
+	for (i= matvec.size();i>0;i--) {
+		dMatrix->rmCell(matvec[i-1]);
+	}
+	MatData it = dMatrix->begin(); 
+	while (it != dMatrix->end()) { 
+		it = dMatrix->rmCell(it);
+	}
+	cout << "Time to remove " << nCells << " cells: " << time(NULL) - start << " seconds" << endl;
+    exit(0);
+	MatData it = dMatrix->begin();
+	cout << it->row << "/" << it->column << "/" << it->dist << endl;
+	dMatrix->rmCell(dMatrix->begin());
+	cout << it->row << "/" << it->column << "/" << it->dist << endl;
+	exit(0);
+*/
+
+	// Create a data structure to quickly access the PCell information
+	// for a certain sequence. It consists of a vector of lists, where 
+	// a list contains pointers (iterators) to the all distances related
+	// to a certain sequence. The Vector is accessed via the index of a 
+	// sequence in the distance matrix.
+	seqVec = vector<MatVec>(lv->size());
+	for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
+		seqVec[currentCell->row].push_back(currentCell);
+		seqVec[currentCell->column].push_back(currentCell);
+	}
 }
 
 /***********************************************************************/
 
-void Cluster::getRowColCells(){
+void Cluster::getRowColCells() {
 	try {
 		PCell* smallCell = dMatrix->getSmallestCell();	//find the smallest cell - this routine should probably not be in the SpMat class
 	
-		smallRow = smallCell->row;		//get its row
-		smallCol = smallCell->column;	//get its column
-		smallDist = smallCell->dist;	//get the smallest distance
-	
-		rowCells.clear();
-		colCells.clear();
-		
-		for(MatData currentCell=dMatrix->begin();currentCell!=dMatrix->end();currentCell++){
-		
-			if(&*currentCell == smallCell){				//put the smallest cell first
-				rowCells.insert(rowCells.begin(), currentCell);
-				colCells.insert(colCells.begin(), currentCell);
-			}
-			else if(currentCell->row == smallRow){
-				rowCells.push_back(currentCell);
-			}
-			else if(currentCell->column == smallRow){
-				rowCells.push_back(currentCell);
-			}
-			else if(currentCell->row == smallCol){
-				colCells.push_back(currentCell);
-			}
-			else if(currentCell->column == smallCol){
-				colCells.push_back(currentCell);
-			}
-		}
+		smallRow = smallCell->row;		// get its row
+		smallCol = smallCell->column;	// get its column
+		smallDist = smallCell->dist;	// get the smallest distance
 	
+		rowCells = seqVec[smallRow];    // all distances related to the row index
+		colCells = seqVec[smallCol];    // all distances related to the column index
 		nRowCells = rowCells.size();
 		nColCells = colCells.size();
 	}
@@ -59,39 +76,153 @@ void Cluster::getRowColCells(){
 		errorOut(e, "Cluster", "getRowColCells");
 		exit(1);
 	}
+
 }
 
+// Remove the specified cell from the seqVec and from the sparse
+// matrix
+void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix)
+{
+	ull drow = cell->row;
+	ull dcol = cell->column;
+	if (((vrow >=0) && (drow != smallRow)) ||
+		((vcol >=0) && (dcol != smallCol))) {
+		ull dtemp = drow;
+		drow = dcol;
+		dcol = dtemp;
+	}
+
+	ull crow;
+	ull ccol;
+	int nCells;
+	if (vrow < 0) {
+		nCells = seqVec[drow].size();
+		for (vrow=0; vrow<nCells;vrow++) {
+			crow = seqVec[drow][vrow]->row;
+			ccol = seqVec[drow][vrow]->column;
+			if (((crow == drow) && (ccol == dcol)) ||
+				((ccol == drow) && (crow == dcol))) {
+				break;
+			}
+		}
+	}
+	seqVec[drow].erase(seqVec[drow].begin()+vrow);
+	if (vcol < 0) {
+		nCells = seqVec[dcol].size();
+		for (vcol=0; vcol<nCells;vcol++) {
+			crow = seqVec[dcol][vcol]->row;
+			ccol = seqVec[dcol][vcol]->column;
+			if (((crow == drow) && (ccol == dcol)) ||
+				((ccol == drow) && (crow == dcol))) {
+				break;
+			}
+		}
+	}
+	seqVec[dcol].erase(seqVec[dcol].begin()+vcol);
+	if (rmMatrix) {
+		dMatrix->rmCell(cell);
+	}
+}
+
+
 /***********************************************************************/
 
 void Cluster::clusterBins(){
 	try {
-	
+	//	cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
+
 		rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol));	
 		rabund->set(smallRow, 0);	
 		rabund->setLabel(toString(smallDist));
 
+	//	cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
 	}
 	catch(exception& e) {
 		errorOut(e, "Cluster", "clusterBins");
 		exit(1);
 	}
+
+
 }
 
 /***********************************************************************/
 
 void Cluster::clusterNames(){
 	try {
-	
+	//	cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
+
 		list->set(smallCol, list->get(smallRow)+','+list->get(smallCol));
 		list->set(smallRow, "");	
 		list->setLabel(toString(smallDist));
 	
+	//	cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
     }
 	catch(exception& e) {
 		errorOut(e, "Cluster", "clusterNames");
 		exit(1);
 	}
+
 }
 
+/***********************************************************************/
+//This function clusters based on the method of the derived class
+//At the moment only average and complete linkage are covered, because
+//single linkage uses a different approach.
+void Cluster::update(){
+	try {
+		getRowColCells();	
+	
+		vector<int> found(nColCells, 0);
+		int search;
+		bool changed;
+
+		// The vector has to be traversed in reverse order to preserve the index
+		// for faster removal in removeCell()
+		for (int i=nRowCells-1;i>=0;i--) {
+			if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) {
+				if (rowCells[i]->row == smallRow) {
+					search = rowCells[i]->column;
+				} else {
+					search = rowCells[i]->row;
+				}
+		
+				for (int j=0;j<nColCells;j++) {
+					if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) {
+						if (colCells[j]->row == search || colCells[j]->column == search) {
+							found[j] = 1;
+							changed = updateDistance(colCells[j], rowCells[i]);
+							// If the cell's distance changed and it had the same distance as 
+							// the smallest distance, invalidate the mins vector in SparseMatrix
+							if (changed) {
+								if (colCells[j]->vectorMap != NULL) {
+									*(colCells[j]->vectorMap) = NULL;
+									colCells[j]->vectorMap = NULL;
+								}
+							}
+							break;
+						}
+					}
+				}
+				removeCell(rowCells[i], i , -1);
+			}
+		}
+		clusterBins();
+		clusterNames();
+
+		// Special handling for singlelinkage case, not sure whether this
+		// could be avoided
+		for (int i=nColCells-1;i>=0;i--) {
+			if (found[i] == 0) {
+				removeCell(colCells[i], -1, i);
+			}
+		}
+	}
+	catch(exception& e) {
+		errorOut(e, "Cluster", "update");
+		exit(1);
+	}
+}
+
+
 /***********************************************************************/
 
diff --git a/cluster.hpp b/cluster.hpp
index de86f85..99d8b7d 100644
--- a/cluster.hpp
+++ b/cluster.hpp
@@ -7,18 +7,22 @@
 
 class RAbundVector;
 class ListVector;
-class SparseMatrix;
 
-typedef list<PCell>::iterator MatData;
+typedef vector<MatData> MatVec;
 
 class Cluster {
 	
 public:
 	Cluster(RAbundVector*, ListVector*, SparseMatrix*);
-	virtual void update() = 0;
-	
+    virtual void update();
+	virtual string getTag() = 0;
+
 protected:	
 	void getRowColCells();
+    void removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix=true);
+
+	virtual bool updateDistance(MatData& colCell, MatData& rowCell) = 0;
+
 	virtual void clusterBins();
 	virtual void clusterNames();
 	
@@ -29,8 +33,10 @@ protected:
 	int smallRow;
 	int smallCol;
 	float smallDist;
-	vector<MatData> rowCells;
-	vector<MatData> colCells;
+	
+	vector<MatVec> seqVec;		// contains vectors of cells related to a certain sequence
+	MatVec rowCells;
+	MatVec colCells;
 	ull nRowCells;
 	ull nColCells;
 };
@@ -40,7 +46,8 @@ protected:
 class CompleteLinkage : public Cluster {
 public:
 	CompleteLinkage(RAbundVector*, ListVector*, SparseMatrix*);
-	void update();
+	bool updateDistance(MatData& colCell, MatData& rowCell);
+	string getTag();
 	
 private:
 		
@@ -51,7 +58,9 @@ private:
 class SingleLinkage : public Cluster {
 public:
 	SingleLinkage(RAbundVector*, ListVector*, SparseMatrix*);
-	void update();
+    void update();
+	bool updateDistance(MatData& colCell, MatData& rowCell);
+	string getTag();
 	
 private:
 		
@@ -62,10 +71,16 @@ private:
 class AverageLinkage : public Cluster {
 public:
 	AverageLinkage(RAbundVector*, ListVector*, SparseMatrix*);
-	void update();
+	bool updateDistance(MatData& colCell, MatData& rowCell);
+	string getTag();
 	
 private:
-		
+	int saveRow;
+	int saveCol;
+	int rowBin;
+	int colBin;
+	int totalBin;
+
 };
 
 /***********************************************************************/
diff --git a/clustercommand.cpp b/clustercommand.cpp
index 593f262..d3b9bbf 100644
--- a/clustercommand.cpp
+++ b/clustercommand.cpp
@@ -21,7 +21,7 @@ ClusterCommand::ClusterCommand(string option){
 		
 		else {
 			//valid paramters for this command
-			string Array[] =  {"cutoff","precision","method"};
+			string Array[] =  {"cutoff","precision","method","showabund","timing"};
 			vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
 			
 			OptionParser parser(option);
@@ -31,33 +31,43 @@ ClusterCommand::ClusterCommand(string option){
 		
 			//check to make sure all parameters are valid for command
 			for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
-				if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+				if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
+					abort = true;
+				}
 			}
 			
 			//error checking to make sure they read a distance file
 			if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
-				mothurOut("Before you use the cluster command, you first need to read in a distance matrix."); mothurOutEndLine(); abort = true;
+				mothurOut("Before you use the cluster command, you first need to read in a distance matrix."); mothurOutEndLine();
+				abort = true;
 			} 
 		
 			//check for optional parameter and set defaults
 			// ...at some point should added some additional type checking...
 			//get user cutoff and precision or use defaults
 			string temp;
-			temp = validParameter.validFile(parameters, "precision", false);		if (temp == "not found") { temp = "100"; }
+			temp = validParameter.validFile(parameters, "precision", false);
+			if (temp == "not found") { temp = "100"; }
 			//saves precision legnth for formatting below
 			length = temp.length();
 			convert(temp, precision); 
 			
-			temp = validParameter.validFile(parameters, "cutoff", false);			if (temp == "not found") { temp = "10"; }
+			temp = validParameter.validFile(parameters, "cutoff", false);
+			if (temp == "not found") { temp = "10"; }
 			convert(temp, cutoff); 
 			cutoff += (5 / (precision * 10.0));
 			
-			method = validParameter.validFile(parameters, "method", false);			if (method == "not found") { method = "furthest"; }
-
+			method = validParameter.validFile(parameters, "method", false);
+			if (method == "not found") { method = "furthest"; }
 			
 			if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
 			else { mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
 
+			showabund = validParameter.validFile(parameters, "showabund", false);
+			if (showabund == "not found") { showabund = "T"; }
+
+			timing = validParameter.validFile(parameters, "timing", false);
+			if (timing == "not found") { timing = "F"; }
 			
 			if (abort == false) {
 			
@@ -70,25 +80,21 @@ ClusterCommand::ClusterCommand(string option){
 				}
 				
 				//create cluster
-				if(method == "furthest")	{	cluster = new CompleteLinkage(rabund, list, matrix);	tag = "fn";	}
-				else if(method == "nearest"){	cluster = new SingleLinkage(rabund, list, matrix);		tag = "nn";	}
-				else if(method == "average"){	cluster = new AverageLinkage(rabund, list, matrix);		tag = "an";	}
-				else						{	mothurOut("error - not recognized method"); mothurOutEndLine();	abort = true;	}	
-				
+				if (method == "furthest")	{	cluster = new CompleteLinkage(rabund, list, matrix); }
+				else if(method == "nearest"){	cluster = new SingleLinkage(rabund, list, matrix); }
+				else if(method == "average"){	cluster = new AverageLinkage(rabund, list, matrix);	}
+				tag = cluster->getTag();
+
 				fileroot = getRootName(globaldata->inputFileName);
 			
 				openOutputFile(fileroot+ tag + ".sabund",	sabundFile);
 				openOutputFile(fileroot+ tag + ".rabund",	rabundFile);
 				openOutputFile(fileroot+ tag + ".list",		listFile);
-				
-				
 			}
-
 		}
-		
 	}
 	catch(exception& e) {
-		errorOut(e, "ClusterCommand", "ClusterCommand");		
+		errorOut(e, "ClusterCommand", "ClusterCommand");
 		exit(1);
 	}
 }
@@ -97,11 +103,11 @@ ClusterCommand::ClusterCommand(string option){
 
 void ClusterCommand::help(){
 	try {
-		 mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
-		 mothurOut("The cluster command parameter options are method, cuttoff and precision. No parameters are required.\n");
-		 mothurOut("The cluster command should be in the following format: \n");
-		 mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
-		 mothurOut("The acceptable cluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");
+		mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
+		mothurOut("The cluster command parameter options are method, cuttoff, precision, showabund and timing. No parameters are required.\n");
+		mothurOut("The cluster command should be in the following format: \n");
+		mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
+		mothurOut("The acceptable cluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");	
 	}
 	catch(exception& e) {
 		errorOut(e, "ClusterCommand", "help");
@@ -125,16 +131,28 @@ int ClusterCommand::execute(){
 	
 		if (abort == true) {	return 0;	}
 		
+		time_t estart = time(NULL);
+		int ndist = matrix->getNNodes();
 		float previousDist = 0.00000;
 		float rndPreviousDist = 0.00000;
 		oldRAbund = *rabund;
 		oldList = *list;
+
+		print_start = true;
+		start = time(NULL);
+		loops = 0;
 		
-		float x;
-		x=0.1;
-		toString(x, 2);
-	
-		while(matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
+		while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
+			if (print_start && isTrue(timing)) {
+				mothurOut("Clustering (" + tag + ") dist " + toString(matrix->getSmallDist()) + "/" 
+					+ toString(roundDist(matrix->getSmallDist(), precision)) 
+					+ "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
+				cout.flush();
+				print_start = false;
+			}
+
+			loops++;
+
 			cluster->update();
 			float dist = matrix->getSmallDist();
 			float rndDist = roundDist(dist, precision);
@@ -151,6 +169,13 @@ int ClusterCommand::execute(){
 			oldRAbund = *rabund;
 			oldList = *list;
 		}
+
+		if (print_start && isTrue(timing)) {
+			mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist) 
+					 + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
+			cout.flush();
+	 		print_start = false;
+		}
 	
 		if(previousDist <= 0.0000){
 			printData("unique");
@@ -174,7 +199,9 @@ int ClusterCommand::execute(){
 		sabundFile.close();
 		rabundFile.close();
 		listFile.close();
-		
+		if (isTrue(timing)) {
+			mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster " + toString(ndist) + " distances"); mothurOutEndLine();
+		}
 		return 0;
 	}
 	catch(exception& e) {
@@ -187,8 +214,18 @@ int ClusterCommand::execute(){
 
 void ClusterCommand::printData(string label){
 	try {
+		if (isTrue(timing)) {
+			mothurOut("\tTime: " + toString(time(NULL) - start) + "\tsecs for " + toString(oldRAbund.getNumBins()) 
+		     + "\tclusters. Updates: " + toString(loops)); mothurOutEndLine();
+		}
+		print_start = true;
+		loops = 0;
+		start = time(NULL);
+
 		oldRAbund.setLabel(label);
-		oldRAbund.getSAbundVector().print(cout);
+		if (isTrue(showabund)) {
+			oldRAbund.getSAbundVector().print(cout);
+		}
 		oldRAbund.print(rabundFile);
 		oldRAbund.getSAbundVector().print(sabundFile);
 	
@@ -199,5 +236,7 @@ void ClusterCommand::printData(string label){
 		errorOut(e, "ClusterCommand", "printData");
 		exit(1);
 	}
+
+
 }
 //**********************************************************************************************************************
diff --git a/clustercommand.h b/clustercommand.h
index 44ce6f9..ad9fe85 100644
--- a/clustercommand.h
+++ b/clustercommand.h
@@ -26,8 +26,6 @@
 	The cluster() command outputs three files *.list, *.rabund, and *.sabund.   */
 
 
-class GlobalData;
-
 class ClusterCommand : public Command {
 	
 public:
@@ -49,8 +47,13 @@ private:
 
 	string method, fileroot, tag;
 	double cutoff;
+	string showabund, timing;
 	int precision, length;
 	ofstream sabundFile, rabundFile, listFile;
+
+	bool print_start;
+	time_t start;
+	unsigned long loops;
 	
 	void printData(string label);
 };
diff --git a/completelinkage.cpp b/completelinkage.cpp
index f69753b..09ba963 100644
--- a/completelinkage.cpp
+++ b/completelinkage.cpp
@@ -1,64 +1,32 @@
 
 #include "cluster.hpp"
-#include "mothur.h"
 
 /***********************************************************************/
 
 CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) :
-Cluster(rav, lv, dm)
+	Cluster(rav, lv, dm)
 {}
 
 /***********************************************************************/
-//This function clusters based on the furthest neighbor method.
-void CompleteLinkage::update(){
+//This function returns the tag of the method.
+string CompleteLinkage::getTag() {
+	return("fn");
+}
+
+
+/***********************************************************************/
+//This function updates the distance based on the furthest neighbor method.
+bool CompleteLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
 	try {
-		getRowColCells();	
-	
-		vector<int> found(nColCells, 0);
-	
-		for(int i=1;i<nRowCells;i++){
-		
-			int search;
-		
-			if(rowCells[i]->row == smallRow){
-				search = rowCells[i]->column;
-			}
-			else{
-				search = rowCells[i]->row;
-			}
-		
-			for(int j=1;j<nColCells;j++){
-			
-				if(colCells[j]->row == search || colCells[j]->column == search){
-				
-					if(colCells[j]->dist < rowCells[i]->dist){
-						colCells[j]->dist = rowCells[i]->dist;
-					
-						if(colCells[j]->vectorMap != NULL){
-							*(colCells[j]->vectorMap) = NULL;
-							colCells[j]->vectorMap = NULL;
-						}
-					
-					}
-				
-					found[j] = 1;
-					break;
-				}
-			}
-			dMatrix->rmCell(rowCells[i]);
-		
+		bool changed = false;
+		if (colCell->dist < rowCell->dist) {
+			colCell->dist = rowCell->dist;
+			changed = true;
 		}	
-		clusterBins();
-		clusterNames();
-	
-		for(int i=0;i<nColCells;i++){
-			if(found[i] == 0){
-				dMatrix->rmCell(colCells[i]);
-			}
-		}
+		return(changed);
 	}
 	catch(exception& e) {
-		errorOut(e, "CompleteLinkage", "update");
+		errorOut(e, "CompleteLinkage", "updateDistance");
 		exit(1);
 	}
 }
diff --git a/decalc.cpp b/decalc.cpp
index 1a73494..203482f 100644
--- a/decalc.cpp
+++ b/decalc.cpp
@@ -59,11 +59,13 @@ void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& t
 		
 		int front = 0;
 		for (int i = 0; i < q.length(); i++) {
+//cout << "query = " << q[i] << " subject = " << s[i] << endl;
 			if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
 		}
-		
+//cout << endl << endl;		
 		int back = 0;		
 		for (int i = q.length(); i >= 0; i--) {
+//cout << "query = " << q[i] << " subject = " << s[i] << endl;
 			if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
 		}
 		
@@ -97,10 +99,10 @@ vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int
 		//count bases
 		int numBases = 0;
 		for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
-			
+//cout << "num Bases = " << numBases << endl;			
 		//save start of seq
 		win.push_back(front);
-		
+//cout << front << '\t';		
 		//move ahead increment bases at a time until all bases are in a window
 		int countBases = 0;
 		int totalBases = 0;  //used to eliminate window of blanks at end of sequence
@@ -109,18 +111,27 @@ vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int
 		for (int m = front; m < (back - size) ; m++) {
 				
 			//count number of bases you see
-			if (isalpha(seq[m])) { countBases++; totalBases++;  }
+			if (isalpha(seq[m])) { countBases++;  }
 				
 			//if you have seen enough bases to make a new window
 			if (countBases >= increment) {
+				//total bases is the number of bases in a window already.
+				totalBases += countBases;
+//cout << "total bases = " << totalBases << endl;
 				win.push_back(m);  //save spot in alignment
+//cout << m << '\t';
 				countBases = 0;				//reset bases you've seen in this window
 			}
 				
 			//no need to continue if all your bases are in a window
 			if (totalBases == numBases) {   break;  }
 		}
-			
+	
+
+		//get last window if needed
+		if (totalBases < numBases) {   win.push_back(back-size);  cout << back-size << endl;}
+//cout << endl << endl;
+		
 		return win;
 	
 	}
@@ -239,6 +250,12 @@ vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
 		
 		openOutputFile(freqfile, outFreq);
 		
+		string length = toString(seqs.size());  //if there are 5000 seqs in the template then set precision to 3
+		int precision = length.length() - 1;
+		
+		//format output
+		outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
+		
 		//at each position in the sequence
 		for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
 			
@@ -261,11 +278,8 @@ vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
 			int highest = 0;
 			for (int m = 0; m < freq.size(); m++) {   if (freq[m] > highest) {  highest = freq[m];  }		}
 			
-			float highFreq;
-			//subtract gaps to "ignore them"
-			if ( (seqs.size() - gaps) == 0 ) {  highFreq = 1.0;  }			
-			else { highFreq = highest / (float) (seqs.size() - gaps);	 }
-						
+			float highFreq = highest / (float) (seqs.size());	
+			
 			float Pi;
 			Pi =  (highFreq - 0.25) / 0.75; 
 			
@@ -273,10 +287,9 @@ vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
 			if (Pi < 0) { Pi = 0.0; }
 			
 			//saves this for later
-			outFreq << i+1 << '\t' << highFreq << endl;
-			
+			outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
+	
 			if (h.count(i) > 0) {
-	cout << i+1 << '\t' << highFreq << endl;
 				prob.push_back(Pi); 
 			}
 		}
@@ -387,6 +400,52 @@ vector< vector<float> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vecto
 		exit(1);
 	}
 }
+//***************************************************************************************************************
+void DeCalculator::removeObviousOutliers(vector< vector<float> >& quantiles) {
+	try {
+		
+	
+		for (int i = 0; i < quantiles.size(); i++) {
+		
+			//find mean of this quantile score
+			float average = findAverage(quantiles[i]);
+			
+			vector<float> newQuanI;
+			//look at each value in quantiles to see if it is an outlier
+			for (int j = 0; j < quantiles[i].size(); j++) {
+				
+				float highCutoff, lowCutOff;
+				
+				
+			
+			}
+		
+		}
+
+	}
+	catch(exception& e) {
+		errorOut(e, "DeCalculator", "removeObviousOutliers");
+		exit(1);
+	}
+}
+
+//***************************************************************************************************************
+float DeCalculator::findAverage(vector<float> myVector) {
+	try{
+		
+		float total = 0.0;
+		for (int i = 0; i < myVector.size(); i++)  {  total += myVector[i];  }
+		
+		float average = total / (float) myVector.size();
+		
+		return average;
+		
+	}
+	catch(exception& e) {
+		errorOut(e, "DeCalculator", "findAverage");
+		exit(1);
+	}
+}
 
 //***************************************************************************************************************
 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
@@ -402,7 +461,8 @@ float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
 		for (int j = 0; j < obs.size(); j++) {   obsAverage += obs[j];	}
 		obsAverage = obsAverage / (float) obs.size();
 //cout << "sum ai / m = " << probAverage << endl;		
-//cout << "sum oi / m = " << obsAverage << endl;
+//cout << "sum oi / m = " << obsAverage << endl;	
+	
 		float coef = obsAverage / probAverage;
 						
 		return coef;
diff --git a/decalc.h b/decalc.h
index 9309c37..1579d86 100644
--- a/decalc.h
+++ b/decalc.h
@@ -31,6 +31,7 @@ class DeCalculator {
 		void setMask(string m); 
 		void runMask(Sequence*);
 		void trimSeqs(Sequence*, Sequence*, map<int, int>&);
+		void removeObviousOutliers(vector< vector<float> >&);
 		vector<float> calcFreq(vector<Sequence*>, string);
 		vector<int> findWindows(Sequence*, int, int, int&, int);
 		vector<float> calcObserved(Sequence*, Sequence*, vector<int>, int);
@@ -42,6 +43,7 @@ class DeCalculator {
 		vector< vector<float> > getQuantiles(vector<Sequence*>, vector<int>, int, vector<float>, int, int, int);
 		
 	private:
+		float findAverage(vector<float> myVector);
 		string seqMask;
 		set<int> h;
 
diff --git a/pintail.cpp b/pintail.cpp
index b7e5b5f..bca7df9 100644
--- a/pintail.cpp
+++ b/pintail.cpp
@@ -34,6 +34,11 @@ void Pintail::print(ostream& out) {
 		for (int i = 0; i < querySeqs.size(); i++) {
 			
 			int index = ceil(deviation[i]);
+float quan = 2.64 * log10(deviation[i]) + 1.46;
+cout << "dist = " << index << endl;
+cout << "de = "  << DE[i] << endl;
+cout << "mallard quantile = " << quan << endl;
+cout << "my quantile = " << quantiles[index][4] << endl;
 			
 			//is your DE value higher than the 95%
 			string chimera;
@@ -158,7 +163,7 @@ void Pintail::getChimeras() {
 		}else				{   probabilityProfile = readFreq();			  }
 
 		//make P into Q
-		for (int i = 0; i < probabilityProfile.size(); i++)  {	probabilityProfile[i] = 1 - probabilityProfile[i];	 cout << i << '\t' << probabilityProfile[i] << endl;  }
+		for (int i = 0; i < probabilityProfile.size(); i++)  {	probabilityProfile[i] = 1 - probabilityProfile[i];   }  //cout << i << '\t' << probabilityProfile[i] << endl;
 		mothurOut("Done."); mothurOutEndLine();
 		
 		//mask querys
@@ -270,6 +275,9 @@ cout << querySeqs[i]->getName() << '\t' << "dist value = " << dist << endl;
 				quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
 			}else {		createProcessesQuan();		}
 			
+			
+			decalc->removeObviousOutliers(quantiles);
+			
 			ofstream out4;
 			string o = getRootName(templateFile) + "quan";
 			
@@ -344,7 +352,7 @@ vector<float> Pintail::readFreq() {
 			
 			in >> pos >> num;
 			
-			if (h.count(pos-1) > 0) {
+			if (h.count(pos) > 0) {
 				float Pi;
 				Pi =  (num - 0.25) / 0.75; 
 			
diff --git a/readdistcommand.cpp b/readdistcommand.cpp
index 088f711..579f848 100644
--- a/readdistcommand.cpp
+++ b/readdistcommand.cpp
@@ -70,7 +70,7 @@ ReadDistCommand::ReadDistCommand(string option){
 			else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a read.dist command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
 		
 			if (columnfile != "") {
-				if (namefile == "") {  mothurOut("You need to provide a namefile if you are going to use the column format."); mothurOutEndLine(); abort = true; }
+				if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
 			}
 		
 			//check for optional parameter and set defaults
@@ -129,7 +129,7 @@ void ReadDistCommand::help(){
 		mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
 		mothurOut("The second way to use the read.dist command is to read a phylip or column and a group, so you can use the libshuff command.\n");
 		mothurOut("For this use the read.dist command should be in the following format: \n");
-		mothurOut("read.dist(phylip=yourPhylipfile, group=yourGroupFile). The cutoff and precision parameters are not valid with this use. \n");
+		mothurOut("read.dist(phylip=yourPhylipfile, group=yourGroupFile). The cutoff and precision parameters are not valid with this use.  \n");
 		mothurOut("Note: No spaces between parameter labels (i.e. phylip), '=' and parameters (i.e.yourPhylipfile).\n\n");
 	}
 	catch(exception& e) {
@@ -141,6 +141,9 @@ void ReadDistCommand::help(){
 //**********************************************************************************************************************
 
 ReadDistCommand::~ReadDistCommand(){
+	if (abort == false) {
+		if (format != "matrix") { delete read; delete nameMap; }
+	}
 }
 
 //**********************************************************************************************************************
@@ -148,6 +151,9 @@ int ReadDistCommand::execute(){
 	try {
 		
 		if (abort == true) {	return 0;	}
+
+		time_t start = time(NULL);
+		size_t numDists = 0;
 		
 		if (format == "matrix") {
 			ifstream in;
@@ -157,7 +163,8 @@ int ReadDistCommand::execute(){
 			//memory leak prevention
 			if (globaldata->gMatrix != NULL) { delete globaldata->gMatrix;  }
 			globaldata->gMatrix = matrix; //save matrix for coverage commands
-		}else {
+			numDists = matrix->getSizes()[1];
+		} else {
 			read->read(nameMap);
 			//to prevent memory leak
 
@@ -166,10 +173,36 @@ int ReadDistCommand::execute(){
 
 			if (globaldata->gSparseMatrix != NULL) { delete globaldata->gSparseMatrix;  }
 			globaldata->gSparseMatrix = read->getMatrix();
-			delete read; delete nameMap;
+			numDists = globaldata->gSparseMatrix->getNNodes();
+ 
+      int lines = cutoff / (1.0/precision);
+      vector<float> dist_cutoff(lines+1,0);
+			for (int i = 0; i <= lines;i++) {	
+      	dist_cutoff[i] = (i + 0.5) / precision; 
+      } 
+      vector<int> dist_count(lines+1,0);
+      list<PCell>::iterator currentCell;
+      SparseMatrix* smatrix = globaldata->gSparseMatrix;
+  		for (currentCell = smatrix->begin(); currentCell != smatrix->end(); currentCell++) {
+				for (int i = 0; i <= lines;i++) {	
+					if (currentCell->dist < dist_cutoff[i]) {
+						dist_count[i]++;
+            break;
+          }
+        }
+			}
 
+      string dist_string = "Dist:";
+      string count_string = "Count: ";
+			for (int i = 0; i <= lines;i++) {	
+      	dist_string = dist_string.append("\t").append(toString(dist_cutoff[i]));
+      	count_string = count_string.append("\t").append(toString(dist_count[i]));
+			}
+      mothurOut(dist_string); mothurOutEndLine(); mothurOut(count_string); mothurOutEndLine();
 		}
+		mothurOut("It took " + toString(time(NULL) - start) + " secs to read " + toString(numDists) + " distances (cutoff: " + toString(cutoff) + ")"); mothurOutEndLine();
 		return 0;
+		
 	}
 	catch(exception& e) {
 		errorOut(e, "ReadDistCommand", "execute");
diff --git a/singlelinkage.cpp b/singlelinkage.cpp
index 5332e90..c27b14e 100644
--- a/singlelinkage.cpp
+++ b/singlelinkage.cpp
@@ -9,54 +9,75 @@ SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm
 Cluster(rav, lv, dm)
 {}
 
+
 /***********************************************************************/
-//This function clusters based on nearest method.
-void SingleLinkage::update(){
+//This function returns the tag of the method.
+string SingleLinkage::getTag() {
+	return("nn");
+}
+
+/***********************************************************************/
+//This function clusters based on the single linkage method.
+void  SingleLinkage::update(){
 	try {
-		getRowColCells();		
+		getRowColCells();	
 	
-		for(int i=1;i<nRowCells;i++){
-		
-			int search;
-		
-			if(rowCells[i]->row == smallRow){
-				search = rowCells[i]->column;
-			}
-			else{
-				search = rowCells[i]->row;
-			}
+		vector<bool> deleted(nRowCells, false);
+		int rowInd;
+		int search;
+		bool changed;
+
+		// The vector has to be traversed in reverse order to preserve the index
+		// for faster removal in removeCell()
+		for (int i=nRowCells-1;i>=0;i--) {
+			if ((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol)) {
+				rowInd = i;   // The index of the smallest distance cell in rowCells
+			} else {
+				if (rowCells[i]->row == smallRow) {
+					search = rowCells[i]->column;
+				} else {
+					search = rowCells[i]->row;
+				}
 		
-			for(int j=1;j<nColCells;j++){
-			
-				if(colCells[j]->row == search || colCells[j]->column == search){
-				
-					if(colCells[j]->dist > rowCells[i]->dist){
-						colCells[j]->dist = rowCells[i]->dist;
-					
-						if(colCells[j]->vectorMap != NULL){
-							*(colCells[j]->vectorMap) = NULL;
-							colCells[j]->vectorMap = NULL;
+				for (int j=0;j<nColCells;j++) {
+					if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) {
+						if (colCells[j]->row == search || colCells[j]->column == search) {
+							changed = updateDistance(colCells[j], rowCells[i]);
+							// If the cell's distance changed and it had the same distance as 
+							// the smallest distance, invalidate the mins vector in SparseMatrix
+							if (changed) {
+								if (colCells[j]->vectorMap != NULL) {
+									*(colCells[j]->vectorMap) = NULL;
+									colCells[j]->vectorMap = NULL;
+								}
+							}
+							removeCell(rowCells[i], i , -1);
+							deleted[i] = true;
+							break;
 						}
-					
 					}
-					dMatrix->rmCell(rowCells[i]);
-					break;
 				}
-			}			
-		
-			if(search < smallCol){
-				rowCells[i]->row = smallCol;
-				rowCells[i]->column = search;
-			}
-			else{
-				rowCells[i]->row = search;
-				rowCells[i]->column = smallCol;
-			}
-		
-		}	
+				if (!deleted[i]) {
+					// Assign the cell to the new cluster 
+					// remove the old cell from seqVec and add the cell
+					// with the new row and column assignment again
+					removeCell(rowCells[i], i , -1, false);
+					if (search < smallCol){
+						rowCells[i]->row = smallCol;
+						rowCells[i]->column = search;
+					} else {
+						rowCells[i]->row = search;
+						rowCells[i]->column = smallCol;
+					}
+					seqVec[rowCells[i]->row].push_back(rowCells[i]);
+					seqVec[rowCells[i]->column].push_back(rowCells[i]);
+				}
+			}	
+		}
 		clusterBins();
 		clusterNames();
-		dMatrix->rmCell(rowCells[0]);
+		// remove also the cell with the smallest distance
+		removeCell(rowCells[rowInd], -1 , -1);
 	}
 	catch(exception& e) {
 		errorOut(e, "SingleLinkage", "update");
@@ -64,4 +85,20 @@ void SingleLinkage::update(){
 	}
 }
 
+
+/***********************************************************************/
+//This function updates the distance based on the nearest neighbor method.
+bool SingleLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
+	try {
+		bool changed = false;
+		if (colCell->dist > rowCell->dist) {
+			colCell->dist = rowCell->dist;
+		}
+		return(changed);
+	}
+	catch(exception& e) {
+		errorOut(e, "SingleLinkage", "updateDistance");
+		exit(1);
+	}
+}
 /***********************************************************************/
diff --git a/sparsematrix.cpp b/sparsematrix.cpp
index 7f1c3c8..8a873f7 100644
--- a/sparsematrix.cpp
+++ b/sparsematrix.cpp
@@ -2,7 +2,6 @@
 #include "sparsematrix.hpp"
 #include "listvector.hpp"
 
-typedef list<PCell>::iterator MatData;
 
 /***********************************************************************/
 
@@ -22,15 +21,15 @@ float SparseMatrix::getSmallDist(){
 
 /***********************************************************************/
 
-void SparseMatrix::rmCell(MatData data){
+MatData SparseMatrix::rmCell(MatData data){
 	try {
 		if(data->vectorMap != NULL ){
 			*(data->vectorMap) = NULL;
 			data->vectorMap = NULL;
 		}
-		matrix.erase(data);		
+		data = matrix.erase(data);		
 		numNodes--;
-	
+		return(data);
 	//  seems like i should be updating smallDist here, but the only time we remove cells is when
 	//  clustering and the clustering algorithm updates smallDist
 	}
@@ -89,13 +88,11 @@ MatData SparseMatrix::end(){
 void SparseMatrix::print(){
 	try {
 		int index = 0;
-		
-		mothurOutEndLine();
-		mothurOut("Index\tRow\tColumn\tDistance");
-		mothurOutEndLine();
+	
+		cout << endl << "Index\tRow\tColumn\tDistance" << endl;
 	
 		for(MatData currentCell=matrix.begin();currentCell!=matrix.end();currentCell++){
-			mothurOut(toString(index) + "\t" + toString(currentCell->row)  + "\t" + toString(currentCell->column) + "\t" + toString(currentCell->dist)); mothurOutEndLine();
+			cout << index << '\t' << currentCell->row  << '\t' << currentCell->column << '\t' << currentCell->dist << endl;
 			index++;
 		}
 	}
@@ -111,10 +108,7 @@ void SparseMatrix::print(ListVector* list){
 	try {
 		int index = 0;
 	
-		mothurOutEndLine();
-		mothurOut("Index\tRow\tColumn\tDistance");
-		mothurOutEndLine();
-
+		mothurOutEndLine(); mothurOut("Index\tRow\tColumn\tDistance"); mothurOutEndLine();
 	
 		for(MatData currentCell=matrix.begin();currentCell!=matrix.end();currentCell++){
 			mothurOut(toString(index) + "\t" + toString(list->get(currentCell->row))  + "\t" + toString(list->get(currentCell->column)) + "\t" + toString(currentCell->dist)); mothurOutEndLine();
@@ -159,7 +153,7 @@ PCell* SparseMatrix::getSmallestCell(){
 				}
 
 			}
-			random_shuffle(mins.begin(), mins.end());  //randomize the order of the iterators in the mins vector
+//			random_shuffle(mins.begin(), mins.end());  //randomize the order of the iterators in the mins vector
 
 			for(int i=0;i<mins.size();i++){
 				mins[i]->vectorMap = &mins[i];  //assign vectorMap to the address for the container
diff --git a/sparsematrix.hpp b/sparsematrix.hpp
index d65771e..8eeda7f 100644
--- a/sparsematrix.hpp
+++ b/sparsematrix.hpp
@@ -20,21 +20,23 @@ class ListVector;
 
 /***********************************************************************/
 
+typedef list<PCell>::iterator MatData;
+
 class SparseMatrix {
 	
 public:
 	SparseMatrix();
 	int getNNodes();
 	void print();					//Print the contents of the matrix
-	void print(ListVector*);					//Print the contents of the matrix
+	void print(ListVector*);		//Print the contents of the matrix
 	PCell* getSmallestCell();		//Return the cell with the smallest distance
 	float getSmallDist();
 	
-	void rmCell(list<PCell>::iterator);
+	MatData rmCell(MatData);
 	void addCell(PCell);
 	void clear();
-	list<PCell>::iterator begin();
-	list<PCell>::iterator end();
+	MatData begin();
+	MatData end();
 
 private:
 	PCell* smallCell;				//The cell with the smallest distance