X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=hcluster.cpp;h=f8f48095b2ec55a8dce91d55881271f2f1837218;hp=259036f0f02a42beb95926f9ce59a4140164840e;hb=d1c97b8c04bb75faca1e76ffad60b37a4d789d3d;hpb=832d53a9dfac6b1795735eec643d8cf627b0d8e3

diff --git a/hcluster.cpp b/hcluster.cpp
index 259036f..f8f4809 100644
--- a/hcluster.cpp
+++ b/hcluster.cpp
@@ -10,11 +10,11 @@
 #include "hcluster.h"
 #include "rabundvector.hpp"
 #include "listvector.hpp"
-#include "sparsematrix.hpp"
 
 /***********************************************************************/
-HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(m), distfile(d), nameMap(n), cutoff(c) {
+HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) {
 	try {
+		m = MothurOut::getInstance();
 		mapWanted = false;
 		exitedBreak = false; 
 		numSeqs = list->getNumSeqs();
@@ -25,14 +25,14 @@ HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAs
 			clusterArray.push_back(temp);
 		}
 		
-		if (method != "average") {
-			openInputFile(distfile, filehandle);
+		if ((method == "furthest") || (method == "nearest")) {
+			m->openInputFile(distfile, filehandle);
 		}else{  
 			processFile();  
 		}
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "HCluster");
+		m->errorOut(e, "HCluster", "HCluster");
 		exit(1);
 	}
 }
@@ -49,7 +49,7 @@ void HCluster::clusterBins(){
 		//cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "clusterBins");
+		m->errorOut(e, "HCluster", "clusterBins");
 		exit(1);
 	}
 
@@ -71,7 +71,7 @@ void HCluster::clusterNames(){
 
     }
 	catch(exception& e) {
-		errorOut(e, "HCluster", "clusterNames");
+		m->errorOut(e, "HCluster", "clusterNames");
 		exit(1);
 	}
 
@@ -86,7 +86,7 @@ int HCluster::getUpmostParent(int node){
 		return node;
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "getUpmostParent");
+		m->errorOut(e, "HCluster", "getUpmostParent");
 		exit(1);
 	}
 }
@@ -116,7 +116,7 @@ void HCluster::printInfo(){
 		
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "getUpmostParent");
+		m->errorOut(e, "HCluster", "getUpmostParent");
 		exit(1);
 	}
 }
@@ -184,7 +184,7 @@ int HCluster::makeActive() {
 		return linkValue;
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "makeActive");
+		m->errorOut(e, "HCluster", "makeActive");
 		exit(1);
 	}
 }
@@ -253,12 +253,12 @@ void HCluster::updateArrayandLinkTable() {
 		}
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "updateArrayandLinkTable");
+		m->errorOut(e, "HCluster", "updateArrayandLinkTable");
 		exit(1);
 	}
 }
 /***********************************************************************/
-bool HCluster::update(int row, int col, float distance){
+double HCluster::update(int row, int col, float distance){
 	try {
 		bool cluster = false;
 		smallRow = row;
@@ -272,7 +272,7 @@ bool HCluster::update(int row, int col, float distance){
 		//you don't want to cluster with yourself
 		if (smallRow != smallCol) {
 			
-			if (method != "average") {
+			if ((method == "furthest") || (method == "nearest")) {
 				//can we cluster???
 				if (method == "nearest") { cluster = true;  }
 				else{ //assume furthest
@@ -295,18 +295,18 @@ bool HCluster::update(int row, int col, float distance){
 			}
 		}
 		
-		return cluster;
+		return cutoff;
 		//printInfo();
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "update");
+		m->errorOut(e, "HCluster", "update");
 		exit(1);
 	}
 }
 /***********************************************************************/
-void HCluster::setMapWanted(bool m)  {  
+void HCluster::setMapWanted(bool ms)  {  
 	try {
-		mapWanted = m;
+		mapWanted = ms;
 		
 		//initialize map
 		for (int i = 0; i < list->getNumBins(); i++) {
@@ -327,7 +327,7 @@ void HCluster::setMapWanted(bool m)  {
 		
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "setMapWanted");
+		m->errorOut(e, "HCluster", "setMapWanted");
 		exit(1);
 	}
 }
@@ -348,7 +348,7 @@ try {
 		seq2Bin[names] = clusterArray[smallCol].smallChild;
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "updateMap");
+		m->errorOut(e, "HCluster", "updateMap");
 		exit(1);
 	}
 }
@@ -357,7 +357,7 @@ vector<seqDist> HCluster::getSeqs(){
 	try {
 		vector<seqDist> sameSeqs;
 		
-		if(method != "average") {
+		if ((method == "furthest") || (method == "nearest")) {
 			sameSeqs = getSeqsFNNN();
 		}else{
 			sameSeqs = getSeqsAN();	
@@ -366,7 +366,7 @@ vector<seqDist> HCluster::getSeqs(){
 		return sameSeqs;
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "getSeqs");
+		m->errorOut(e, "HCluster", "getSeqs");
 		exit(1);
 	}
 }
@@ -388,15 +388,15 @@ vector<seqDist> HCluster::getSeqsFNNN(){
 		//get entry
 		while (!filehandle.eof()) {
 			
-			filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
+			filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle); 
 	
 			//save first one
 			if (prevDistance == -1) { prevDistance = distance; }
 			
 			map<string,int>::iterator itA = nameMap->find(firstName);
 			map<string,int>::iterator itB = nameMap->find(secondName);
-			if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
-			if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+			if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+			if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
 		
 			//using cutoff
 			if (distance > cutoff) { break; }
@@ -424,12 +424,11 @@ vector<seqDist> HCluster::getSeqsFNNN(){
 		return sameSeqs;
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "getSeqsFNNN");
+		m->errorOut(e, "HCluster", "getSeqsFNNN");
 		exit(1);
 	}
 }
 //**********************************************************************************************************************
-//don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
 vector<seqDist> HCluster::getSeqsAN(){
 	try {
 		int firstName, secondName;
@@ -437,7 +436,7 @@ vector<seqDist> HCluster::getSeqsAN(){
 		vector<seqDist> sameSeqs;
 		prevDistance = -1;
 		
-		openInputFile(distfile, filehandle, "no error"); 
+		m->openInputFile(distfile, filehandle, "no error"); 
 		
 		//is the smallest value in mergedMin or the distfile?
 		float mergedMinDist = 10000;
@@ -445,13 +444,13 @@ vector<seqDist> HCluster::getSeqsAN(){
 		if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist;  }
 			
 		if (!filehandle.eof()) {  
-			filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
+			filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle);
 			//save first one
 			if (prevDistance == -1) { prevDistance = distance; } 
 			if (distance != -1) { //-1 means skip me
 				seqDist temp(firstName, secondName, distance);
 				sameSeqs.push_back(temp);
-			}
+			}else{ distance = 10000; }
 		}
 		
 		if (mergedMinDist < distance) { //get minimum distance from mergedMin
@@ -468,7 +467,7 @@ vector<seqDist> HCluster::getSeqsAN(){
 			//get entry
 			while (!filehandle.eof()) {
 				
-				filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
+				filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle); 
 				
 				if (prevDistance == -1) { prevDistance = distance; }
 				
@@ -495,26 +494,29 @@ vector<seqDist> HCluster::getSeqsAN(){
 		return temp;
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "getSeqsAN");
+		m->errorOut(e, "HCluster", "getSeqsAN");
 		exit(1);
 	}
 }
 
 /***********************************************************************/
-void HCluster::combineFile() {
+int HCluster::combineFile() {
 	try {
-		int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
-		char* inputBuffer;
-		inputBuffer = new char[bufferSize];
-		size_t numRead;
+		//int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
+		//char* inputBuffer;
+		//inputBuffer = new char[bufferSize];
+		//size_t numRead;
 		
 		string tempDistFile = distfile + ".temp";
 		ofstream out;
-		openOutputFile(tempDistFile, out);
+		m->openOutputFile(tempDistFile, out);
 		
-		FILE* in;
-		in = fopen(distfile.c_str(), "rb");
+		//FILE* in;
+		//in = fopen(distfile.c_str(), "rb");
 	
+		ifstream in;
+		m->openInputFile(distfile, in, "no error");
+		
 		int first, second;
 		float dist;
 		
@@ -524,28 +526,32 @@ void HCluster::combineFile() {
 				
 		//go through file pulling out distances related to rows merging
 		//if mergedMin contains distances add those back into file
-		bool done = false;
-		partialDist = "";
-		while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
+		//bool done = false;
+		//partialDist = "";
+		//while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
 //cout << "number of char read = " << numRead << endl;
 //cout << inputBuffer << endl;
-			if (numRead < bufferSize) { done = true; }
+			//if (numRead < bufferSize) { done = true; }
 			
 			//parse input into individual distances
-			int spot = 0;
-			string outputString = "";
-			while(spot < numRead) {
+			//int spot = 0;
+			//string outputString = "";
+			//while(spot < numRead) {
 	//cout << "spot = " << spot << endl;
-			   seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
+			 //  seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
 			   
 			   //you read a partial distance
-			   if (nextDist.seq1 == -1) { break;  }
-			   
-			   first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
+			  // if (nextDist.seq1 == -1) { break;  }
+			while (!in.eof()) {
+				//first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
 	//cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;		   
 			   //since file is sorted and mergedMin is sorted 
 			   //you can put the smallest distance from each through the code below and keep the file sorted
 			   
+			   in >> first >> second >> dist; m->gobble(in);
+			   
+			   if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempDistFile); return 0; }
+			   
 			   //while there are still values in mergedMin that are smaller than the distance read from file
 			   while (count < mergedMin.size())  {
 			   
@@ -563,7 +569,10 @@ void HCluster::combineFile() {
 						}else if (mergedMin[count].seq2 == smallRow) {
 							smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
 						}else { //if no, write to temp file
-							outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
+							//outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
+							//if (mergedMin[count].dist < cutoff) { 
+								out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+							//}
 						}
 						count++;
 					}else{   break;	}
@@ -582,14 +591,18 @@ void HCluster::combineFile() {
 					smallRowColValues[0][first] = dist;
 			   
 			   }else { //if no, write to temp file
-					outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
+					//outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
+				   //if (dist < cutoff) {
+					   out << first << '\t' << second << '\t' << dist << endl;
+				   //}
 			   }
 			}
 			
-			out << outputString;
-			if(done) { break; }
-		}
-		fclose(in);
+			//out << outputString;
+			//if(done) { break; }
+		//}
+		//fclose(in);
+		in.close();
 		
 		//if values in mergedMin are larger than the the largest in file then
 		while (count < mergedMin.size())  {  
@@ -606,7 +619,9 @@ void HCluster::combineFile() {
 				smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
 				
 			}else { //if no, write to temp file
-				out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+				//if (mergedMin[count].dist < cutoff) {
+					out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+				//}
 			}
 			count++;
 		}
@@ -614,9 +629,10 @@ void HCluster::combineFile() {
 		mergedMin.clear();
 			
 		//rename tempfile to distfile
-		remove(distfile.c_str());
+		m->mothurRemove(distfile);
 		rename(tempDistFile.c_str(), distfile.c_str());
-		
+//cout << "remove = "<< renameOK << " rename = " << ok << endl;	
+
 		//merge clustered rows averaging the distances
 		map<int, float>::iterator itMerge;
 		map<int, float>::iterator it2Merge;
@@ -626,25 +642,40 @@ void HCluster::combineFile() {
 			
 			float average;
 			if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
-				//weighted average
-				int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
-				average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
+				//average
+				if (method == "average") {
+					int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
+					average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
+				}else { //weighted
+					average = ((itMerge->second * 1.0) + (it2Merge->second * 1.0)) / (float) 2.0;				
+				}
+				
 				smallRowColValues[1].erase(it2Merge);
 				
 				seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
 				mergedMin.push_back(temp);
+			}else {  
+				//can't find value so update cutoff
+				if (cutoff > itMerge->second) { cutoff = itMerge->second; }
 			}
 		}
-
+		
+		//update cutoff
+		for(itMerge = smallRowColValues[1].begin(); itMerge != smallRowColValues[1].end(); itMerge++) {	
+			if (cutoff > itMerge->second) { cutoff = itMerge->second; }
+		}
+		
 		//sort merged values
 		sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);	
+		
+		return 0;
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "combineFile");
+		m->errorOut(e, "HCluster", "combineFile");
 		exit(1);
 	}
 }
-/***********************************************************************/
+/***********************************************************************
 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
 	try {
 		seqDist next;
@@ -674,7 +705,7 @@ seqDist HCluster::getNextDist(char* buffer, int& index, int size){
 			if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
 				gotDist = true;
 				
-				//gobble space
+				//m->gobble space
 				while (index < size) {		
 					if (isspace(buffer[index])) { index++; }
 					else { break; }		
@@ -716,32 +747,33 @@ seqDist HCluster::getNextDist(char* buffer, int& index, int size){
 		return next;
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "getNextDist");
+		m->errorOut(e, "HCluster", "getNextDist");
 		exit(1);
 	}
 }
-/***********************************************************************/
-void HCluster::processFile() {
+***********************************************************************/
+int HCluster::processFile() {
 	try {
 		string firstName, secondName;
 		float distance;
 		
 		ifstream in;
-		openInputFile(distfile, in);
+		m->openInputFile(distfile, in, "no error");
 		
 		ofstream out;
 		string outTemp = distfile + ".temp";
-		openOutputFile(outTemp, out);
+		m->openOutputFile(outTemp, out);
 	
 		//get entry
 		while (!in.eof()) {
+			if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outTemp); return 0; }
 			
-			in >> firstName >> secondName >> distance;    gobble(in);		
+			in >> firstName >> secondName >> distance;    m->gobble(in);		
 			
 			map<string,int>::iterator itA = nameMap->find(firstName);
 			map<string,int>::iterator itB = nameMap->find(secondName);
-			if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
-			if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+			if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+			if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
 		
 			//using cutoff
 			if (distance > cutoff) { break; }
@@ -754,11 +786,13 @@ void HCluster::processFile() {
 		in.close();
 		out.close();
 		
-		remove(distfile.c_str());
+		m->mothurRemove(distfile);
 		rename(outTemp.c_str(), distfile.c_str());
+		
+		return 0;
 	}
 	catch(exception& e) {
-		errorOut(e, "HCluster", "processFile");
+		m->errorOut(e, "HCluster", "processFile");
 		exit(1);
 	}
 }