From: westcott <westcott>
Date: Wed, 23 Jun 2010 17:42:48 +0000 (+0000)
Subject: fixed cluster.split with average method
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=3102812d94898439646131cecdb64fc542913c87;p=mothur.git

fixed cluster.split with average method
---

diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp
index 38e8394..deea384 100644
--- a/clustersplitcommand.cpp
+++ b/clustersplitcommand.cpp
@@ -237,6 +237,7 @@ int ClusterSplitCommand::execute(){
 		vector<string> listFileNames;
 		set<string> labels;
 		string singletonName = "";
+		double saveCutoff = cutoff;
 
 		//****************** file prep work ******************************//
 		#ifdef USE_MPI
@@ -361,6 +362,10 @@ int ClusterSplitCommand::execute(){
 			for(int i = 1; i < processors; i++) { 
 				int num = dividedNames[i].size();
 				
+				double tempCutoff;
+				MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
+				if (tempCutoff < cutoff) { cutoff = tempCutoff; }
+				
 				//send list filenames to root process
 				for (int j = 0; j < num; j++) {  
 					int lengthList = 0;
@@ -429,6 +434,9 @@ int ClusterSplitCommand::execute(){
 			//process them
 			listFileNames = cluster(myNames, labels);
 			
+			//send cutoff
+			MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
+			
 			//send list filenames to root process
 			for (int j = 0; j < num; j++) {  
 				char tempListFileName[1024];
@@ -506,6 +514,10 @@ int ClusterSplitCommand::execute(){
 						ifstream in2;
 						openInputFile(filename, in2);
 						
+						float tempCutoff;
+						in2 >> tempCutoff; gobble(in2);
+						if (tempCutoff < cutoff) { cutoff = tempCutoff; }
+						
 						while(!in2.eof()) {
 							string tempName;
 							in2 >> tempName; gobble(in2);
@@ -521,6 +533,8 @@ int ClusterSplitCommand::execute(){
 	#endif	
 		if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
 		
+		if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
+		
 		m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 		
 		//****************** merge list file and create rabund and sabund files ******************************//
@@ -592,9 +606,11 @@ map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames,
 
 			if ((*it != "unique") && (convertTestFloat(*it, temp) == true))	{	convert(*it, temp);	}
 			else if (*it == "unique")										{	temp = -1.0;		}
-						
-			orderFloat.push_back(temp);
-			labelBin[temp] = numSingleBins; //initialize numbins 
+			
+			if (temp <= cutoff) {
+				orderFloat.push_back(temp);
+				labelBin[temp] = numSingleBins; //initialize numbins 
+			}
 		}
 	
 		//sort order
@@ -804,7 +820,8 @@ int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> >
 				ofstream outLabels;
 				filename = toString(getpid()) + ".temp.labels";
 				openOutputFile(filename, outLabels);
-		
+				
+				outLabels << cutoff << endl;
 				for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 					outLabels << (*it) << endl;
 				}
@@ -841,8 +858,11 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
 		
 		vector<string> listFileNames;
 		
+		double smallestCutoff = cutoff;
+		
 		//cluster each distance file
 		for (int i = 0; i < distNames.size(); i++) {
+			if (m->control_pressed) { return listFileNames; }
 			
 			string thisNamefile = distNames[i].begin()->second;
 			string thisDistFile = distNames[i].begin()->first;
@@ -925,7 +945,7 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
 					listFileNames.clear(); return listFileNames;
 				}
 		
-				cluster->update(cutoff);
+				cluster->update(saveCutoff);
 	
 				float dist = matrix->getSmallDist();
 				float rndDist;
@@ -973,8 +993,13 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
 			
 			remove(thisDistFile.c_str());
 			remove(thisNamefile.c_str());
+			
+			if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  }
+			
+			if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
 		}
 		
+		cutoff = smallestCutoff;
 					
 		return listFileNames;
 	
diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp
index 023f214..f8f3c78 100644
--- a/mgclustercommand.cpp
+++ b/mgclustercommand.cpp
@@ -171,6 +171,8 @@ int MGClusterCommand::execute(){
 		
 		start = time(NULL);
 		oldList = *list;
+		map<string, int> Seq2Bin;
+		map<string, int> oldSeq2Bin;
 		
 		if (method == "furthest")		{ tag = "fn";  }
 		else if (method == "nearest")	{ tag = "nn";  }
@@ -198,6 +200,8 @@ int MGClusterCommand::execute(){
 			else if(method == "nearest"){	cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
 			else if(method == "average"){	cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method);	}
 			cluster->setMapWanted(true);
+			Seq2Bin = cluster->getSeqtoBin();
+			oldSeq2Bin = Seq2Bin;
 			
 			if (m->control_pressed) { 
 				delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
@@ -231,8 +235,8 @@ int MGClusterCommand::execute(){
 				}
 				else if(rndDist != rndPreviousDist){
 					if (merge) {
-						map<string, int> seq2Bin = cluster->getSeqtoBin();
-						ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+						Seq2Bin = cluster->getSeqtoBin();
+						ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
 						
 						if (m->control_pressed) { 
 							delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
@@ -250,8 +254,10 @@ int MGClusterCommand::execute(){
 				}
 				
 				previousDist = dist;
+	cout << "prev distance = " << previousDist << " dist = " << dist << endl;
 				rndPreviousDist = rndDist;
 				oldList = *list;
+				oldSeq2Bin = Seq2Bin;
 			}
 			
 			if(previousDist <= 0.0000){
@@ -260,8 +266,8 @@ int MGClusterCommand::execute(){
 			}
 			else if(rndPreviousDist<cutoff){
 				if (merge) {
-					map<string, int> seq2Bin = cluster->getSeqtoBin();
-					ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+					Seq2Bin = cluster->getSeqtoBin();
+					ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
 					
 					if (m->control_pressed) { 
 							delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
@@ -301,6 +307,8 @@ int MGClusterCommand::execute(){
 			//create cluster
 			hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
 			hcluster->setMapWanted(true);
+			Seq2Bin = cluster->getSeqtoBin();
+			oldSeq2Bin = Seq2Bin;
 			
 			vector<seqDist> seqs; seqs.resize(1); // to start loop
 			//ifstream inHcluster;
@@ -351,8 +359,8 @@ int MGClusterCommand::execute(){
 						}
 						else if((rndDist != rndPreviousDist)){
 							if (merge) {
-								map<string, int> seq2Bin = hcluster->getSeqtoBin();
-								ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+								Seq2Bin = hcluster->getSeqtoBin();
+								ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
 								
 								if (m->control_pressed) { 
 									delete nameMap;  delete list; delete rabund; delete hcluster; delete temp;
@@ -374,6 +382,7 @@ int MGClusterCommand::execute(){
 						previousDist = seqs[i].dist;
 						rndPreviousDist = rndDist;
 						oldList = *list;
+						oldSeq2Bin = Seq2Bin;
 					}
 				}
 			}
@@ -385,8 +394,8 @@ int MGClusterCommand::execute(){
 			}
 			else if(rndPreviousDist<cutoff){
 				if (merge) {
-					map<string, int> seq2Bin = hcluster->getSeqtoBin();
-					ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+					Seq2Bin = hcluster->getSeqtoBin();
+					ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
 					
 					if (m->control_pressed) { 
 							delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
@@ -406,8 +415,8 @@ int MGClusterCommand::execute(){
 			}
 			
 			delete hcluster;
-			remove(distFile.c_str());
-			remove(overlapFile.c_str());
+			//remove(distFile.c_str());
+			//remove(overlapFile.c_str());
 		}
 		
 		delete list; 
@@ -509,15 +518,22 @@ ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
 				//get names of seqs that overlap
 				string name1 = nameMap->get(overlapNode.seq1);
 				string name2 = nameMap->get(overlapNode.seq2);
-				
+			
 				//use binInfo to find out if they are already in the same bin
-				int binKeep = binInfo[name1];
-				int binRemove = binInfo[name2];
+				map<string, int>::iterator itBin1 = binInfo.find(name1);
+				map<string, int>::iterator itBin2 = binInfo.find(name2);
+				
+				if(itBin1 == binInfo.end()){  cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1);  }
+				if(itBin2 == binInfo.end()){  cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1);  }
+
+				int binKeep = itBin1->second;
+				int binRemove = itBin2->second;
 				
 				//if not merge bins and update binInfo
 				if(binKeep != binRemove) {
+		
 					//save names in old bin
-					string names = list->get(binRemove);
+					string names = newList->get(binRemove);
 					
 					//merge bins into name1s bin
 					newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
diff --git a/nameassignment.cpp b/nameassignment.cpp
index 0c30898..641be4e 100644
--- a/nameassignment.cpp
+++ b/nameassignment.cpp
@@ -87,16 +87,35 @@ void NameAssignment::print(ostream& out){
 //**********************************************************************************************************************
 
 int NameAssignment::get(string key){
-	
-	return	(*this)[key];	
-
+	try {
+		map<string, int>::iterator itGet = (*this).find(key);
+		
+		//if you can't find it
+		if (itGet == (*this).end()) { return -1; }
+		
+		return	(*this)[key];	
+	}
+	catch(exception& e) {
+		m->errorOut(e, "NameAssignment", "get");
+		exit(1);
+	}
 }
 //**********************************************************************************************************************
 
 string NameAssignment::get(int key){
+	try {
 	
-	return	reverse[key];	
-
+		map<int, string>::iterator itGet = reverse.find(key);
+	
+		if (itGet == reverse.end()) { return "not found"; }
+	
+		return	reverse[key];	
+	
+	}
+	catch(exception& e) {
+		m->errorOut(e, "NameAssignment", "get");
+		exit(1);
+	}
 }
 //**********************************************************************************************************************
 
diff --git a/readblast.cpp b/readblast.cpp
index 9979a75..4833d01 100644
--- a/readblast.cpp
+++ b/readblast.cpp
@@ -169,6 +169,7 @@ int ReadBlast::read(NameAssignment* nameMap) {
 					map<int, float>::iterator itDist;
 					for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
 						distance = 1.0 - (it->second / refScore);
+		
 						
 						//do we already have the distance calculated for b->a
 						map<string,int>::iterator itA = nameMap->find(currentRow);
@@ -176,10 +177,12 @@ int ReadBlast::read(NameAssignment* nameMap) {
 						
 						//if we have it then compare
 						if (itDist != dists[it->first].end()) {
+		if (distance < 0.0) { cout << currentRow << '\t' << nameMap->get(it->first) << '\t' << "score = " << it->second	<< " refscore = " << refScore << " distance = " << distance << " distance = " << itDist->second << endl;	}
+
 							//if you want the minimum blast score ratio, then pick max distance
 							if(minWanted) {	 distance = max(itDist->second, distance);  }
 							else{	distance = min(itDist->second, distance);  }
-							
+
 							//is this distance below cutoff
 							if (distance < cutoff) {
 								if (!hclusterWanted) {