From 4d7ffb38e091e9e7425d94e112551a3f90cacb0f Mon Sep 17 00:00:00 2001
From: westcott <westcott>
Date: Fri, 4 Jun 2010 18:01:04 +0000
Subject: [PATCH] sped up splitting of distance file by 2.5 times by buffering
 the reads and writes.

---
 clustersplitcommand.cpp |   2 +
 groupmap.cpp            |   4 +-
 splitmatrix.cpp         | 190 +++++++++++++++++++++++++++++++++++-----
 tree.cpp                |   6 +-
 treemap.cpp             |   4 +-
 5 files changed, 176 insertions(+), 30 deletions(-)
diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp
index d10a51f..861c4fb 100644
--- a/clustersplitcommand.cpp
+++ b/clustersplitcommand.cpp
@@ -625,6 +625,8 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
 			globaldata->setNameFile(thisNamefile);
 			globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
 			
+			m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
+			
 			ReadMatrix* read = new ReadColumnMatrix(thisDistFile); 	
 			read->setCutoff(cutoff);
 
diff --git a/groupmap.cpp b/groupmap.cpp
index 939cdb9..bc871c8 100644
--- a/groupmap.cpp
+++ b/groupmap.cpp
@@ -25,13 +25,13 @@
 int GroupMap::readMap() {
 		string seqName, seqGroup;
 		int error = 0;
-	
+
 		while(fileHandle){
 			fileHandle >> seqName;			//read from first column
 			fileHandle >> seqGroup;			//read from second column
 			
 			if (m->control_pressed) {  fileHandle.close();  return 1; }
-			
+	
 			setNamesOfGroups(seqGroup);
 			
 			it = groupmap.find(seqName);
diff --git a/splitmatrix.cpp b/splitmatrix.cpp
index 1c61f31..718c8a3 100644
--- a/splitmatrix.cpp
+++ b/splitmatrix.cpp
@@ -49,6 +49,13 @@ int SplitMatrix::splitDistance(){
 	try {
         
 		vector<set<string> > groups;
+		
+		//for buffering the io to improve speed
+		 //allow for 10 dists to be stored, then output.
+		vector<string> outputs;
+		vector<int> numOutputs;
+		vector<bool> wroteOutPut;
+		
 		int numGroups = 0;
 
 		ofstream outFile;
@@ -120,41 +127,149 @@ int SplitMatrix::splitDistance(){
 					string fileName = distFile + "." + toString(numGroups) + ".temp";
 					outFile.open(fileName.c_str(), ios::ate);
 
-					outFile << seqA << '\t' << seqB << '\t' << dist << endl;
+					string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
+					outputs.push_back(tempOut);
+					numOutputs.push_back(1);
+					wroteOutPut.push_back(false);
+					
 					numGroups++;
 				}
 				else{
 					string fileName = distFile + "." + toString(groupID) + ".temp";
+					
 					if(groupID != prevGroupID){
 						outFile.close();
 						outFile.open(fileName.c_str(), ios::app);
 						prevGroupID	= groupID;
 					}
-					outFile << seqA << '\t' << seqB << '\t' << dist << endl;
+					
+					//have we reached the max buffer size
+					if (numOutputs[groupID] > 10) { //write out sequence
+						outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
+						outputs[groupID] = "";
+						numOutputs[groupID] = 0;
+						wroteOutPut[groupID] = true;
+					}else {
+						outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
+						numOutputs[groupID]++;
+					}
 					
 					if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
 						string row, column, distance;
 						if(groupIDA<groupIDB){
-							string fileName = distFile + "." + toString(groupIDB) + ".temp";
-							ifstream fileB(fileName.c_str());
-							while(fileB){
-								fileB >> row >> column >> distance;
-								outFile << row << '\t' << column << '\t' << distance << endl;
-								gobble(fileB);
+			
+							numOutputs[groupID] += numOutputs[groupIDB];
+							outputs[groupID] += outputs[groupIDB];
+							
+							if (wroteOutPut[groupIDB]) {
+								string fileName = distFile + "." + toString(groupIDB) + ".temp";
+								ifstream fileB(fileName.c_str(), ios::ate);
+								
+								long size;
+								char* memblock;
+
+								size = fileB.tellg();
+				
+								fileB.seekg (0, ios::beg);
+								
+								int numRead = size / 1024;
+								int lastRead = size % 1024;
+
+								for (int i = 0; i < numRead; i++) {
+				
+									memblock = new char [1024];
+								
+									fileB.read (memblock, 1024);
+									
+									//memblock is raw bytes with no '\0' terminator, so copy by count
+									outFile.write(memblock, fileB.gcount());
+									
+									delete[] memblock;
+								}
+								
+								memblock = new char [lastRead];
+								
+								fileB.read (memblock, lastRead);
+								
+								//write exactly the bytes read; building a string from memblock over-reads
+								outFile.write(memblock, fileB.gcount());
+								
+								delete[] memblock;
+								
+								fileB.close();
+								remove(fileName.c_str());
+								
+								wroteOutPut[groupID] = true;
+								wroteOutPut[groupIDB] = false;
+							}
+							
+							if (numOutputs[groupID] != 0) {
+								outFile << outputs[groupID];
+								wroteOutPut[groupID] = true;
+								outputs[groupID] = "";
+								numOutputs[groupID] = 0;
+								
+								outputs[groupIDB] = "";
+								numOutputs[groupIDB] = 0;
 							}
-							fileB.close();
-							remove(fileName.c_str());
+							
 						}
 						else{
-							string fileName = distFile + "." + toString(groupIDA) + ".temp";
-							ifstream fileA(fileName.c_str());
-							while(fileA){
-								fileA >> row >> column >> distance;
-								outFile << row << '\t' << column << '\t' << distance << endl;
-								gobble(fileA);
+							numOutputs[groupID] += numOutputs[groupIDA];
+							outputs[groupID] += outputs[groupIDA];
+							
+							if (wroteOutPut[groupIDA]) {
+								string fileName = distFile + "." + toString(groupIDA) + ".temp";
+								ifstream fileB(fileName.c_str(), ios::ate);
+								
+								long size;
+								char* memblock;
+
+								size = fileB.tellg();
+								
+								fileB.seekg (0, ios::beg);
+								
+								int numRead = size / 1024;
+								int lastRead = size % 1024;
+
+								for (int i = 0; i < numRead; i++) {
+				
+									memblock = new char [1024];
+								
+									fileB.read (memblock, 1024);
+									//memblock is raw bytes with no '\0' terminator, so copy by count
+									outFile.write(memblock, fileB.gcount());
+									
+									delete[] memblock;
+								}
+								
+								memblock = new char [lastRead];
+								
+								fileB.read (memblock, lastRead);
+								
+								//write exactly the bytes read; building a string from memblock
+								//over-reads because the buffer is not null-terminated
+								outFile.write(memblock, fileB.gcount());
+									
+								delete[] memblock;
+								
+								fileB.close();
+								remove(fileName.c_str());
+								
+								wroteOutPut[groupID] = true;
+								wroteOutPut[groupIDA] = false;
+							}
+							
+							if (numOutputs[groupID] != 0) {
+								outFile << outputs[groupID];
+								wroteOutPut[groupID] = true;
+								outputs[groupID] = "";
+								numOutputs[groupID] = 0;
+								
+								outputs[groupIDA] = "";
+								numOutputs[groupIDA] = 0;
 							}
-							fileA.close();
-							remove(fileName.c_str());
+
 						}					
 					}
 				}
@@ -163,6 +278,15 @@ int SplitMatrix::splitDistance(){
 		}
 		outFile.close();
 		dFile.close();
+		
+		for (int i = 0; i < numGroups; i++) {
+			if (numOutputs[i] > 0) {
+				string fileName = distFile + "." + toString(i) + ".temp";
+				outFile.open(fileName.c_str(), ios::app);
+				outFile << outputs[i];
+				outFile.close();
+			}
+		}
 	
 		ifstream bigNameFile(namefile.c_str());
 		if(!bigNameFile){
@@ -188,7 +312,6 @@ int SplitMatrix::splitDistance(){
 				
 				for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
 					map<string,string>::iterator nIt = nameMap.find(*gIt);
-					
 					if (nIt != nameMap.end()) {
 						smallNameFile << nIt->first << '\t' << nIt->second << endl;
 						nameMap.erase(nIt);
@@ -293,6 +416,12 @@ int SplitMatrix::splitClassify(){
 			remove((distFile + "." + toString(i) + ".temp").c_str());
 		}
 		
+		
+		//for buffering the io to improve speed
+		 //allow for 10 dists to be stored, then output.
+		vector<string> outputs;  outputs.resize(numGroups, "");
+		vector<int> numOutputs;	 numOutputs.resize(numGroups, 0);	
+		
 		//for each distance
 		while(dFile){
 			string seqA, seqB;
@@ -308,17 +437,32 @@ int SplitMatrix::splitClassify(){
 			
 			if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
 				if (it->second == it2->second) { //they are from the same group so add the distance
-					openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
-					outFile << seqA << '\t' << seqB << '\t' << dist << endl;
-					outFile.close();
+					if (numOutputs[it->second] > 10) {
+						openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
+						outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
+						outFile.close();
+						outputs[it->second] = "";
+						numOutputs[it->second] = 0;
+					}else{
+						outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
+						numOutputs[it->second]++;
+					}
 				}
 			}
 		}
 		dFile.close();
 	
-		
 		for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
 			remove((namefile + "." + toString(i) + ".temp").c_str());
+			
+			//flush group i's leftover buffered distances (index by i, not the read loop's iterator)
+			if (numOutputs[i] > 0) {
+				openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
+				outFile << outputs[i];
+				outFile.close();
+				outputs[i] = "";
+				numOutputs[i] = 0;
+			}
 		}
 		
 		ifstream bigNameFile;
diff --git a/tree.cpp b/tree.cpp
index d6634bc..8e45981 100644
--- a/tree.cpp
+++ b/tree.cpp
@@ -714,9 +714,9 @@ void Tree::parseTreeFile() {
 		}
 		filehandle.close();
 		
-		for (int i = 0; i < globaldata->Treenames.size(); i++) {
-cout << globaldata->Treenames[i] << endl; }
-cout << "done" << endl;
+		//for (int i = 0; i < globaldata->Treenames.size(); i++) {
+//cout << globaldata->Treenames[i] << endl; }
+//cout << globaldata->Treenames.size() << endl;
 	}
 	catch(exception& e) {
 		m->errorOut(e, "Tree", "parseTreeFile");
diff --git a/treemap.cpp b/treemap.cpp
index bb62448..dcef7f7 100644
--- a/treemap.cpp
+++ b/treemap.cpp
@@ -26,10 +26,10 @@ void TreeMap::readMap() {
 		while(fileHandle){
 			fileHandle >> seqName;			//read from first column
 			fileHandle >> seqGroup;			//read from second column
-			
+
 			namesOfSeqs.push_back(seqName);
 			setNamesOfGroups(seqGroup);
-						
+					
 			treemap[seqName].groupname = seqGroup;	//store data in map
 			
 			it2 = seqsPerGroup.find(seqGroup);
-- 
2.39.5