From 4d7ffb38e091e9e7425d94e112551a3f90cacb0f Mon Sep 17 00:00:00 2001 From: westcott Date: Fri, 4 Jun 2010 18:01:04 +0000 Subject: [PATCH] sped up splitting of distance file by 2.5 times by buffering the read and writes. --- clustersplitcommand.cpp | 2 + groupmap.cpp | 4 +- splitmatrix.cpp | 190 +++++++++++++++++++++++++++++++++++----- tree.cpp | 6 +- treemap.cpp | 4 +- 5 files changed, 176 insertions(+), 30 deletions(-) diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index d10a51f..861c4fb 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -625,6 +625,8 @@ vector ClusterSplitCommand::cluster(vector< map > distNa globaldata->setNameFile(thisNamefile); globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column"); + m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine(); + ReadMatrix* read = new ReadColumnMatrix(thisDistFile); read->setCutoff(cutoff); diff --git a/groupmap.cpp b/groupmap.cpp index 939cdb9..bc871c8 100644 --- a/groupmap.cpp +++ b/groupmap.cpp @@ -25,13 +25,13 @@ int GroupMap::readMap() { string seqName, seqGroup; int error = 0; - + while(fileHandle){ fileHandle >> seqName; //read from first column fileHandle >> seqGroup; //read from second column if (m->control_pressed) { fileHandle.close(); return 1; } - + setNamesOfGroups(seqGroup); it = groupmap.find(seqName); diff --git a/splitmatrix.cpp b/splitmatrix.cpp index 1c61f31..718c8a3 100644 --- a/splitmatrix.cpp +++ b/splitmatrix.cpp @@ -49,6 +49,13 @@ int SplitMatrix::splitDistance(){ try { vector > groups; + + //for buffering the io to improve speed + //allow for 10 dists to be stored, then output. + vector outputs; + vector numOutputs; + vector wroteOutPut; + int numGroups = 0; ofstream outFile; @@ -120,41 +127,149 @@ int SplitMatrix::splitDistance(){ string fileName = distFile + "." + toString(numGroups) + ".temp"; outFile.open(fileName.c_str(), ios::ate); - outFile << seqA << '\t' << seqB << '\t' << dist << endl; + string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; + outputs.push_back(tempOut); + numOutputs.push_back(1); + wroteOutPut.push_back(false); + numGroups++; } else{ string fileName = distFile + "." + toString(groupID) + ".temp"; + if(groupID != prevGroupID){ outFile.close(); outFile.open(fileName.c_str(), ios::app); prevGroupID = groupID; } - outFile << seqA << '\t' << seqB << '\t' << dist << endl; + + //have we reached the max buffer size + if (numOutputs[groupID] > 10) { //write out sequence + outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl; + outputs[groupID] = ""; + numOutputs[groupID] = 0; + wroteOutPut[groupID] = true; + }else { + outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; + numOutputs[groupID]++; + } if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above string row, column, distance; if(groupIDA> row >> column >> distance; - outFile << row << '\t' << column << '\t' << distance << endl; - gobble(fileB); + + numOutputs[groupID] += numOutputs[groupIDB]; + outputs[groupID] += outputs[groupIDB]; + + if (wroteOutPut[groupIDB]) { + string fileName = distFile + "." + toString(groupIDB) + ".temp"; + ifstream fileB(fileName.c_str(), ios::ate); + + long size; + char* memblock; + + size = fileB.tellg(); + + fileB.seekg (0, ios::beg); + + int numRead = size / 1024; + int lastRead = size % 1024; + + for (int i = 0; i < numRead; i++) { + + memblock = new char [1024]; + + fileB.read (memblock, 1024); + + string temp = memblock; + outFile << temp.substr(0, 1024); + + delete memblock; + } + + memblock = new char [lastRead]; + + fileB.read (memblock, lastRead); + + //not sure why but it will read more than lastRead char...?? + string temp = memblock; + outFile << temp.substr(0, lastRead); + delete memblock; + + fileB.close(); + remove(fileName.c_str()); + + wroteOutPut[groupID] = true; + wroteOutPut[groupIDB] = false; + } + + if (numOutputs[groupID] != 0) { + outFile << outputs[groupID]; + wroteOutPut[groupID] = true; + outputs[groupID] = ""; + numOutputs[groupID] = 0; + + outputs[groupIDB] = ""; + numOutputs[groupIDB] = 0; } - fileB.close(); - remove(fileName.c_str()); + } else{ - string fileName = distFile + "." + toString(groupIDA) + ".temp"; - ifstream fileA(fileName.c_str()); - while(fileA){ - fileA >> row >> column >> distance; - outFile << row << '\t' << column << '\t' << distance << endl; - gobble(fileA); + numOutputs[groupID] += numOutputs[groupIDA]; + outputs[groupID] += outputs[groupIDA]; + + if (wroteOutPut[groupIDA]) { + string fileName = distFile + "." + toString(groupIDA) + ".temp"; + ifstream fileB(fileName.c_str(), ios::ate); + + long size; + char* memblock; + + size = fileB.tellg(); + + fileB.seekg (0, ios::beg); + + int numRead = size / 1024; + int lastRead = size % 1024; + + for (int i = 0; i < numRead; i++) { + + memblock = new char [1024]; + + fileB.read (memblock, 1024); + string temp = memblock; + outFile << temp.substr(0, 1024); + + delete memblock; + } + + memblock = new char [lastRead]; + + fileB.read (memblock, lastRead); + + //not sure why but it will read more than lastRead char...?? + string temp = memblock; + outFile << temp.substr(0, lastRead); + + delete memblock; + + fileB.close(); + remove(fileName.c_str()); + + wroteOutPut[groupID] = true; + wroteOutPut[groupIDA] = false; + } + + if (numOutputs[groupID] != 0) { + outFile << outputs[groupID]; + wroteOutPut[groupID] = true; + outputs[groupID] = ""; + numOutputs[groupID] = 0; + + outputs[groupIDA] = ""; + numOutputs[groupIDA] = 0; } - fileA.close(); - remove(fileName.c_str()); + } } } @@ -163,6 +278,15 @@ int SplitMatrix::splitDistance(){ } outFile.close(); dFile.close(); + + for (int i = 0; i < numGroups; i++) { + if (numOutputs[i] > 0) { + string fileName = distFile + "." + toString(i) + ".temp"; + outFile.open(fileName.c_str(), ios::app); + outFile << outputs[i]; + outFile.close(); + } + } ifstream bigNameFile(namefile.c_str()); if(!bigNameFile){ @@ -188,7 +312,6 @@ int SplitMatrix::splitDistance(){ for(set::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){ map::iterator nIt = nameMap.find(*gIt); - if (nIt != nameMap.end()) { smallNameFile << nIt->first << '\t' << nIt->second << endl; nameMap.erase(nIt); @@ -293,6 +416,12 @@ int SplitMatrix::splitClassify(){ remove((distFile + "." + toString(i) + ".temp").c_str()); } + + //for buffering the io to improve speed + //allow for 10 dists to be stored, then output. + vector outputs; outputs.resize(numGroups, ""); + vector numOutputs; numOutputs.resize(numGroups, 0); + //for each distance while(dFile){ string seqA, seqB; @@ -308,17 +437,32 @@ int SplitMatrix::splitClassify(){ if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons if (it->second == it2->second) { //they are from the same group so add the distance - openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile); - outFile << seqA << '\t' << seqB << '\t' << dist << endl; - outFile.close(); + if (numOutputs[it->second] > 10) { + openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile); + outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl; + outFile.close(); + outputs[it->second] = ""; + numOutputs[it->second] = 0; + }else{ + outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; + numOutputs[it->second]++; + } } } } dFile.close(); - for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case remove((namefile + "." + toString(i) + ".temp").c_str()); + + //write out any remaining buffers + if (numOutputs[it->second] > 0) { + openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile); + outFile << outputs[i]; + outFile.close(); + outputs[i] = ""; + numOutputs[i] = 0; + } } ifstream bigNameFile; diff --git a/tree.cpp b/tree.cpp index d6634bc..8e45981 100644 --- a/tree.cpp +++ b/tree.cpp @@ -714,9 +714,9 @@ void Tree::parseTreeFile() { } filehandle.close(); - for (int i = 0; i < globaldata->Treenames.size(); i++) { -cout << globaldata->Treenames[i] << endl; } -cout << "done" << endl; + //for (int i = 0; i < globaldata->Treenames.size(); i++) { +//cout << globaldata->Treenames[i] << endl; } +//cout << globaldata->Treenames.size() << endl; } catch(exception& e) { m->errorOut(e, "Tree", "parseTreeFile"); diff --git a/treemap.cpp b/treemap.cpp index bb62448..dcef7f7 100644 --- a/treemap.cpp +++ b/treemap.cpp @@ -26,10 +26,10 @@ void TreeMap::readMap() { while(fileHandle){ fileHandle >> seqName; //read from first column fileHandle >> seqGroup; //read from second column - + namesOfSeqs.push_back(seqName); setNamesOfGroups(seqGroup); - + treemap[seqName].groupname = seqGroup; //store data in map it2 = seqsPerGroup.find(seqGroup); -- 2.39.2