X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=hcluster.cpp;h=f8f48095b2ec55a8dce91d55881271f2f1837218;hp=259036f0f02a42beb95926f9ce59a4140164840e;hb=d1c97b8c04bb75faca1e76ffad60b37a4d789d3d;hpb=832d53a9dfac6b1795735eec643d8cf627b0d8e3 diff --git a/hcluster.cpp b/hcluster.cpp index 259036f..f8f4809 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -10,11 +10,11 @@ #include "hcluster.h" #include "rabundvector.hpp" #include "listvector.hpp" -#include "sparsematrix.hpp" /***********************************************************************/ -HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAssignment* n, float c) : rabund(rav), list(lv), method(m), distfile(d), nameMap(n), cutoff(c) { +HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) : rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) { try { + m = MothurOut::getInstance(); mapWanted = false; exitedBreak = false; numSeqs = list->getNumSeqs(); @@ -25,14 +25,14 @@ HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAs clusterArray.push_back(temp); } - if (method != "average") { - openInputFile(distfile, filehandle); + if ((method == "furthest") || (method == "nearest")) { + m->openInputFile(distfile, filehandle); }else{ processFile(); } } catch(exception& e) { - errorOut(e, "HCluster", "HCluster"); + m->errorOut(e, "HCluster", "HCluster"); exit(1); } } @@ -49,7 +49,7 @@ void HCluster::clusterBins(){ //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl; } catch(exception& e) { - errorOut(e, "HCluster", "clusterBins"); + m->errorOut(e, "HCluster", "clusterBins"); exit(1); } @@ -71,7 +71,7 @@ void HCluster::clusterNames(){ } catch(exception& e) { - errorOut(e, "HCluster", "clusterNames"); + m->errorOut(e, "HCluster", "clusterNames"); exit(1); } @@ -86,7 +86,7 @@ int HCluster::getUpmostParent(int node){ return node; } catch(exception& e) { - errorOut(e, "HCluster", "getUpmostParent"); + m->errorOut(e, "HCluster", "getUpmostParent"); exit(1); } } @@ -116,7 +116,7 @@ void HCluster::printInfo(){ } catch(exception& e) { - errorOut(e, "HCluster", "getUpmostParent"); + m->errorOut(e, "HCluster", "getUpmostParent"); exit(1); } } @@ -184,7 +184,7 @@ int HCluster::makeActive() { return linkValue; } catch(exception& e) { - errorOut(e, "HCluster", "makeActive"); + m->errorOut(e, "HCluster", "makeActive"); exit(1); } } @@ -253,12 +253,12 @@ void HCluster::updateArrayandLinkTable() { } } catch(exception& e) { - errorOut(e, "HCluster", "updateArrayandLinkTable"); + m->errorOut(e, "HCluster", "updateArrayandLinkTable"); exit(1); } } /***********************************************************************/ -bool HCluster::update(int row, int col, float distance){ +double HCluster::update(int row, int col, float distance){ try { bool cluster = false; smallRow = row; @@ -272,7 +272,7 @@ bool HCluster::update(int row, int col, float distance){ //you don't want to cluster with yourself if (smallRow != smallCol) { - if (method != "average") { + if ((method == "furthest") || (method == "nearest")) { //can we cluster??? if (method == "nearest") { cluster = true; } else{ //assume furthest @@ -295,18 +295,18 @@ bool HCluster::update(int row, int col, float distance){ } } - return cluster; + return cutoff; //printInfo(); } catch(exception& e) { - errorOut(e, "HCluster", "update"); + m->errorOut(e, "HCluster", "update"); exit(1); } } /***********************************************************************/ -void HCluster::setMapWanted(bool m) { +void HCluster::setMapWanted(bool ms) { try { - mapWanted = m; + mapWanted = ms; //initialize map for (int i = 0; i < list->getNumBins(); i++) { @@ -327,7 +327,7 @@ void HCluster::setMapWanted(bool m) { } catch(exception& e) { - errorOut(e, "HCluster", "setMapWanted"); + m->errorOut(e, "HCluster", "setMapWanted"); exit(1); } } @@ -348,7 +348,7 @@ try { seq2Bin[names] = clusterArray[smallCol].smallChild; } catch(exception& e) { - errorOut(e, "HCluster", "updateMap"); + m->errorOut(e, "HCluster", "updateMap"); exit(1); } } @@ -357,7 +357,7 @@ vector HCluster::getSeqs(){ try { vector sameSeqs; - if(method != "average") { + if ((method == "furthest") || (method == "nearest")) { sameSeqs = getSeqsFNNN(); }else{ sameSeqs = getSeqsAN(); @@ -366,7 +366,7 @@ vector HCluster::getSeqs(){ return sameSeqs; } catch(exception& e) { - errorOut(e, "HCluster", "getSeqs"); + m->errorOut(e, "HCluster", "getSeqs"); exit(1); } } @@ -388,15 +388,15 @@ vector HCluster::getSeqsFNNN(){ //get entry while (!filehandle.eof()) { - filehandle >> firstName >> secondName >> distance; gobble(filehandle); + filehandle >> firstName >> secondName >> distance; m->gobble(filehandle); //save first one if (prevDistance == -1) { prevDistance = distance; } map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } //using cutoff if (distance > cutoff) { break; } @@ -424,12 +424,11 @@ vector HCluster::getSeqsFNNN(){ return sameSeqs; } catch(exception& e) { - errorOut(e, "HCluster", "getSeqsFNNN"); + m->errorOut(e, "HCluster", "getSeqsFNNN"); exit(1); } } //********************************************************************************************************************** -//don't need cutoff since processFile removes all distance above cutoff and changes names to indexes vector HCluster::getSeqsAN(){ try { int firstName, secondName; @@ -437,7 +436,7 @@ vector HCluster::getSeqsAN(){ vector sameSeqs; prevDistance = -1; - openInputFile(distfile, filehandle, "no error"); + m->openInputFile(distfile, filehandle, "no error"); //is the smallest value in mergedMin or the distfile? float mergedMinDist = 10000; @@ -445,13 +444,13 @@ vector HCluster::getSeqsAN(){ if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist; } if (!filehandle.eof()) { - filehandle >> firstName >> secondName >> distance; gobble(filehandle); + filehandle >> firstName >> secondName >> distance; m->gobble(filehandle); //save first one if (prevDistance == -1) { prevDistance = distance; } if (distance != -1) { //-1 means skip me seqDist temp(firstName, secondName, distance); sameSeqs.push_back(temp); - } + }else{ distance = 10000; } } if (mergedMinDist < distance) { //get minimum distance from mergedMin @@ -468,7 +467,7 @@ vector HCluster::getSeqsAN(){ //get entry while (!filehandle.eof()) { - filehandle >> firstName >> secondName >> distance; gobble(filehandle); + filehandle >> firstName >> secondName >> distance; m->gobble(filehandle); if (prevDistance == -1) { prevDistance = distance; } @@ -495,26 +494,29 @@ vector HCluster::getSeqsAN(){ return temp; } catch(exception& e) { - errorOut(e, "HCluster", "getSeqsAN"); + m->errorOut(e, "HCluster", "getSeqsAN"); exit(1); } } /***********************************************************************/ -void HCluster::combineFile() { +int HCluster::combineFile() { try { - int bufferSize = 64000; //512k - this should be a variable that the user can set to optimize code to their hardware - char* inputBuffer; - inputBuffer = new char[bufferSize]; - size_t numRead; + //int bufferSize = 64000; //512k - this should be a variable that the user can set to optimize code to their hardware + //char* inputBuffer; + //inputBuffer = new char[bufferSize]; + //size_t numRead; string tempDistFile = distfile + ".temp"; ofstream out; - openOutputFile(tempDistFile, out); + m->openOutputFile(tempDistFile, out); - FILE* in; - in = fopen(distfile.c_str(), "rb"); + //FILE* in; + //in = fopen(distfile.c_str(), "rb"); + ifstream in; + m->openInputFile(distfile, in, "no error"); + int first, second; float dist; @@ -524,28 +526,32 @@ void HCluster::combineFile() { //go through file pulling out distances related to rows merging //if mergedMin contains distances add those back into file - bool done = false; - partialDist = ""; - while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) { + //bool done = false; + //partialDist = ""; + //while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) { //cout << "number of char read = " << numRead << endl; //cout << inputBuffer << endl; - if (numRead < bufferSize) { done = true; } + //if (numRead < bufferSize) { done = true; } //parse input into individual distances - int spot = 0; - string outputString = ""; - while(spot < numRead) { + //int spot = 0; + //string outputString = ""; + //while(spot < numRead) { //cout << "spot = " << spot << endl; - seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize); + // seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize); //you read a partial distance - if (nextDist.seq1 == -1) { break; } - - first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist; + // if (nextDist.seq1 == -1) { break; } + while (!in.eof()) { + //first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist; //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl; //since file is sorted and mergedMin is sorted //you can put the smallest distance from each through the code below and keep the file sorted + in >> first >> second >> dist; m->gobble(in); + + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempDistFile); return 0; } + //while there are still values in mergedMin that are smaller than the distance read from file while (count < mergedMin.size()) { @@ -563,7 +569,10 @@ void HCluster::combineFile() { }else if (mergedMin[count].seq2 == smallRow) { smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist; }else { //if no, write to temp file - outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n'; + //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n'; + //if (mergedMin[count].dist < cutoff) { + out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl; + //} } count++; }else{ break; } @@ -582,14 +591,18 @@ void HCluster::combineFile() { smallRowColValues[0][first] = dist; }else { //if no, write to temp file - outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n'; + //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n'; + //if (dist < cutoff) { + out << first << '\t' << second << '\t' << dist << endl; + //} } } - out << outputString; - if(done) { break; } - } - fclose(in); + //out << outputString; + //if(done) { break; } + //} + //fclose(in); + in.close(); //if values in mergedMin are larger than the the largest in file then while (count < mergedMin.size()) { @@ -606,7 +619,9 @@ void HCluster::combineFile() { smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist; }else { //if no, write to temp file - out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl; + //if (mergedMin[count].dist < cutoff) { + out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl; + //} } count++; } @@ -614,9 +629,10 @@ void HCluster::combineFile() { mergedMin.clear(); //rename tempfile to distfile - remove(distfile.c_str()); + m->mothurRemove(distfile); rename(tempDistFile.c_str(), distfile.c_str()); - +//cout << "remove = "<< renameOK << " rename = " << ok << endl; + //merge clustered rows averaging the distances map::iterator itMerge; map::iterator it2Merge; @@ -626,25 +642,40 @@ void HCluster::combineFile() { float average; if (it2Merge != smallRowColValues[1].end()) { //if yes, then average - //weighted average - int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq; - average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total; + //average + if (method == "average") { + int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq; + average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total; + }else { //weighted + average = ((itMerge->second * 1.0) + (it2Merge->second * 1.0)) / (float) 2.0; + } + smallRowColValues[1].erase(it2Merge); seqDist temp(clusterArray[smallRow].parent, itMerge->first, average); mergedMin.push_back(temp); + }else { + //can't find value so update cutoff + if (cutoff > itMerge->second) { cutoff = itMerge->second; } } } - + + //update cutoff + for(itMerge = smallRowColValues[1].begin(); itMerge != smallRowColValues[1].end(); itMerge++) { + if (cutoff > itMerge->second) { cutoff = itMerge->second; } + } + //sort merged values sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance); + + return 0; } catch(exception& e) { - errorOut(e, "HCluster", "combineFile"); + m->errorOut(e, "HCluster", "combineFile"); exit(1); } } -/***********************************************************************/ +/*********************************************************************** seqDist HCluster::getNextDist(char* buffer, int& index, int size){ try { seqDist next; @@ -674,7 +705,7 @@ seqDist HCluster::getNextDist(char* buffer, int& index, int size){ if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows gotDist = true; - //gobble space + //m->gobble space while (index < size) { if (isspace(buffer[index])) { index++; } else { break; } @@ -716,32 +747,33 @@ seqDist HCluster::getNextDist(char* buffer, int& index, int size){ return next; } catch(exception& e) { - errorOut(e, "HCluster", "getNextDist"); + m->errorOut(e, "HCluster", "getNextDist"); exit(1); } } -/***********************************************************************/ -void HCluster::processFile() { +***********************************************************************/ +int HCluster::processFile() { try { string firstName, secondName; float distance; ifstream in; - openInputFile(distfile, in); + m->openInputFile(distfile, in, "no error"); ofstream out; string outTemp = distfile + ".temp"; - openOutputFile(outTemp, out); + m->openOutputFile(outTemp, out); //get entry while (!in.eof()) { + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outTemp); return 0; } - in >> firstName >> secondName >> distance; gobble(in); + in >> firstName >> secondName >> distance; m->gobble(in); map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } //using cutoff if (distance > cutoff) { break; } @@ -754,11 +786,13 @@ void HCluster::processFile() { in.close(); out.close(); - remove(distfile.c_str()); + m->mothurRemove(distfile); rename(outTemp.c_str(), distfile.c_str()); + + return 0; } catch(exception& e) { - errorOut(e, "HCluster", "processFile"); + m->errorOut(e, "HCluster", "processFile"); exit(1); } }