X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=hcluster.cpp;h=f8f48095b2ec55a8dce91d55881271f2f1837218;hp=07deaa5991c94627fa2e02a88e3b10b2729e290e;hb=d1c97b8c04bb75faca1e76ffad60b37a4d789d3d;hpb=aa9238c0a9e6e7aa0ed8b8b606b08ad4fd7dcfe3 diff --git a/hcluster.cpp b/hcluster.cpp index 07deaa5..f8f4809 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -10,7 +10,6 @@ #include "hcluster.h" #include "rabundvector.hpp" #include "listvector.hpp" -#include "sparsematrix.hpp" /***********************************************************************/ HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) : rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) { @@ -26,8 +25,8 @@ HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameA clusterArray.push_back(temp); } - if (method != "average") { - openInputFile(distfile, filehandle); + if ((method == "furthest") || (method == "nearest")) { + m->openInputFile(distfile, filehandle); }else{ processFile(); } @@ -259,7 +258,7 @@ void HCluster::updateArrayandLinkTable() { } } /***********************************************************************/ -bool HCluster::update(int row, int col, float distance){ +double HCluster::update(int row, int col, float distance){ try { bool cluster = false; smallRow = row; @@ -273,7 +272,7 @@ bool HCluster::update(int row, int col, float distance){ //you don't want to cluster with yourself if (smallRow != smallCol) { - if (method != "average") { + if ((method == "furthest") || (method == "nearest")) { //can we cluster??? if (method == "nearest") { cluster = true; } else{ //assume furthest @@ -296,7 +295,7 @@ bool HCluster::update(int row, int col, float distance){ } } - return cluster; + return cutoff; //printInfo(); } catch(exception& e) { @@ -358,7 +357,7 @@ vector HCluster::getSeqs(){ try { vector sameSeqs; - if(method != "average") { + if ((method == "furthest") || (method == "nearest")) { sameSeqs = getSeqsFNNN(); }else{ sameSeqs = getSeqsAN(); @@ -389,15 +388,15 @@ vector HCluster::getSeqsFNNN(){ //get entry while (!filehandle.eof()) { - filehandle >> firstName >> secondName >> distance; gobble(filehandle); + filehandle >> firstName >> secondName >> distance; m->gobble(filehandle); //save first one if (prevDistance == -1) { prevDistance = distance; } map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } //using cutoff if (distance > cutoff) { break; } @@ -430,7 +429,6 @@ vector HCluster::getSeqsFNNN(){ } } //********************************************************************************************************************** -//don't need cutoff since processFile removes all distance above cutoff and changes names to indexes vector HCluster::getSeqsAN(){ try { int firstName, secondName; @@ -438,7 +436,7 @@ vector HCluster::getSeqsAN(){ vector sameSeqs; prevDistance = -1; - openInputFile(distfile, filehandle, "no error"); + m->openInputFile(distfile, filehandle, "no error"); //is the smallest value in mergedMin or the distfile? float mergedMinDist = 10000; @@ -446,13 +444,13 @@ vector HCluster::getSeqsAN(){ if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist; } if (!filehandle.eof()) { - filehandle >> firstName >> secondName >> distance; gobble(filehandle); + filehandle >> firstName >> secondName >> distance; m->gobble(filehandle); //save first one if (prevDistance == -1) { prevDistance = distance; } if (distance != -1) { //-1 means skip me seqDist temp(firstName, secondName, distance); sameSeqs.push_back(temp); - } + }else{ distance = 10000; } } if (mergedMinDist < distance) { //get minimum distance from mergedMin @@ -469,7 +467,7 @@ vector HCluster::getSeqsAN(){ //get entry while (!filehandle.eof()) { - filehandle >> firstName >> secondName >> distance; gobble(filehandle); + filehandle >> firstName >> secondName >> distance; m->gobble(filehandle); if (prevDistance == -1) { prevDistance = distance; } @@ -511,13 +509,13 @@ int HCluster::combineFile() { string tempDistFile = distfile + ".temp"; ofstream out; - openOutputFile(tempDistFile, out); + m->openOutputFile(tempDistFile, out); //FILE* in; //in = fopen(distfile.c_str(), "rb"); ifstream in; - openInputFile(distfile, in); + m->openInputFile(distfile, in, "no error"); int first, second; float dist; @@ -550,9 +548,9 @@ int HCluster::combineFile() { //since file is sorted and mergedMin is sorted //you can put the smallest distance from each through the code below and keep the file sorted - in >> first >> second >> dist; gobble(in); + in >> first >> second >> dist; m->gobble(in); - if (m->control_pressed) { in.close(); out.close(); remove(tempDistFile.c_str()); return 0; } + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempDistFile); return 0; } //while there are still values in mergedMin that are smaller than the distance read from file while (count < mergedMin.size()) { @@ -572,7 +570,9 @@ int HCluster::combineFile() { smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist; }else { //if no, write to temp file //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n'; - out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl; + //if (mergedMin[count].dist < cutoff) { + out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl; + //} } count++; }else{ break; } @@ -592,7 +592,9 @@ int HCluster::combineFile() { }else { //if no, write to temp file //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n'; - out << first << '\t' << second << '\t' << dist << endl; + //if (dist < cutoff) { + out << first << '\t' << second << '\t' << dist << endl; + //} } } @@ -617,7 +619,9 @@ int HCluster::combineFile() { smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist; }else { //if no, write to temp file - out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl; + //if (mergedMin[count].dist < cutoff) { + out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl; + //} } count++; } @@ -625,7 +629,7 @@ int HCluster::combineFile() { mergedMin.clear(); //rename tempfile to distfile - remove(distfile.c_str()); + m->mothurRemove(distfile); rename(tempDistFile.c_str(), distfile.c_str()); //cout << "remove = "<< renameOK << " rename = " << ok << endl; @@ -638,16 +642,29 @@ int HCluster::combineFile() { float average; if (it2Merge != smallRowColValues[1].end()) { //if yes, then average - //weighted average - int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq; - average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total; + //average + if (method == "average") { + int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq; + average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total; + }else { //weighted + average = ((itMerge->second * 1.0) + (it2Merge->second * 1.0)) / (float) 2.0; + } + smallRowColValues[1].erase(it2Merge); seqDist temp(clusterArray[smallRow].parent, itMerge->first, average); mergedMin.push_back(temp); + }else { + //can't find value so update cutoff + if (cutoff > itMerge->second) { cutoff = itMerge->second; } } } - + + //update cutoff + for(itMerge = smallRowColValues[1].begin(); itMerge != smallRowColValues[1].end(); itMerge++) { + if (cutoff > itMerge->second) { cutoff = itMerge->second; } + } + //sort merged values sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance); @@ -688,7 +705,7 @@ seqDist HCluster::getNextDist(char* buffer, int& index, int size){ if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows gotDist = true; - //gobble space + //m->gobble space while (index < size) { if (isspace(buffer[index])) { index++; } else { break; } @@ -734,29 +751,29 @@ seqDist HCluster::getNextDist(char* buffer, int& index, int size){ exit(1); } } -/***********************************************************************/ +***********************************************************************/ int HCluster::processFile() { try { string firstName, secondName; float distance; ifstream in; - openInputFile(distfile, in); + m->openInputFile(distfile, in, "no error"); ofstream out; string outTemp = distfile + ".temp"; - openOutputFile(outTemp, out); + m->openOutputFile(outTemp, out); //get entry while (!in.eof()) { - if (m->control_pressed) { in.close(); out.close(); remove(outTemp.c_str()); return 0; } + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outTemp); return 0; } - in >> firstName >> secondName >> distance; gobble(in); + in >> firstName >> secondName >> distance; m->gobble(in); map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } //using cutoff if (distance > cutoff) { break; } @@ -769,7 +786,7 @@ int HCluster::processFile() { in.close(); out.close(); - remove(distfile.c_str()); + m->mothurRemove(distfile); rename(outTemp.c_str(), distfile.c_str()); return 0;