From: westcott Date: Tue, 2 Nov 2010 11:49:10 +0000 (+0000) Subject: fixed cluster.classic and added weighted method to hcluster X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=26b30b0881a37665b18746d2851607c494e8ccc0 fixed cluster.classic and added weighted method to hcluster --- diff --git a/clusterclassic.cpp b/clusterclassic.cpp index 7a701aa..d0a63b1 100644 --- a/clusterclassic.cpp +++ b/clusterclassic.cpp @@ -17,6 +17,7 @@ ClusterClassic::ClusterClassic(float c, string f) : method(f), smallDist(1e6), n //save so you can modify as it changes in average neighbor cutoff = c; + aboveCutoff = cutoff + 1.0; m = MothurOut::getInstance(); globaldata = GlobalData::getInstance(); } @@ -51,10 +52,10 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) { //initialize distance matrix to cutoff dMatrix.resize(nseqs); - colDist temp(0, 0, cutoff); - rowSmallDists.resize(nseqs, temp); + //colDist temp(0, 0, aboveCutoff); + //rowSmallDists.resize(nseqs, temp); for (int i = 1; i < nseqs; i++) { - dMatrix[i].resize(i, cutoff); + dMatrix[i].resize(i, aboveCutoff); } @@ -106,8 +107,8 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) { if(distance < cutoff){ dMatrix[i][j] = distance; if (distance < smallDist) { smallDist = distance; } - if (rowSmallDists[i].dist > distance) { rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; } - if (rowSmallDists[j].dist > distance) { rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; } + //if (rowSmallDists[i].dist > distance) { rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; } + //if (rowSmallDists[j].dist > distance) { rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; } } index++; reading->update(index); @@ -134,8 +135,8 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) { if (row < col) { dMatrix[col][row] = distance; } else { dMatrix[row][col] = distance; } - if (rowSmallDists[row].dist > distance) { rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; } - if (rowSmallDists[col].dist > distance) { rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; } + //if (rowSmallDists[row].dist > distance) { rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; } + //if (rowSmallDists[col].dist > distance) { rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; } } index++; reading->update(index); @@ -167,8 +168,8 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) { if (distance < smallDist) { smallDist = distance; } dMatrix[i][j] = distance; - if (rowSmallDists[i].dist > distance) { rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; } - if (rowSmallDists[j].dist > distance) { rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; } + //if (rowSmallDists[i].dist > distance) { rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; } + //if (rowSmallDists[j].dist > distance) { rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; } } index++; reading->update(index); @@ -195,8 +196,8 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) { if (row < col) { dMatrix[col][row] = distance; } else { dMatrix[row][col] = distance; } - if (rowSmallDists[row].dist > distance) { rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; } - if (rowSmallDists[col].dist > distance) { rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; } + //if (rowSmallDists[row].dist > distance) { rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; } + //if (rowSmallDists[col].dist > distance) { rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; } } index++; reading->update(index); @@ -226,20 +227,24 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) { double ClusterClassic::getSmallCell() { try { - smallDist = cutoff; + smallDist = aboveCutoff; smallRow = 1; smallCol = 0; vector mins; - for(int i=0;iget(smallRow) << '\t' << rabund->get(smallCol); - rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol)); - rabund->set(smallRow, 0); + rabund->set(smallRow, rabund->get(smallRow)+rabund->get(smallCol)); + rabund->set(smallCol, 0); rabund->setLabel(toString(smallDist)); // cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl; @@ -289,8 +294,8 @@ void ClusterClassic::clusterNames(){ // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol); if (mapWanted) { updateMap(); } - list->set(smallCol, list->get(smallRow)+','+list->get(smallCol)); - list->set(smallRow, ""); + list->set(smallRow, list->get(smallRow)+','+list->get(smallCol)); + list->set(smallCol, ""); list->setLabel(toString(smallDist)); // cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl; @@ -308,20 +313,21 @@ void ClusterClassic::update(double& cutOFF){ getSmallCell(); int r, c; + r = smallRow; c = smallCol; //because we only store lt, we need to make sure we grab the right location - if (smallRow < smallCol) { c = smallRow; r = smallCol; } //smallRow is really our column value - else { r = smallRow; c = smallCol; } //smallRow is the row value + //if (smallRow < smallCol) { c = smallRow; r = smallCol; } //smallRow is really our column value + //else { r = smallRow; c = smallCol; } //smallRow is the row value //reset rows smallest distance - rowSmallDists[r].dist = cutoff; rowSmallDists[r].row = 0; rowSmallDists[r].col = 0; - rowSmallDists[c].dist = cutoff; rowSmallDists[c].row = 0; rowSmallDists[c].col = 0; + //rowSmallDists[r].dist = aboveCutoff; rowSmallDists[r].row = 0; rowSmallDists[r].col = 0; + //rowSmallDists[c].dist = aboveCutoff; rowSmallDists[c].row = 0; rowSmallDists[c].col = 0; //if your rows smallest distance is from smallRow or smallCol, reset - for(int i=0;i r) { distRow = dMatrix[i][r]; } else { distRow = dMatrix[r][i]; } - if (i > c) { distCol = dMatrix[i][c]; dMatrix[i][c] = cutoff; } //like removeCell - else { distCol = dMatrix[c][i]; dMatrix[c][i] = cutoff; } + if (i > c) { distCol = dMatrix[i][c]; dMatrix[i][c] = aboveCutoff; } //like removeCell + else { distCol = dMatrix[c][i]; dMatrix[c][i] = aboveCutoff; } if(method == "furthest"){ newDist = max(distRow, distCol); } else if (method == "average"){ - if ((distRow == cutoff) && (distCol == cutoff)) { //you are merging with a value above cutoff - newDist = cutoff; //eliminate value - }else if ((distRow == cutoff) && (distCol != cutoff)) { //you are merging with a value above cutoff - newDist = cutoff; //eliminate value + if ((distRow == aboveCutoff) && (distCol == aboveCutoff)) { //you are merging with a value above cutoff + newDist = aboveCutoff; //eliminate value + }else if ((distRow == aboveCutoff) && (distCol != aboveCutoff)) { //you are merging with a value above cutoff + newDist = aboveCutoff; //eliminate value if (cutOFF > distCol) { cutOFF = distCol; } - }else if ((distRow != cutoff) && (distCol == cutoff)) { //you are merging with a value above cutoff - newDist = cutoff; //eliminate value + }else if ((distRow != aboveCutoff) && (distCol == aboveCutoff)) { //you are merging with a value above cutoff + newDist = aboveCutoff; //eliminate value if (cutOFF > distRow) { cutOFF = distRow; } }else { int rowBin = rabund->get(r); @@ -351,13 +357,13 @@ void ClusterClassic::update(double& cutOFF){ } } else if (method == "weighted"){ - if ((distRow == cutoff) && (distCol == cutoff)) { //you are merging with a value above cutoff - newDist = cutoff; //eliminate value - }else if ((distRow == cutoff) && (distCol != cutoff)) { //you are merging with a value above cutoff - newDist = cutoff; //eliminate value + if ((distRow == aboveCutoff) && (distCol == aboveCutoff)) { //you are merging with a value above cutoff + newDist = aboveCutoff; //eliminate value + }else if ((distRow == aboveCutoff) && (distCol != aboveCutoff)) { //you are merging with a value above cutoff + newDist = aboveCutoff; //eliminate value if (cutOFF > distCol) { cutOFF = distCol; } - }else if ((distRow != cutoff) && (distCol == cutoff)) { //you are merging with a value above cutoff - newDist = cutoff; //eliminate value + }else if ((distRow != aboveCutoff) && (distCol == aboveCutoff)) { //you are merging with a value above cutoff + newDist = aboveCutoff; //eliminate value if (cutOFF > distRow) { cutOFF = distRow; } }else { newDist = (distCol + distRow) / 2.0; @@ -370,7 +376,7 @@ void ClusterClassic::update(double& cutOFF){ if (i > r) { dMatrix[i][r] = newDist; } else { dMatrix[r][i] = newDist; } - if (newDist < rowSmallDists[i].dist) { rowSmallDists[i].dist = newDist; rowSmallDists[i].col = r; rowSmallDists[i].row = i; } + //if (newDist < rowSmallDists[i].dist) { rowSmallDists[i].dist = newDist; rowSmallDists[i].col = r; rowSmallDists[i].row = i; } } //cout << "rowsmall = " << i << '\t' << rowSmallDists[i].dist << endl; } @@ -379,23 +385,23 @@ void ClusterClassic::update(double& cutOFF){ clusterNames(); //find new small for 2 rows we just merged - colDist temp(0,0,100.0); - rowSmallDists[r] = temp; + //colDist temp(0,0,100.0); + //rowSmallDists[r] = temp; - for (int i = 0; i < dMatrix[r].size(); i++) { - if (dMatrix[r][i] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[r][i]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; } - } - for (int i = dMatrix[r].size()+1; i < dMatrix.size(); i++) { - if (dMatrix[i][dMatrix[r].size()] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[i][dMatrix[r].size()]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; } - } + //for (int i = 0; i < dMatrix[r].size(); i++) { + // if (dMatrix[r][i] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[r][i]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; } + //} + //for (int i = dMatrix[r].size()+1; i < dMatrix.size(); i++) { + // if (dMatrix[i][dMatrix[r].size()] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[i][dMatrix[r].size()]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; } + //} - rowSmallDists[c] = temp; - for (int i = 0; i < dMatrix[c].size(); i++) { - if (dMatrix[c][i] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[c][i]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; } - } - for (int i = dMatrix[c].size()+1; i < dMatrix.size(); i++) { - if (dMatrix[i][dMatrix[c].size()] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[i][dMatrix[c].size()]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; } - } + //rowSmallDists[c] = temp; + //for (int i = 0; i < dMatrix[c].size(); i++) { + // if (dMatrix[c][i] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[c][i]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; } + //} + //for (int i = dMatrix[c].size()+1; i < dMatrix.size(); i++) { + // if (dMatrix[i][dMatrix[c].size()] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[i][dMatrix[c].size()]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; } + //} //cout << "after update " << endl; //print(); diff --git a/clusterclassic.h b/clusterclassic.h index 37d9b1b..9a896f2 100644 --- a/clusterclassic.h +++ b/clusterclassic.h @@ -43,19 +43,19 @@ private: int col; int row; double dist; - colDist(int i, int r, double d) : row(r), col(i), dist(d) {} + colDist(int r, int c, double d) : row(r), col(c), dist(d) {} }; RAbundVector* rabund; ListVector* list; vector< vector > dMatrix; - vector rowSmallDists; + //vector rowSmallDists; int smallRow; int smallCol, nseqs; double smallDist; bool mapWanted; - float cutoff; + double cutoff, aboveCutoff; map seq2Bin; string method; diff --git a/clusterdoturcommand.cpp b/clusterdoturcommand.cpp index f403aa7..21840e4 100644 --- a/clusterdoturcommand.cpp +++ b/clusterdoturcommand.cpp @@ -227,8 +227,7 @@ int ClusterDoturCommand::execute(){ int estart = time(NULL); - while (cluster->getSmallDist() < cutoff && cluster->getNSeqs() > 0){ - + while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 0)){ if (m->control_pressed) { delete cluster; delete list; delete rabund; sabundFile.close();rabundFile.close();listFile.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear(); return 0; } cluster->update(cutoff); diff --git a/hcluster.cpp b/hcluster.cpp index 597aac8..88cba6e 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -26,7 +26,7 @@ HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameA clusterArray.push_back(temp); } - if (method != "average") { + if ((method == "furthest") || (method == "nearest")) { m->openInputFile(distfile, filehandle); }else{ processFile(); @@ -273,7 +273,7 @@ bool HCluster::update(int row, int col, float distance){ //you don't want to cluster with yourself if (smallRow != smallCol) { - if (method != "average") { + if ((method == "furthest") || (method == "nearest")) { //can we cluster??? if (method == "nearest") { cluster = true; } else{ //assume furthest @@ -358,7 +358,7 @@ vector HCluster::getSeqs(){ try { vector sameSeqs; - if(method != "average") { + if ((method == "furthest") || (method == "nearest")) { sameSeqs = getSeqsFNNN(); }else{ sameSeqs = getSeqsAN(); @@ -638,9 +638,14 @@ int HCluster::combineFile() { float average; if (it2Merge != smallRowColValues[1].end()) { //if yes, then average - //weighted average - int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq; - average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total; + //average + if (method == "average") { + int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq; + average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total; + }else { //weighted + average = ((itMerge->second * 1.0) + (it2Merge->second * 1.0)) / (float) 2.0; + } + smallRowColValues[1].erase(it2Merge); seqDist temp(clusterArray[smallRow].parent, itMerge->first, average); diff --git a/hclustercommand.cpp b/hclustercommand.cpp index 5c08320..69dbef6 100644 --- a/hclustercommand.cpp +++ b/hclustercommand.cpp @@ -171,8 +171,8 @@ HClusterCommand::HClusterCommand(string option) { method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; } - if ((method == "furthest") || (method == "nearest") || (method == "average")) { } - else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; } + if ((method == "furthest") || (method == "nearest") || (method == "average") || (method == "weighted")) { } + else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest, average or weighted."); m->mothurOutEndLine(); abort = true; } showabund = validParameter.validFile(parameters, "showabund", false); if (showabund == "not found") { showabund = "T"; } @@ -192,6 +192,7 @@ HClusterCommand::HClusterCommand(string option) { if (method == "furthest") { tag = "fn"; } else if (method == "nearest") { tag = "nn"; } + else if (method == "weighted") { tag = "wn"; } else { tag = "an"; } m->openOutputFile(fileroot+ tag + ".sabund", sabundFile); @@ -219,7 +220,7 @@ void HClusterCommand::help(){ m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n"); m->mothurOut("The hcluster command should be in the following format: \n"); m->mothurOut("hcluster(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n"); - m->mothurOut("The acceptable hcluster methods are furthest, nearest and average.\n\n"); + m->mothurOut("The acceptable hcluster methods are furthest, nearest, weighted and average.\n\n"); } catch(exception& e) { m->errorOut(e, "HClusterCommand", "help"); diff --git a/mothur b/mothur index ba4b3d6..197583b 100755 Binary files a/mothur and b/mothur differ