X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=cluster.cpp;h=6b69e4d5312fcce5db6c084fd42499b64724fb5e;hp=4009ed075e63c78e4fb661352c675cff3209e030;hb=cf9987b67aa49777a4c91c2d21f96e58bf17aa82;hpb=9c23307c583d4e8595f75278c13e708788f2f058 diff --git a/cluster.cpp b/cluster.cpp index 4009ed0..6b69e4d 100644 --- a/cluster.cpp +++ b/cluster.cpp @@ -10,284 +10,190 @@ #include "cluster.hpp" #include "rabundvector.hpp" #include "listvector.hpp" -#include "sparsematrix.hpp" /***********************************************************************/ -Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c) : -rabund(rav), list(lv), dMatrix(dm) +Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string f, float cs) : +rabund(rav), list(lv), dMatrix(dm), method(f), adjust(cs) { -/* - cout << "sizeof(MatData): " << sizeof(MatData) << endl; - cout << "sizeof(PCell*): " << sizeof(PCell*) << endl; - - int nCells = dMatrix->getNNodes(); - time_t start = time(NULL); - - MatVec matvec = MatVec(nCells); - int i = 0; - for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) { - matvec[i++] = currentCell; - } - for (i= matvec.size();i>0;i--) { - dMatrix->rmCell(matvec[i-1]); - } - MatData it = dMatrix->begin(); - while (it != dMatrix->end()) { - it = dMatrix->rmCell(it); - } - cout << "Time to remove " << nCells << " cells: " << time(NULL) - start << " seconds" << endl; - exit(0); - MatData it = dMatrix->begin(); - cout << it->row << "/" << it->column << "/" << it->dist << endl; - dMatrix->rmCell(dMatrix->begin()); - cout << it->row << "/" << it->column << "/" << it->dist << endl; - exit(0); -*/ - - // Create a data structure to quickly access the PCell information - // for a certain sequence. It consists of a vector of lists, where - // a list contains pointers (iterators) to the all distances related - // to a certain sequence. The Vector is accessed via the index of a - // sequence in the distance matrix. - seqVec = vector(lv->size()); - for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) { - seqVec[currentCell->row].push_back(currentCell); - seqVec[currentCell->column].push_back(currentCell); - } - mapWanted = false; //set to true by mgcluster to speed up overlap merge - - //save so you can modify as it changes in average neighbor - cutoff = c; -} - -/***********************************************************************/ - -void Cluster::getRowColCells() { try { - PCell* smallCell = dMatrix->getSmallestCell(); //find the smallest cell - this routine should probably not be in the SpMat class - - smallRow = smallCell->row; // get its row - smallCol = smallCell->column; // get its column - smallDist = smallCell->dist; // get the smallest distance - - rowCells = seqVec[smallRow]; // all distances related to the row index - colCells = seqVec[smallCol]; // all distances related to the column index - nRowCells = rowCells.size(); - nColCells = colCells.size(); + + mapWanted = false; //set to true by mgcluster to speed up overlap merge + + //save so you can modify as it changes in average neighbor + cutoff = c; + m = MothurOut::getInstance(); } catch(exception& e) { - errorOut(e, "Cluster", "getRowColCells"); + m->errorOut(e, "Cluster", "Cluster"); exit(1); } - } /***********************************************************************/ -// Remove the specified cell from the seqVec and from the sparse -// matrix -void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix) -{ - ull drow = cell->row; - ull dcol = cell->column; - if (((vrow >=0) && (drow != smallRow)) || - ((vcol >=0) && (dcol != smallCol))) { - ull dtemp = drow; - drow = dcol; - dcol = dtemp; - } - - ull crow; - ull ccol; - int nCells; - if (vrow < 0) { - nCells = seqVec[drow].size(); - for (vrow=0; vrowrow; - ccol = seqVec[drow][vrow]->column; - if (((crow == drow) && (ccol == dcol)) || - ((ccol == drow) && (crow == dcol))) { - break; - } - } - } - seqVec[drow].erase(seqVec[drow].begin()+vrow); - if (vcol < 0) { - nCells = seqVec[dcol].size(); - for (vcol=0; vcolrow; - ccol = seqVec[dcol][vcol]->column; - if (((crow == drow) && (ccol == dcol)) || - ((ccol == drow) && (crow == dcol))) { - break; - } - } - } - seqVec[dcol].erase(seqVec[dcol].begin()+vcol); - if (rmMatrix) { - dMatrix->rmCell(cell); - } -} - - -/***********************************************************************/ - void Cluster::clusterBins(){ try { - // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol); - - rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol)); + rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol)); rabund->set(smallRow, 0); rabund->setLabel(toString(smallDist)); - - // cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl; } catch(exception& e) { - errorOut(e, "Cluster", "clusterBins"); + m->errorOut(e, "Cluster", "clusterBins"); exit(1); } - - } - /***********************************************************************/ void Cluster::clusterNames(){ try { - // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol); if (mapWanted) { updateMap(); } list->set(smallCol, list->get(smallRow)+','+list->get(smallCol)); list->set(smallRow, ""); list->setLabel(toString(smallDist)); - - // cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl; } catch(exception& e) { - errorOut(e, "Cluster", "clusterNames"); + m->errorOut(e, "Cluster", "clusterNames"); exit(1); } - } - /***********************************************************************/ -//This function clusters based on the method of the derived class -//At the moment only average and complete linkage are covered, because -//single linkage uses a different approach. void Cluster::update(double& cutOFF){ try { - getRowColCells(); - + smallCol = dMatrix->getSmallestCell(smallRow); + nColCells = dMatrix->seqVec[smallCol].size(); + nRowCells = dMatrix->seqVec[smallRow].size(); + vector foundCol(nColCells, 0); - + //cout << dMatrix->getNNodes() << " small cell: " << smallRow << '\t' << smallCol << endl; int search; bool changed; - - // The vector has to be traversed in reverse order to preserve the index - // for faster removal in removeCell() + for (int i=nRowCells-1;i>=0;i--) { - if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) { - if (rowCells[i]->row == smallRow) { - search = rowCells[i]->column; - } else { - search = rowCells[i]->row; - } - + if (m->control_pressed) { break; } + + //if you are not the smallCell + if (dMatrix->seqVec[smallRow][i].index != smallCol) { + search = dMatrix->seqVec[smallRow][i].index; + bool merged = false; for (int j=0;jrow == smallRow) && (colCells[j]->column == smallCol))) { //if you are not hte smallest distance - if (colCells[j]->row == search || colCells[j]->column == search) { + + if (dMatrix->seqVec[smallCol][j].index != smallRow) { //if you are not the smallest distance + if (dMatrix->seqVec[smallCol][j].index == search) { foundCol[j] = 1; merged = true; - changed = updateDistance(colCells[j], rowCells[i]); - // If the cell's distance changed and it had the same distance as - // the smallest distance, invalidate the mins vector in SparseMatrix - if (changed) { - if (colCells[j]->vectorMap != NULL) { - *(colCells[j]->vectorMap) = NULL; - colCells[j]->vectorMap = NULL; - } - } + changed = updateDistance(dMatrix->seqVec[smallCol][j], dMatrix->seqVec[smallRow][i]); + dMatrix->updateCellCompliment(smallCol, j); break; - } - } + }else if (dMatrix->seqVec[smallCol][j].index < search) { //we don't have a distance for this cell + if (adjust != -1.0) { //adjust + merged = true; + PDistCell value(search, adjust); //create a distance for the missing value + int location = dMatrix->addCellSorted(smallCol, value); + changed = updateDistance(dMatrix->seqVec[smallCol][location], dMatrix->seqVec[smallRow][i]); + dMatrix->updateCellCompliment(smallCol, location); + nColCells++; + foundCol.push_back(0); //add a new found column + //adjust value + for (int k = foundCol.size()-1; k > location; k--) { foundCol[k] = foundCol[k-1]; } + foundCol[location] = 1; + } + j+=nColCells; + } + } } //if not merged it you need it for warning - if (!merged) { - mothurOut("Warning: trying to merge cell " + toString(rowCells[i]->row+1) + " " + toString(rowCells[i]->column+1) + " distance " + toString(rowCells[i]->dist) + " with value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); mothurOutEndLine(); - if (cutOFF > rowCells[i]->dist) { cutOFF = rowCells[i]->dist; mothurOut("changing cutoff to " + toString(cutOFF)); mothurOutEndLine(); } - + if ((!merged) && (method == "average" || method == "weighted")) { + if (cutOFF > dMatrix->seqVec[smallRow][i].dist) { + cutOFF = dMatrix->seqVec[smallRow][i].dist; + //cout << "changing cutoff to " << cutOFF << endl; + } + } - removeCell(rowCells[i], i , -1); - + dMatrix->rmCell(smallRow, i); } } clusterBins(); clusterNames(); - + // Special handling for singlelinkage case, not sure whether this // could be avoided for (int i=nColCells-1;i>=0;i--) { if (foundCol[i] == 0) { - if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) { - mothurOut("Warning: merging cell " + toString(colCells[i]->row+1) + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); mothurOutEndLine(); - if (cutOFF > colCells[i]->dist) { cutOFF = colCells[i]->dist; mothurOut("changing cutoff to " + toString(cutOFF)); mothurOutEndLine(); } - } - removeCell(colCells[i], -1, i); + if (adjust != -1.0) { //adjust + PDistCell value(smallCol, adjust); //create a distance for the missing value + changed = updateDistance(dMatrix->seqVec[smallCol][i], value); + dMatrix->updateCellCompliment(smallCol, i); + }else { + if (method == "average" || method == "weighted") { + if (dMatrix->seqVec[smallCol][i].index != smallRow) { //if you are not hte smallest distance + if (cutOFF > dMatrix->seqVec[smallCol][i].dist) { + cutOFF = dMatrix->seqVec[smallCol][i].dist; + } + } + } + } + dMatrix->rmCell(smallCol, i); } } + } catch(exception& e) { - errorOut(e, "Cluster", "update"); + m->errorOut(e, "Cluster", "update"); exit(1); } } /***********************************************************************/ -void Cluster::setMapWanted(bool m) { +void Cluster::setMapWanted(bool f) { try { - mapWanted = m; + mapWanted = f; - //initialize map - for (int i = 0; i < list->getNumBins(); i++) { - - //parse bin - string names = list->get(i); - while (names.find_first_of(',') != -1) { - //get name from bin - string name = names.substr(0,names.find_first_of(',')); - //save name and bin number - seq2Bin[name] = i; - names = names.substr(names.find_first_of(',')+1, names.length()); - } - - //get last name - seq2Bin[names] = i; + //initialize map + for (int k = 0; k < list->getNumBins(); k++) { + + string names = list->get(k); + + //parse bin + string individual = ""; + int binNameslength = names.size(); + for(int j=0;jerrorOut(e, "Cluster", "setMapWanted"); exit(1); } } /***********************************************************************/ void Cluster::updateMap() { -try { + try { //update location of seqs in smallRow since they move to smallCol now string names = list->get(smallRow); - while (names.find_first_of(',') != -1) { - //get name from bin - string name = names.substr(0,names.find_first_of(',')); - //save name and bin number - seq2Bin[name] = smallCol; - names = names.substr(names.find_first_of(',')+1, names.length()); - } - - //get last name - seq2Bin[names] = smallCol; + string individual = ""; + int binNameslength = names.size(); + for(int j=0;jerrorOut(e, "Cluster", "updateMap"); exit(1); } }