X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=cluster.cpp;h=440562c07bcfc19d426779e4f627b84f7b0df3a1;hb=260ae19c36cb11a53ddc5a75b5e507f8dd8b31d6;hp=b9754bee094c28f7921506371fedf764c76fefd7;hpb=510b1cfc25cd79391d6973ca20c5ec25fb1bb3b2;p=mothur.git diff --git a/cluster.cpp b/cluster.cpp index b9754be..440562c 100644 --- a/cluster.cpp +++ b/cluster.cpp @@ -14,84 +14,321 @@ /***********************************************************************/ -Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) : -rabund(rav), list(lv), dMatrix(dm) +Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string f) : +rabund(rav), list(lv), dMatrix(dm), method(f) { + try { +/* + cout << "sizeof(MatData): " << sizeof(MatData) << endl; + cout << "sizeof(PCell*): " << sizeof(PCell*) << endl; + + int nCells = dMatrix->getNNodes(); + time_t start = time(NULL); + + MatVec matvec = MatVec(nCells); + int i = 0; + for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) { + matvec[i++] = currentCell; + } + for (i= matvec.size();i>0;i--) { + dMatrix->rmCell(matvec[i-1]); + } + MatData it = dMatrix->begin(); + while (it != dMatrix->end()) { + it = dMatrix->rmCell(it); + } + cout << "Time to remove " << nCells << " cells: " << time(NULL) - start << " seconds" << endl; + exit(0); + MatData it = dMatrix->begin(); + cout << it->row << "/" << it->column << "/" << it->dist << endl; + dMatrix->rmCell(dMatrix->begin()); + cout << it->row << "/" << it->column << "/" << it->dist << endl; + exit(0); +*/ + + // Create a data structure to quickly access the PCell information + // for a certain sequence. It consists of a vector of lists, where + // a list contains pointers (iterators) to the all distances related + // to a certain sequence. The Vector is accessed via the index of a + // sequence in the distance matrix. +//ofstream outtemp; +//string temp = "temp"; +//m->openOutputFile(temp, outtemp); +//cout << lv->size() << endl; + seqVec = vector(lv->size()); + for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) { +//outtemp << currentCell->row << '\t' << currentCell->column << '\t' << currentCell->dist << endl; + seqVec[currentCell->row].push_back(currentCell); + seqVec[currentCell->column].push_back(currentCell); + } +//outtemp.close(); + mapWanted = false; //set to true by mgcluster to speed up overlap merge + + //save so you can modify as it changes in average neighbor + cutoff = c; + m = MothurOut::getInstance(); + + } + catch(exception& e) { + m->errorOut(e, "Cluster", "Cluster"); + exit(1); + } } /***********************************************************************/ -void Cluster::getRowColCells(){ +void Cluster::getRowColCells() { try { PCell* smallCell = dMatrix->getSmallestCell(); //find the smallest cell - this routine should probably not be in the SpMat class - smallRow = smallCell->row; //get its row - smallCol = smallCell->column; //get its column - smallDist = smallCell->dist; //get the smallest distance - - rowCells.clear(); - colCells.clear(); - - for(MatData currentCell=dMatrix->begin();currentCell!=dMatrix->end();currentCell++){ + smallRow = smallCell->row; // get its row + smallCol = smallCell->column; // get its column + smallDist = smallCell->dist; // get the smallest distance + //cout << "small row = " << smallRow << "small col = " << smallCol << "small dist = " << smallDist << endl; + + rowCells = seqVec[smallRow]; // all distances related to the row index + colCells = seqVec[smallCol]; // all distances related to the column index + nRowCells = rowCells.size(); + nColCells = colCells.size(); +//cout << "num rows = " << nRowCells << "num col = " << nColCells << endl; - if(&*currentCell == smallCell){ //put the smallest cell first - rowCells.insert(rowCells.begin(), currentCell); - colCells.insert(colCells.begin(), currentCell); - } - else if(currentCell->row == smallRow){ - rowCells.push_back(currentCell); + //for (int i = 0; i < nColCells; i++) { cout << colCells[i]->row << '\t' << colCells[i]->column << endl; } + //for (int i = 0; i < nRowCells; i++) { cout << rowCells[i]->row << '\t' << rowCells[i]->column << endl; } + } + catch(exception& e) { + m->errorOut(e, "Cluster", "getRowColCells"); + exit(1); + } + +} +/***********************************************************************/ +// Remove the specified cell from the seqVec and from the sparse +// matrix +void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix){ + try { + + ull drow = cell->row; + ull dcol = cell->column; + if (((vrow >=0) && (drow != smallRow)) || + ((vcol >=0) && (dcol != smallCol))) { + ull dtemp = drow; + drow = dcol; + dcol = dtemp; } - else if(currentCell->column == smallRow){ - rowCells.push_back(currentCell); + + ull crow; + ull ccol; + int nCells; + if (vrow < 0) { + nCells = seqVec[drow].size(); + for (vrow=0; vrowrow; + ccol = seqVec[drow][vrow]->column; + if (((crow == drow) && (ccol == dcol)) || + ((ccol == drow) && (crow == dcol))) { + break; + } + } } - else if(currentCell->row == smallCol){ - colCells.push_back(currentCell); + + seqVec[drow].erase(seqVec[drow].begin()+vrow); + if (vcol < 0) { + nCells = seqVec[dcol].size(); + for (vcol=0; vcolrow; + ccol = seqVec[dcol][vcol]->column; + if (((crow == drow) && (ccol == dcol)) || + ((ccol == drow) && (crow == dcol))) { + break; + } + } } - else if(currentCell->column == smallCol){ - colCells.push_back(currentCell); + + seqVec[dcol].erase(seqVec[dcol].begin()+vcol); + + if (rmMatrix) { + //cout << " removing = " << cell->row << '\t' << cell->column << '\t' << cell->dist << endl; + dMatrix->rmCell(cell); + // cout << "done" << endl; } - } - - nRowCells = rowCells.size(); - nColCells = colCells.size(); + } catch(exception& e) { - errorOut(e, "Cluster", "getRowColCells"); + m->errorOut(e, "Cluster", "removeCell"); exit(1); } } - /***********************************************************************/ void Cluster::clusterBins(){ try { - + // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol); + rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol)); rabund->set(smallRow, 0); rabund->setLabel(toString(smallDist)); + // cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl; } catch(exception& e) { - errorOut(e, "Cluster", "clusterBins"); + m->errorOut(e, "Cluster", "clusterBins"); exit(1); } + + } /***********************************************************************/ void Cluster::clusterNames(){ try { - + // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol); + if (mapWanted) { updateMap(); } + list->set(smallCol, list->get(smallRow)+','+list->get(smallCol)); list->set(smallRow, ""); list->setLabel(toString(smallDist)); + // cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl; } catch(exception& e) { - errorOut(e, "Cluster", "clusterNames"); + m->errorOut(e, "Cluster", "clusterNames"); exit(1); } + } /***********************************************************************/ +//This function clusters based on the method of the derived class +//At the moment only average and complete linkage are covered, because +//single linkage uses a different approach. +void Cluster::update(double& cutOFF){ + try { + getRowColCells(); + + vector foundCol(nColCells, 0); + + int search; + bool changed; + + // The vector has to be traversed in reverse order to preserve the index + // for faster removal in removeCell() + for (int i=nRowCells-1;i>=0;i--) { + if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) { + if (rowCells[i]->row == smallRow) { + search = rowCells[i]->column; + } else { + search = rowCells[i]->row; + } + + bool merged = false; + for (int j=0;jrow == smallRow) && (colCells[j]->column == smallCol))) { //if you are not hte smallest distance + if (colCells[j]->row == search || colCells[j]->column == search) { + foundCol[j] = 1; + merged = true; + changed = updateDistance(colCells[j], rowCells[i]); + // If the cell's distance changed and it had the same distance as + // the smallest distance, invalidate the mins vector in SparseMatrix + if (changed) { + if (colCells[j]->vectorMap != NULL) { + *(colCells[j]->vectorMap) = NULL; + colCells[j]->vectorMap = NULL; + } + } + break; + } + } + } + //if not merged it you need it for warning + if ((!merged) && (method == "average" || method == "weighted")) { + //m->mothurOut("Warning: trying to merge cell " + toString(rowCells[i]->row+1) + " " + toString(rowCells[i]->column+1) + " distance " + toString(rowCells[i]->dist) + " with value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); m->mothurOutEndLine(); + if (cutOFF > rowCells[i]->dist) { + cutOFF = rowCells[i]->dist; + //m->mothurOut("changing cutoff to " + toString(cutOFF)); m->mothurOutEndLine(); + } + + } + removeCell(rowCells[i], i , -1); + + } + } + clusterBins(); + clusterNames(); + + // Special handling for singlelinkage case, not sure whether this + // could be avoided + for (int i=nColCells-1;i>=0;i--) { + if (foundCol[i] == 0) { + if (method == "average" || method == "weighted") { + if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) { + //m->mothurOut("Warning: merging cell " + toString(colCells[i]->row+1) + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); m->mothurOutEndLine(); + if (cutOFF > colCells[i]->dist) { + cutOFF = colCells[i]->dist; + //m->mothurOut("changing cutoff to " + toString(cutOFF)); m->mothurOutEndLine(); + } + } + } + removeCell(colCells[i], -1, i); + } + } + } + catch(exception& e) { + m->errorOut(e, "Cluster", "update"); + exit(1); + } +} +/***********************************************************************/ +void Cluster::setMapWanted(bool f) { + try { + mapWanted = f; + + //initialize map + for (int i = 0; i < list->getNumBins(); i++) { + + //parse bin + string names = list->get(i); + while (names.find_first_of(',') != -1) { + //get name from bin + string name = names.substr(0,names.find_first_of(',')); + //save name and bin number + seq2Bin[name] = i; + names = names.substr(names.find_first_of(',')+1, names.length()); + } + + //get last name + seq2Bin[names] = i; + } + + } + catch(exception& e) { + m->errorOut(e, "Cluster", "setMapWanted"); + exit(1); + } +} +/***********************************************************************/ +void Cluster::updateMap() { +try { + //update location of seqs in smallRow since they move to smallCol now + string names = list->get(smallRow); + while (names.find_first_of(',') != -1) { + //get name from bin + string name = names.substr(0,names.find_first_of(',')); + //save name and bin number + seq2Bin[name] = smallCol; + names = names.substr(names.find_first_of(',')+1, names.length()); + } + + //get last name + seq2Bin[names] = smallCol; + + } + catch(exception& e) { + m->errorOut(e, "Cluster", "updateMap"); + exit(1); + } +} +/***********************************************************************/ + +