From 1b9d0a66e4737f31d16824fe93944880b9edc530 Mon Sep 17 00:00:00 2001 From: westcott Date: Thu, 30 Jul 2009 12:03:32 +0000 Subject: [PATCH] Thallinger changes to cluster command. --- averagelinkage.cpp | 80 +++++++------------ cluster.cpp | 189 +++++++++++++++++++++++++++++++++++++------- cluster.hpp | 35 +++++--- clustercommand.cpp | 97 ++++++++++++++++------- clustercommand.h | 7 +- completelinkage.cpp | 64 ++++----------- decalc.cpp | 88 +++++++++++++++++---- decalc.h | 2 + pintail.cpp | 12 ++- readdistcommand.cpp | 41 +++++++++- singlelinkage.cpp | 115 ++++++++++++++++++--------- sparsematrix.cpp | 22 ++---- sparsematrix.hpp | 10 ++- 13 files changed, 515 insertions(+), 247 deletions(-) diff --git a/averagelinkage.cpp b/averagelinkage.cpp index 26283ff..7a4cb88 100644 --- a/averagelinkage.cpp +++ b/averagelinkage.cpp @@ -12,68 +12,44 @@ /***********************************************************************/ AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) : -Cluster(rav, lv, dm) -{} + Cluster(rav, lv, dm) +{ + saveRow = -1; + saveCol = -1; +} + + +/***********************************************************************/ +//This function returns the tag of the method. +string AverageLinkage::getTag() { + return("an"); +} + /***********************************************************************/ -//THis function clusters based on the average method -void AverageLinkage::update(){ - try{ - getRowColCells(); - - vector found(nColCells, 0); - - int rowBin = rabund->get(smallRow); - int colBin = rabund->get(smallCol); - int totalBin = rowBin + colBin; - - for(int i=1;irow == smallRow){ - search = rowCells[i]->column; - } - else{ - search = rowCells[i]->row; - } - - for(int j=1;jrow == search || colCells[j]->column == search){ - colCells[j]->dist = (colBin * colCells[j]->dist + rowBin * rowCells[i]->dist) / totalBin; - - found[j] = 1; - - if(colCells[j]->vectorMap != NULL){ - *(colCells[j]->vectorMap) = NULL; - colCells[j]->vectorMap = NULL; - } - - break; - } - - } - dMatrix->rmCell(rowCells[i]); - } - - clusterBins(); - clusterNames(); - - for(int i=0;irmCell(colCells[i]); - } +//This function updates the distance based on the average linkage method. +bool AverageLinkage::updateDistance(MatData& colCell, MatData& rowCell) { + try { + if ((saveRow != smallRow) || (saveCol != smallCol)) { + rowBin = rabund->get(smallRow); + colBin = rabund->get(smallCol); + totalBin = rowBin + colBin; + saveRow = smallRow; + saveCol = smallCol; } + + colCell->dist = (colBin * colCell->dist + rowBin * rowCell->dist) / totalBin; + return(true); } catch(exception& e) { - errorOut(e, "AverageLinkage", "update"); + errorOut(e, "AverageLinkage", "updateDistance"); exit(1); } } /***********************************************************************/ -#endif +/***********************************************************************/ +#endif diff --git a/cluster.cpp b/cluster.cpp index b9754be..6fd4631 100644 --- a/cluster.cpp +++ b/cluster.cpp @@ -17,41 +17,58 @@ Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) : rabund(rav), list(lv), dMatrix(dm) { +/* + cout << "sizeof(MatData): " << sizeof(MatData) << endl; + cout << "sizeof(PCell*): " << sizeof(PCell*) << endl; + + int nCells = dMatrix->getNNodes(); + time_t start = time(NULL); + + MatVec matvec = MatVec(nCells); + int i = 0; + for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) { + matvec[i++] = currentCell; + } + for (i= matvec.size();i>0;i--) { + dMatrix->rmCell(matvec[i-1]); + } + MatData it = dMatrix->begin(); + while (it != dMatrix->end()) { + it = dMatrix->rmCell(it); + } + cout << "Time to remove " << nCells << " cells: " << time(NULL) - start << " seconds" << endl; + exit(0); + MatData it = dMatrix->begin(); + cout << it->row << "/" << it->column << "/" << it->dist << endl; + dMatrix->rmCell(dMatrix->begin()); + cout << it->row << "/" << it->column << "/" << it->dist << endl; + exit(0); +*/ + + // Create a data structure to quickly access the PCell information + // for a certain sequence. It consists of a vector of lists, where + // a list contains pointers (iterators) to the all distances related + // to a certain sequence. The Vector is accessed via the index of a + // sequence in the distance matrix. + seqVec = vector(lv->size()); + for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) { + seqVec[currentCell->row].push_back(currentCell); + seqVec[currentCell->column].push_back(currentCell); + } } /***********************************************************************/ -void Cluster::getRowColCells(){ +void Cluster::getRowColCells() { try { PCell* smallCell = dMatrix->getSmallestCell(); //find the smallest cell - this routine should probably not be in the SpMat class - smallRow = smallCell->row; //get its row - smallCol = smallCell->column; //get its column - smallDist = smallCell->dist; //get the smallest distance - - rowCells.clear(); - colCells.clear(); - - for(MatData currentCell=dMatrix->begin();currentCell!=dMatrix->end();currentCell++){ - - if(&*currentCell == smallCell){ //put the smallest cell first - rowCells.insert(rowCells.begin(), currentCell); - colCells.insert(colCells.begin(), currentCell); - } - else if(currentCell->row == smallRow){ - rowCells.push_back(currentCell); - } - else if(currentCell->column == smallRow){ - rowCells.push_back(currentCell); - } - else if(currentCell->row == smallCol){ - colCells.push_back(currentCell); - } - else if(currentCell->column == smallCol){ - colCells.push_back(currentCell); - } - } + smallRow = smallCell->row; // get its row + smallCol = smallCell->column; // get its column + smallDist = smallCell->dist; // get the smallest distance + rowCells = seqVec[smallRow]; // all distances related to the row index + colCells = seqVec[smallCol]; // all distances related to the column index nRowCells = rowCells.size(); nColCells = colCells.size(); } @@ -59,39 +76,153 @@ void Cluster::getRowColCells(){ errorOut(e, "Cluster", "getRowColCells"); exit(1); } + } +// Remove the specified cell from the seqVec and from the sparse +// matrix +void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix) +{ + ull drow = cell->row; + ull dcol = cell->column; + if (((vrow >=0) && (drow != smallRow)) || + ((vcol >=0) && (dcol != smallCol))) { + ull dtemp = drow; + drow = dcol; + dcol = dtemp; + } + + ull crow; + ull ccol; + int nCells; + if (vrow < 0) { + nCells = seqVec[drow].size(); + for (vrow=0; vrowrow; + ccol = seqVec[drow][vrow]->column; + if (((crow == drow) && (ccol == dcol)) || + ((ccol == drow) && (crow == dcol))) { + break; + } + } + } + seqVec[drow].erase(seqVec[drow].begin()+vrow); + if (vcol < 0) { + nCells = seqVec[dcol].size(); + for (vcol=0; vcolrow; + ccol = seqVec[dcol][vcol]->column; + if (((crow == drow) && (ccol == dcol)) || + ((ccol == drow) && (crow == dcol))) { + break; + } + } + } + seqVec[dcol].erase(seqVec[dcol].begin()+vcol); + if (rmMatrix) { + dMatrix->rmCell(cell); + } +} + + /***********************************************************************/ void Cluster::clusterBins(){ try { - + // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol); + rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol)); rabund->set(smallRow, 0); rabund->setLabel(toString(smallDist)); + // cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl; } catch(exception& e) { errorOut(e, "Cluster", "clusterBins"); exit(1); } + + } /***********************************************************************/ void Cluster::clusterNames(){ try { - + // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol); + list->set(smallCol, list->get(smallRow)+','+list->get(smallCol)); list->set(smallRow, ""); list->setLabel(toString(smallDist)); + // cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl; } catch(exception& e) { errorOut(e, "Cluster", "clusterNames"); exit(1); } + } +/***********************************************************************/ +//This function clusters based on the method of the derived class +//At the moment only average and complete linkage are covered, because +//single linkage uses a different approach. +void Cluster::update(){ + try { + getRowColCells(); + + vector found(nColCells, 0); + int search; + bool changed; + + // The vector has to be traversed in reverse order to preserve the index + // for faster removal in removeCell() + for (int i=nRowCells-1;i>=0;i--) { + if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) { + if (rowCells[i]->row == smallRow) { + search = rowCells[i]->column; + } else { + search = rowCells[i]->row; + } + + for (int j=0;jrow == smallRow) && (colCells[j]->column == smallCol))) { + if (colCells[j]->row == search || colCells[j]->column == search) { + found[j] = 1; + changed = updateDistance(colCells[j], rowCells[i]); + // If the cell's distance changed and it had the same distance as + // the smallest distance, invalidate the mins vector in SparseMatrix + if (changed) { + if (colCells[j]->vectorMap != NULL) { + *(colCells[j]->vectorMap) = NULL; + colCells[j]->vectorMap = NULL; + } + } + break; + } + } + } + removeCell(rowCells[i], i , -1); + } + } + clusterBins(); + clusterNames(); + + // Special handling for singlelinkage case, not sure whether this + // could be avoided + for (int i=nColCells-1;i>=0;i--) { + if (found[i] == 0) { + removeCell(colCells[i], -1, i); + } + } + } + catch(exception& e) { + errorOut(e, "Cluster", "update"); + exit(1); + } +} + + /***********************************************************************/ diff --git a/cluster.hpp b/cluster.hpp index de86f85..99d8b7d 100644 --- a/cluster.hpp +++ b/cluster.hpp @@ -7,18 +7,22 @@ class RAbundVector; class ListVector; -class SparseMatrix; -typedef list::iterator MatData; +typedef vector MatVec; class Cluster { public: Cluster(RAbundVector*, ListVector*, SparseMatrix*); - virtual void update() = 0; - + virtual void update(); + virtual string getTag() = 0; + protected: void getRowColCells(); + void removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix=true); + + virtual bool updateDistance(MatData& colCell, MatData& rowCell) = 0; + virtual void clusterBins(); virtual void clusterNames(); @@ -29,8 +33,10 @@ protected: int smallRow; int smallCol; float smallDist; - vector rowCells; - vector colCells; + + vector seqVec; // contains vectors of cells related to a certain sequence + MatVec rowCells; + MatVec colCells; ull nRowCells; ull nColCells; }; @@ -40,7 +46,8 @@ protected: class CompleteLinkage : public Cluster { public: CompleteLinkage(RAbundVector*, ListVector*, SparseMatrix*); - void update(); + bool updateDistance(MatData& colCell, MatData& rowCell); + string getTag(); private: @@ -51,7 +58,9 @@ private: class SingleLinkage : public Cluster { public: SingleLinkage(RAbundVector*, ListVector*, SparseMatrix*); - void update(); + void update(); + bool updateDistance(MatData& colCell, MatData& rowCell); + string getTag(); private: @@ -62,10 +71,16 @@ private: class AverageLinkage : public Cluster { public: AverageLinkage(RAbundVector*, ListVector*, SparseMatrix*); - void update(); + bool updateDistance(MatData& colCell, MatData& rowCell); + string getTag(); private: - + int saveRow; + int saveCol; + int rowBin; + int colBin; + int totalBin; + }; /***********************************************************************/ diff --git a/clustercommand.cpp b/clustercommand.cpp index 593f262..d3b9bbf 100644 --- a/clustercommand.cpp +++ b/clustercommand.cpp @@ -21,7 +21,7 @@ ClusterCommand::ClusterCommand(string option){ else { //valid paramters for this command - string Array[] = {"cutoff","precision","method"}; + string Array[] = {"cutoff","precision","method","showabund","timing"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -31,33 +31,43 @@ ClusterCommand::ClusterCommand(string option){ //check to make sure all parameters are valid for command for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { - if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { + abort = true; + } } //error checking to make sure they read a distance file if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) { - mothurOut("Before you use the cluster command, you first need to read in a distance matrix."); mothurOutEndLine(); abort = true; + mothurOut("Before you use the cluster command, you first need to read in a distance matrix."); mothurOutEndLine(); + abort = true; } //check for optional parameter and set defaults // ...at some point should added some additional type checking... //get user cutoff and precision or use defaults string temp; - temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; } + temp = validParameter.validFile(parameters, "precision", false); + if (temp == "not found") { temp = "100"; } //saves precision legnth for formatting below length = temp.length(); convert(temp, precision); - temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; } + temp = validParameter.validFile(parameters, "cutoff", false); + if (temp == "not found") { temp = "10"; } convert(temp, cutoff); cutoff += (5 / (precision * 10.0)); - method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; } - + method = validParameter.validFile(parameters, "method", false); + if (method == "not found") { method = "furthest"; } if ((method == "furthest") || (method == "nearest") || (method == "average")) { } else { mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; } + showabund = validParameter.validFile(parameters, "showabund", false); + if (showabund == "not found") { showabund = "T"; } + + timing = validParameter.validFile(parameters, "timing", false); + if (timing == "not found") { timing = "F"; } if (abort == false) { @@ -70,25 +80,21 @@ ClusterCommand::ClusterCommand(string option){ } //create cluster - if(method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix); tag = "fn"; } - else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix); tag = "nn"; } - else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix); tag = "an"; } - else { mothurOut("error - not recognized method"); mothurOutEndLine(); abort = true; } - + if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix); } + else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix); } + else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix); } + tag = cluster->getTag(); + fileroot = getRootName(globaldata->inputFileName); openOutputFile(fileroot+ tag + ".sabund", sabundFile); openOutputFile(fileroot+ tag + ".rabund", rabundFile); openOutputFile(fileroot+ tag + ".list", listFile); - - } - } - } catch(exception& e) { - errorOut(e, "ClusterCommand", "ClusterCommand"); + errorOut(e, "ClusterCommand", "ClusterCommand"); exit(1); } } @@ -97,11 +103,11 @@ ClusterCommand::ClusterCommand(string option){ void ClusterCommand::help(){ try { - mothurOut("The cluster command can only be executed after a successful read.dist command.\n"); - mothurOut("The cluster command parameter options are method, cuttoff and precision. No parameters are required.\n"); - mothurOut("The cluster command should be in the following format: \n"); - mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n"); - mothurOut("The acceptable cluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n"); + mothurOut("The cluster command can only be executed after a successful read.dist command.\n"); + mothurOut("The cluster command parameter options are method, cuttoff, precision, showabund and timing. No parameters are required.\n"); + mothurOut("The cluster command should be in the following format: \n"); + mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n"); + mothurOut("The acceptable cluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n"); } catch(exception& e) { errorOut(e, "ClusterCommand", "help"); @@ -125,16 +131,28 @@ int ClusterCommand::execute(){ if (abort == true) { return 0; } + time_t estart = time(NULL); + int ndist = matrix->getNNodes(); float previousDist = 0.00000; float rndPreviousDist = 0.00000; oldRAbund = *rabund; oldList = *list; + + print_start = true; + start = time(NULL); + loops = 0; - float x; - x=0.1; - toString(x, 2); - - while(matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){ + while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){ + if (print_start && isTrue(timing)) { + mothurOut("Clustering (" + tag + ") dist " + toString(matrix->getSmallDist()) + "/" + + toString(roundDist(matrix->getSmallDist(), precision)) + + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")"); + cout.flush(); + print_start = false; + } + + loops++; + cluster->update(); float dist = matrix->getSmallDist(); float rndDist = roundDist(dist, precision); @@ -151,6 +169,13 @@ int ClusterCommand::execute(){ oldRAbund = *rabund; oldList = *list; } + + if (print_start && isTrue(timing)) { + mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist) + + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")"); + cout.flush(); + print_start = false; + } if(previousDist <= 0.0000){ printData("unique"); @@ -174,7 +199,9 @@ int ClusterCommand::execute(){ sabundFile.close(); rabundFile.close(); listFile.close(); - + if (isTrue(timing)) { + mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster " + toString(ndist) + " distances"); mothurOutEndLine(); + } return 0; } catch(exception& e) { @@ -187,8 +214,18 @@ int ClusterCommand::execute(){ void ClusterCommand::printData(string label){ try { + if (isTrue(timing)) { + mothurOut("\tTime: " + toString(time(NULL) - start) + "\tsecs for " + toString(oldRAbund.getNumBins()) + + "\tclusters. Updates: " + toString(loops)); mothurOutEndLine(); + } + print_start = true; + loops = 0; + start = time(NULL); + oldRAbund.setLabel(label); - oldRAbund.getSAbundVector().print(cout); + if (isTrue(showabund)) { + oldRAbund.getSAbundVector().print(cout); + } oldRAbund.print(rabundFile); oldRAbund.getSAbundVector().print(sabundFile); @@ -199,5 +236,7 @@ void ClusterCommand::printData(string label){ errorOut(e, "ClusterCommand", "printData"); exit(1); } + + } //********************************************************************************************************************** diff --git a/clustercommand.h b/clustercommand.h index 44ce6f9..ad9fe85 100644 --- a/clustercommand.h +++ b/clustercommand.h @@ -26,8 +26,6 @@ The cluster() command outputs three files *.list, *.rabund, and *.sabund. */ -class GlobalData; - class ClusterCommand : public Command { public: @@ -49,8 +47,13 @@ private: string method, fileroot, tag; double cutoff; + string showabund, timing; int precision, length; ofstream sabundFile, rabundFile, listFile; + + bool print_start; + time_t start; + unsigned long loops; void printData(string label); }; diff --git a/completelinkage.cpp b/completelinkage.cpp index f69753b..09ba963 100644 --- a/completelinkage.cpp +++ b/completelinkage.cpp @@ -1,64 +1,32 @@ #include "cluster.hpp" -#include "mothur.h" /***********************************************************************/ CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) : -Cluster(rav, lv, dm) + Cluster(rav, lv, dm) {} /***********************************************************************/ -//This function clusters based on the furthest neighbor method. -void CompleteLinkage::update(){ +//This function returns the tag of the method. +string CompleteLinkage::getTag() { + return("fn"); +} + + +/***********************************************************************/ +//This function updates the distance based on the furthest neighbor method. +bool CompleteLinkage::updateDistance(MatData& colCell, MatData& rowCell) { try { - getRowColCells(); - - vector found(nColCells, 0); - - for(int i=1;irow == smallRow){ - search = rowCells[i]->column; - } - else{ - search = rowCells[i]->row; - } - - for(int j=1;jrow == search || colCells[j]->column == search){ - - if(colCells[j]->dist < rowCells[i]->dist){ - colCells[j]->dist = rowCells[i]->dist; - - if(colCells[j]->vectorMap != NULL){ - *(colCells[j]->vectorMap) = NULL; - colCells[j]->vectorMap = NULL; - } - - } - - found[j] = 1; - break; - } - } - dMatrix->rmCell(rowCells[i]); - + bool changed = false; + if (colCell->dist < rowCell->dist) { + colCell->dist = rowCell->dist; + changed = true; } - clusterBins(); - clusterNames(); - - for(int i=0;irmCell(colCells[i]); - } - } + return(changed); } catch(exception& e) { - errorOut(e, "CompleteLinkage", "update"); + errorOut(e, "CompleteLinkage", "updateDistance"); exit(1); } } diff --git a/decalc.cpp b/decalc.cpp index 1a73494..203482f 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -59,11 +59,13 @@ void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map& t int front = 0; for (int i = 0; i < q.length(); i++) { +//cout << "query = " << q[i] << " subject = " << s[i] << endl; if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; } } - +//cout << endl << endl; int back = 0; for (int i = q.length(); i >= 0; i--) { +//cout << "query = " << q[i] << " subject = " << s[i] << endl; if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; } } @@ -97,10 +99,10 @@ vector DeCalculator::findWindows(Sequence* query, int front, int back, int //count bases int numBases = 0; for (int l = 0; l < seq.length(); l++) { if (isalpha(seq[l])) { numBases++; } } - +//cout << "num Bases = " << numBases << endl; //save start of seq win.push_back(front); - +//cout << front << '\t'; //move ahead increment bases at a time until all bases are in a window int countBases = 0; int totalBases = 0; //used to eliminate window of blanks at end of sequence @@ -109,18 +111,27 @@ vector DeCalculator::findWindows(Sequence* query, int front, int back, int for (int m = front; m < (back - size) ; m++) { //count number of bases you see - if (isalpha(seq[m])) { countBases++; totalBases++; } + if (isalpha(seq[m])) { countBases++; } //if you have seen enough bases to make a new window if (countBases >= increment) { + //total bases is the number of bases in a window already. + totalBases += countBases; +//cout << "total bases = " << totalBases << endl; win.push_back(m); //save spot in alignment +//cout << m << '\t'; countBases = 0; //reset bases you've seen in this window } //no need to continue if all your bases are in a window if (totalBases == numBases) { break; } } - + + + //get last window if needed + if (totalBases < numBases) { win.push_back(back-size); cout << back-size << endl;} +//cout << endl << endl; + return win; } @@ -239,6 +250,12 @@ vector DeCalculator::calcFreq(vector seqs, string filename) { openOutputFile(freqfile, outFreq); + string length = toString(seqs.size()); //if there are 5000 seqs in the template then set precision to 3 + int precision = length.length() - 1; + + //format output + outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint); + //at each position in the sequence for (int i = 0; i < seqs[0]->getAligned().length(); i++) { @@ -261,11 +278,8 @@ vector DeCalculator::calcFreq(vector seqs, string filename) { int highest = 0; for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } } - float highFreq; - //subtract gaps to "ignore them" - if ( (seqs.size() - gaps) == 0 ) { highFreq = 1.0; } - else { highFreq = highest / (float) (seqs.size() - gaps); } - + float highFreq = highest / (float) (seqs.size()); + float Pi; Pi = (highFreq - 0.25) / 0.75; @@ -273,10 +287,9 @@ vector DeCalculator::calcFreq(vector seqs, string filename) { if (Pi < 0) { Pi = 0.0; } //saves this for later - outFreq << i+1 << '\t' << highFreq << endl; - + outFreq << setprecision(precision) << i << '\t' << highFreq << endl; + if (h.count(i) > 0) { - cout << i+1 << '\t' << highFreq << endl; prob.push_back(Pi); } } @@ -387,6 +400,52 @@ vector< vector > DeCalculator::getQuantiles(vector seqs, vecto exit(1); } } +//*************************************************************************************************************** +void DeCalculator::removeObviousOutliers(vector< vector >& quantiles) { + try { + + + for (int i = 0; i < quantiles.size(); i++) { + + //find mean of this quantile score + float average = findAverage(quantiles[i]); + + vector newQuanI; + //look at each value in quantiles to see if it is an outlier + for (int j = 0; j < quantiles[i].size(); j++) { + + float highCutoff, lowCutOff; + + + + } + + } + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "removeObviousOutliers"); + exit(1); + } +} + +//*************************************************************************************************************** +float DeCalculator::findAverage(vector myVector) { + try{ + + float total = 0.0; + for (int i = 0; i < myVector.size(); i++) { total += myVector[i]; } + + float average = total / (float) myVector.size(); + + return average; + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "findAverage"); + exit(1); + } +} //*************************************************************************************************************** float DeCalculator::getCoef(vector obs, vector qav) { @@ -402,7 +461,8 @@ float DeCalculator::getCoef(vector obs, vector qav) { for (int j = 0; j < obs.size(); j++) { obsAverage += obs[j]; } obsAverage = obsAverage / (float) obs.size(); //cout << "sum ai / m = " << probAverage << endl; -//cout << "sum oi / m = " << obsAverage << endl; +//cout << "sum oi / m = " << obsAverage << endl; + float coef = obsAverage / probAverage; return coef; diff --git a/decalc.h b/decalc.h index 9309c37..1579d86 100644 --- a/decalc.h +++ b/decalc.h @@ -31,6 +31,7 @@ class DeCalculator { void setMask(string m); void runMask(Sequence*); void trimSeqs(Sequence*, Sequence*, map&); + void removeObviousOutliers(vector< vector >&); vector calcFreq(vector, string); vector findWindows(Sequence*, int, int, int&, int); vector calcObserved(Sequence*, Sequence*, vector, int); @@ -42,6 +43,7 @@ class DeCalculator { vector< vector > getQuantiles(vector, vector, int, vector, int, int, int); private: + float findAverage(vector myVector); string seqMask; set h; diff --git a/pintail.cpp b/pintail.cpp index b7e5b5f..bca7df9 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -34,6 +34,11 @@ void Pintail::print(ostream& out) { for (int i = 0; i < querySeqs.size(); i++) { int index = ceil(deviation[i]); +float quan = 2.64 * log10(deviation[i]) + 1.46; +cout << "dist = " << index << endl; +cout << "de = " << DE[i] << endl; +cout << "mallard quantile = " << quan << endl; +cout << "my quantile = " << quantiles[index][4] << endl; //is your DE value higher than the 95% string chimera; @@ -158,7 +163,7 @@ void Pintail::getChimeras() { }else { probabilityProfile = readFreq(); } //make P into Q - for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; cout << i << '\t' << probabilityProfile[i] << endl; } + for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl; mothurOut("Done."); mothurOutEndLine(); //mask querys @@ -270,6 +275,9 @@ cout << querySeqs[i]->getName() << '\t' << "dist value = " << dist << endl; quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); }else { createProcessesQuan(); } + + decalc->removeObviousOutliers(quantiles); + ofstream out4; string o = getRootName(templateFile) + "quan"; @@ -344,7 +352,7 @@ vector Pintail::readFreq() { in >> pos >> num; - if (h.count(pos-1) > 0) { + if (h.count(pos) > 0) { float Pi; Pi = (num - 0.25) / 0.75; diff --git a/readdistcommand.cpp b/readdistcommand.cpp index 088f711..579f848 100644 --- a/readdistcommand.cpp +++ b/readdistcommand.cpp @@ -70,7 +70,7 @@ ReadDistCommand::ReadDistCommand(string option){ else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a read.dist command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; } if (columnfile != "") { - if (namefile == "") { mothurOut("You need to provide a namefile if you are going to use the column format."); mothurOutEndLine(); abort = true; } + if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } } //check for optional parameter and set defaults @@ -129,7 +129,7 @@ void ReadDistCommand::help(){ mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n"); mothurOut("The second way to use the read.dist command is to read a phylip or column and a group, so you can use the libshuff command.\n"); mothurOut("For this use the read.dist command should be in the following format: \n"); - mothurOut("read.dist(phylip=yourPhylipfile, group=yourGroupFile). The cutoff and precision parameters are not valid with this use. \n"); + mothurOut("read.dist(phylip=yourPhylipfile, group=yourGroupFile). The cutoff and precision parameters are not valid with this use. \n"); mothurOut("Note: No spaces between parameter labels (i.e. phylip), '=' and parameters (i.e.yourPhylipfile).\n\n"); } catch(exception& e) { @@ -141,6 +141,9 @@ void ReadDistCommand::help(){ //********************************************************************************************************************** ReadDistCommand::~ReadDistCommand(){ + if (abort == false) { + if (format != "matrix") { delete read; delete nameMap; } + } } //********************************************************************************************************************** @@ -148,6 +151,9 @@ int ReadDistCommand::execute(){ try { if (abort == true) { return 0; } + + time_t start = time(NULL); + size_t numDists = 0; if (format == "matrix") { ifstream in; @@ -157,7 +163,8 @@ int ReadDistCommand::execute(){ //memory leak prevention if (globaldata->gMatrix != NULL) { delete globaldata->gMatrix; } globaldata->gMatrix = matrix; //save matrix for coverage commands - }else { + numDists = matrix->getSizes()[1]; + } else { read->read(nameMap); //to prevent memory leak @@ -166,10 +173,36 @@ int ReadDistCommand::execute(){ if (globaldata->gSparseMatrix != NULL) { delete globaldata->gSparseMatrix; } globaldata->gSparseMatrix = read->getMatrix(); - delete read; delete nameMap; + numDists = globaldata->gSparseMatrix->getNNodes(); + + int lines = cutoff / (1.0/precision); + vector dist_cutoff(lines+1,0); + for (int i = 0; i <= lines;i++) { + dist_cutoff[i] = (i + 0.5) / precision; + } + vector dist_count(lines+1,0); + list::iterator currentCell; + SparseMatrix* smatrix = globaldata->gSparseMatrix; + for (currentCell = smatrix->begin(); currentCell != smatrix->end(); currentCell++) { + for (int i = 0; i <= lines;i++) { + if (currentCell->dist < dist_cutoff[i]) { + dist_count[i]++; + break; + } + } + } + string dist_string = "Dist:"; + string count_string = "Count: "; + for (int i = 0; i <= lines;i++) { + dist_string = dist_string.append("\t").append(toString(dist_cutoff[i])); + count_string = count_string.append("\t").append(toString(dist_count[i])); + } + mothurOut(dist_string); mothurOutEndLine(); mothurOut(count_string); mothurOutEndLine(); } + mothurOut("It took " + toString(time(NULL) - start) + " secs to read " + toString(numDists) + " distances (cutoff: " + toString(cutoff) + ")"); mothurOutEndLine(); return 0; + } catch(exception& e) { errorOut(e, "ReadDistCommand", "execute"); diff --git a/singlelinkage.cpp b/singlelinkage.cpp index 5332e90..c27b14e 100644 --- a/singlelinkage.cpp +++ b/singlelinkage.cpp @@ -9,54 +9,75 @@ SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm Cluster(rav, lv, dm) {} + /***********************************************************************/ -//This function clusters based on nearest method. -void SingleLinkage::update(){ +//This function returns the tag of the method. +string SingleLinkage::getTag() { + return("nn"); +} + +/***********************************************************************/ +//This function clusters based on the single linkage method. +void SingleLinkage::update(){ try { - getRowColCells(); + getRowColCells(); - for(int i=1;irow == smallRow){ - search = rowCells[i]->column; - } - else{ - search = rowCells[i]->row; - } + vector deleted(nRowCells, false); + int rowInd; + int search; + bool changed; + + // The vector has to be traversed in reverse order to preserve the index + // for faster removal in removeCell() + for (int i=nRowCells-1;i>=0;i--) { + if ((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol)) { + rowInd = i; // The index of the smallest distance cell in rowCells + } else { + if (rowCells[i]->row == smallRow) { + search = rowCells[i]->column; + } else { + search = rowCells[i]->row; + } - for(int j=1;jrow == search || colCells[j]->column == search){ - - if(colCells[j]->dist > rowCells[i]->dist){ - colCells[j]->dist = rowCells[i]->dist; - - if(colCells[j]->vectorMap != NULL){ - *(colCells[j]->vectorMap) = NULL; - colCells[j]->vectorMap = NULL; + for (int j=0;jrow == smallRow) && (colCells[j]->column == smallCol))) { + if (colCells[j]->row == search || colCells[j]->column == search) { + changed = updateDistance(colCells[j], rowCells[i]); + // If the cell's distance changed and it had the same distance as + // the smallest distance, invalidate the mins vector in SparseMatrix + if (changed) { + if (colCells[j]->vectorMap != NULL) { + *(colCells[j]->vectorMap) = NULL; + colCells[j]->vectorMap = NULL; + } + } + removeCell(rowCells[i], i , -1); + deleted[i] = true; + break; } - } - dMatrix->rmCell(rowCells[i]); - break; } - } - - if(search < smallCol){ - rowCells[i]->row = smallCol; - rowCells[i]->column = search; - } - else{ - rowCells[i]->row = search; - rowCells[i]->column = smallCol; - } - - } + if (!deleted[i]) { + // Assign the cell to the new cluster + // remove the old cell from seqVec and add the cell + // with the new row and column assignment again + removeCell(rowCells[i], i , -1, false); + if (search < smallCol){ + rowCells[i]->row = smallCol; + rowCells[i]->column = search; + } else { + rowCells[i]->row = search; + rowCells[i]->column = smallCol; + } + seqVec[rowCells[i]->row].push_back(rowCells[i]); + seqVec[rowCells[i]->column].push_back(rowCells[i]); + } + } + } clusterBins(); clusterNames(); - dMatrix->rmCell(rowCells[0]); + // remove also the cell with the smallest distance + removeCell(rowCells[rowInd], -1 , -1); } catch(exception& e) { errorOut(e, "SingleLinkage", "update"); @@ -64,4 +85,20 @@ void SingleLinkage::update(){ } } + +/***********************************************************************/ +//This function updates the distance based on the nearest neighbor method. +bool SingleLinkage::updateDistance(MatData& colCell, MatData& rowCell) { + try { + bool changed = false; + if (colCell->dist > rowCell->dist) { + colCell->dist = rowCell->dist; + } + return(changed); + } + catch(exception& e) { + errorOut(e, "SingleLinkage", "updateDistance"); + exit(1); + } +} /***********************************************************************/ diff --git a/sparsematrix.cpp b/sparsematrix.cpp index 7f1c3c8..8a873f7 100644 --- a/sparsematrix.cpp +++ b/sparsematrix.cpp @@ -2,7 +2,6 @@ #include "sparsematrix.hpp" #include "listvector.hpp" -typedef list::iterator MatData; /***********************************************************************/ @@ -22,15 +21,15 @@ float SparseMatrix::getSmallDist(){ /***********************************************************************/ -void SparseMatrix::rmCell(MatData data){ +MatData SparseMatrix::rmCell(MatData data){ try { if(data->vectorMap != NULL ){ *(data->vectorMap) = NULL; data->vectorMap = NULL; } - matrix.erase(data); + data = matrix.erase(data); numNodes--; - + return(data); // seems like i should be updating smallDist here, but the only time we remove cells is when // clustering and the clustering algorithm updates smallDist } @@ -89,13 +88,11 @@ MatData SparseMatrix::end(){ void SparseMatrix::print(){ try { int index = 0; - - mothurOutEndLine(); - mothurOut("Index\tRow\tColumn\tDistance"); - mothurOutEndLine(); + + cout << endl << "Index\tRow\tColumn\tDistance" << endl; for(MatData currentCell=matrix.begin();currentCell!=matrix.end();currentCell++){ - mothurOut(toString(index) + "\t" + toString(currentCell->row) + "\t" + toString(currentCell->column) + "\t" + toString(currentCell->dist)); mothurOutEndLine(); + cout << index << '\t' << currentCell->row << '\t' << currentCell->column << '\t' << currentCell->dist << endl; index++; } } @@ -111,10 +108,7 @@ void SparseMatrix::print(ListVector* list){ try { int index = 0; - mothurOutEndLine(); - mothurOut("Index\tRow\tColumn\tDistance"); - mothurOutEndLine(); - + mothurOutEndLine(); mothurOut("Index\tRow\tColumn\tDistance"); mothurOutEndLine(); for(MatData currentCell=matrix.begin();currentCell!=matrix.end();currentCell++){ mothurOut(toString(index) + "\t" + toString(list->get(currentCell->row)) + "\t" + toString(list->get(currentCell->column)) + "\t" + toString(currentCell->dist)); mothurOutEndLine(); @@ -159,7 +153,7 @@ PCell* SparseMatrix::getSmallestCell(){ } } - random_shuffle(mins.begin(), mins.end()); //randomize the order of the iterators in the mins vector +// random_shuffle(mins.begin(), mins.end()); //randomize the order of the iterators in the mins vector for(int i=0;ivectorMap = &mins[i]; //assign vectorMap to the address for the container diff --git a/sparsematrix.hpp b/sparsematrix.hpp index d65771e..8eeda7f 100644 --- a/sparsematrix.hpp +++ b/sparsematrix.hpp @@ -20,21 +20,23 @@ class ListVector; /***********************************************************************/ +typedef list::iterator MatData; + class SparseMatrix { public: SparseMatrix(); int getNNodes(); void print(); //Print the contents of the matrix - void print(ListVector*); //Print the contents of the matrix + void print(ListVector*); //Print the contents of the matrix PCell* getSmallestCell(); //Return the cell with the smallest distance float getSmallDist(); - void rmCell(list::iterator); + MatData rmCell(MatData); void addCell(PCell); void clear(); - list::iterator begin(); - list::iterator end(); + MatData begin(); + MatData end(); private: PCell* smallCell; //The cell with the smallest distance -- 2.39.2