]> git.donarmstrong.com Git - mothur.git/commitdiff
Thallinger changes to cluster command.
authorwestcott <westcott>
Thu, 30 Jul 2009 12:03:32 +0000 (12:03 +0000)
committerwestcott <westcott>
Thu, 30 Jul 2009 12:03:32 +0000 (12:03 +0000)
13 files changed:
averagelinkage.cpp
cluster.cpp
cluster.hpp
clustercommand.cpp
clustercommand.h
completelinkage.cpp
decalc.cpp
decalc.h
pintail.cpp
readdistcommand.cpp
singlelinkage.cpp
sparsematrix.cpp
sparsematrix.hpp

index 26283ff800ca1241be49d4e7eaf18a97d89617d4..7a4cb88d366d4f9765d906ce0fc7ea27600a6fdd 100644 (file)
 /***********************************************************************/
 
 AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) :
-Cluster(rav, lv, dm)
-{}
+       Cluster(rav, lv, dm)
+{
+       saveRow = -1;
+       saveCol = -1;
+}
+
+
+/***********************************************************************/
+//This function returns the tag of the method.
+string AverageLinkage::getTag() {
+       return("an");
+}
+
 
 /***********************************************************************/
-//THis function clusters based on the average method
-void AverageLinkage::update(){
-       try{
-               getRowColCells();               
-       
-               vector<int> found(nColCells, 0);
-       
-               int rowBin = rabund->get(smallRow);
-               int colBin = rabund->get(smallCol);
-               int totalBin = rowBin + colBin;
-       
-               for(int i=1;i<nRowCells;i++){
-               
-                       int search;
-               
-                       if(rowCells[i]->row == smallRow){
-                               search = rowCells[i]->column;
-                       }
-                       else{
-                               search = rowCells[i]->row;
-                       }
-               
-                       for(int j=1;j<nColCells;j++){
-                       
-                               if(colCells[j]->row == search || colCells[j]->column == search){
-                                       colCells[j]->dist = (colBin * colCells[j]->dist + rowBin * rowCells[i]->dist) / totalBin;               
-                               
-                                       found[j] = 1;
-                               
-                                       if(colCells[j]->vectorMap != NULL){
-                                               *(colCells[j]->vectorMap) = NULL;
-                                               colCells[j]->vectorMap = NULL;
-                                       }
-                               
-                                       break;
-                               }
-                       
-                       }                       
-                       dMatrix->rmCell(rowCells[i]);
-               }       
-       
-               clusterBins();
-               clusterNames();
-       
-               for(int i=0;i<nColCells;i++){
-                       if(found[i] == 0){
-                               dMatrix->rmCell(colCells[i]);
-                       }
+//This function updates the distance based on the average linkage method.
+bool AverageLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
+       try {
+               if ((saveRow != smallRow) || (saveCol != smallCol)) {
+                       rowBin = rabund->get(smallRow);
+                       colBin = rabund->get(smallCol);
+                       totalBin = rowBin + colBin;
+                       saveRow = smallRow;
+                       saveCol = smallCol;
                }
+
+               colCell->dist = (colBin * colCell->dist + rowBin * rowCell->dist) / totalBin;
+               return(true);
        }
        catch(exception& e) {
-               errorOut(e, "AverageLinkage", "update");
+               errorOut(e, "AverageLinkage", "updateDistance");
                exit(1);
        }
 }
 
 /***********************************************************************/
 
-#endif
 
+/***********************************************************************/
 
+#endif
index b9754bee094c28f7921506371fedf764c76fefd7..6fd463116e65becc0ef2a969c08d5bfaf6dfea15 100644 (file)
 Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) :
 rabund(rav), list(lv), dMatrix(dm)
 {
+/*
+       cout << "sizeof(MatData): " << sizeof(MatData) << endl;
+       cout << "sizeof(PCell*): " << sizeof(PCell*) << endl;
+
+       int nCells = dMatrix->getNNodes();
+       time_t start = time(NULL);
+
+       MatVec matvec = MatVec(nCells); 
+       int i = 0;
+       for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
+               matvec[i++] = currentCell;
+       }
+       for (i= matvec.size();i>0;i--) {
+               dMatrix->rmCell(matvec[i-1]);
+       }
+       MatData it = dMatrix->begin(); 
+       while (it != dMatrix->end()) { 
+               it = dMatrix->rmCell(it);
+       }
+       cout << "Time to remove " << nCells << " cells: " << time(NULL) - start << " seconds" << endl;
+    exit(0);
+       MatData it = dMatrix->begin();
+       cout << it->row << "/" << it->column << "/" << it->dist << endl;
+       dMatrix->rmCell(dMatrix->begin());
+       cout << it->row << "/" << it->column << "/" << it->dist << endl;
+       exit(0);
+*/
+
+       // Create a data structure to quickly access the PCell information
+       // for a certain sequence. It consists of a vector of lists, where 
+       // a list contains pointers (iterators) to the all distances related
+       // to a certain sequence. The Vector is accessed via the index of a 
+       // sequence in the distance matrix.
+       seqVec = vector<MatVec>(lv->size());
+       for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
+               seqVec[currentCell->row].push_back(currentCell);
+               seqVec[currentCell->column].push_back(currentCell);
+       }
 }
 
 /***********************************************************************/
 
-void Cluster::getRowColCells(){
+void Cluster::getRowColCells() {
        try {
                PCell* smallCell = dMatrix->getSmallestCell();  //find the smallest cell - this routine should probably not be in the SpMat class
        
-               smallRow = smallCell->row;              //get its row
-               smallCol = smallCell->column;   //get its column
-               smallDist = smallCell->dist;    //get the smallest distance
-       
-               rowCells.clear();
-               colCells.clear();
-               
-               for(MatData currentCell=dMatrix->begin();currentCell!=dMatrix->end();currentCell++){
-               
-                       if(&*currentCell == smallCell){                         //put the smallest cell first
-                               rowCells.insert(rowCells.begin(), currentCell);
-                               colCells.insert(colCells.begin(), currentCell);
-                       }
-                       else if(currentCell->row == smallRow){
-                               rowCells.push_back(currentCell);
-                       }
-                       else if(currentCell->column == smallRow){
-                               rowCells.push_back(currentCell);
-                       }
-                       else if(currentCell->row == smallCol){
-                               colCells.push_back(currentCell);
-                       }
-                       else if(currentCell->column == smallCol){
-                               colCells.push_back(currentCell);
-                       }
-               }
+               smallRow = smallCell->row;              // get its row
+               smallCol = smallCell->column;   // get its column
+               smallDist = smallCell->dist;    // get the smallest distance
        
+               rowCells = seqVec[smallRow];    // all distances related to the row index
+               colCells = seqVec[smallCol];    // all distances related to the column index
                nRowCells = rowCells.size();
                nColCells = colCells.size();
        }
@@ -59,39 +76,153 @@ void Cluster::getRowColCells(){
                errorOut(e, "Cluster", "getRowColCells");
                exit(1);
        }
+
 }
 
+// Remove the specified cell from the seqVec and from the sparse
+// matrix
+void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix)
+{
+       ull drow = cell->row;
+       ull dcol = cell->column;
+       if (((vrow >=0) && (drow != smallRow)) ||
+               ((vcol >=0) && (dcol != smallCol))) {
+               ull dtemp = drow;
+               drow = dcol;
+               dcol = dtemp;
+       }
+
+       ull crow;
+       ull ccol;
+       int nCells;
+       if (vrow < 0) {
+               nCells = seqVec[drow].size();
+               for (vrow=0; vrow<nCells;vrow++) {
+                       crow = seqVec[drow][vrow]->row;
+                       ccol = seqVec[drow][vrow]->column;
+                       if (((crow == drow) && (ccol == dcol)) ||
+                               ((ccol == drow) && (crow == dcol))) {
+                               break;
+                       }
+               }
+       }
+       seqVec[drow].erase(seqVec[drow].begin()+vrow);
+       if (vcol < 0) {
+               nCells = seqVec[dcol].size();
+               for (vcol=0; vcol<nCells;vcol++) {
+                       crow = seqVec[dcol][vcol]->row;
+                       ccol = seqVec[dcol][vcol]->column;
+                       if (((crow == drow) && (ccol == dcol)) ||
+                               ((ccol == drow) && (crow == dcol))) {
+                               break;
+                       }
+               }
+       }
+       seqVec[dcol].erase(seqVec[dcol].begin()+vcol);
+       if (rmMatrix) {
+               dMatrix->rmCell(cell);
+       }
+}
+
+
 /***********************************************************************/
 
 void Cluster::clusterBins(){
        try {
-       
+       //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
+
                rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol));     
                rabund->set(smallRow, 0);       
                rabund->setLabel(toString(smallDist));
 
+       //      cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
        }
        catch(exception& e) {
                errorOut(e, "Cluster", "clusterBins");
                exit(1);
        }
+
+
 }
 
 /***********************************************************************/
 
 void Cluster::clusterNames(){
        try {
-       
+       //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
+
                list->set(smallCol, list->get(smallRow)+','+list->get(smallCol));
                list->set(smallRow, "");        
                list->setLabel(toString(smallDist));
        
+       //      cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
     }
        catch(exception& e) {
                errorOut(e, "Cluster", "clusterNames");
                exit(1);
        }
+
 }
 
+/***********************************************************************/
+//This function clusters based on the method of the derived class
+//At the moment only average and complete linkage are covered, because
+//single linkage uses a different approach.
+void Cluster::update(){
+       try {
+               getRowColCells();       
+       
+               vector<int> found(nColCells, 0);
+               int search;
+               bool changed;
+
+               // The vector has to be traversed in reverse order to preserve the index
+               // for faster removal in removeCell()
+               for (int i=nRowCells-1;i>=0;i--) {
+                       if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) {
+                               if (rowCells[i]->row == smallRow) {
+                                       search = rowCells[i]->column;
+                               } else {
+                                       search = rowCells[i]->row;
+                               }
+               
+                               for (int j=0;j<nColCells;j++) {
+                                       if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) {
+                                               if (colCells[j]->row == search || colCells[j]->column == search) {
+                                                       found[j] = 1;
+                                                       changed = updateDistance(colCells[j], rowCells[i]);
+                                                       // If the cell's distance changed and it had the same distance as 
+                                                       // the smallest distance, invalidate the mins vector in SparseMatrix
+                                                       if (changed) {
+                                                               if (colCells[j]->vectorMap != NULL) {
+                                                                       *(colCells[j]->vectorMap) = NULL;
+                                                                       colCells[j]->vectorMap = NULL;
+                                                               }
+                                                       }
+                                                       break;
+                                               }
+                                       }
+                               }
+                               removeCell(rowCells[i], i , -1);
+                       }
+               }
+               clusterBins();
+               clusterNames();
+
+               // Special handling for singlelinkage case, not sure whether this
+               // could be avoided
+               for (int i=nColCells-1;i>=0;i--) {
+                       if (found[i] == 0) {
+                               removeCell(colCells[i], -1, i);
+                       }
+               }
+       }
+       catch(exception& e) {
+               errorOut(e, "Cluster", "update");
+               exit(1);
+       }
+}
+
+
 /***********************************************************************/
 
index de86f85502efc3b6416f5877102c16ca9e5694be..99d8b7d557a41cef684ad2f9aae7d636d621bbc7 100644 (file)
@@ -7,18 +7,22 @@
 
 class RAbundVector;
 class ListVector;
-class SparseMatrix;
 
-typedef list<PCell>::iterator MatData;
+typedef vector<MatData> MatVec;
 
 class Cluster {
        
 public:
        Cluster(RAbundVector*, ListVector*, SparseMatrix*);
-       virtual void update() = 0;
-       
+    virtual void update();
+       virtual string getTag() = 0;
+
 protected:     
        void getRowColCells();
+    void removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix=true);
+
+       virtual bool updateDistance(MatData& colCell, MatData& rowCell) = 0;
+
        virtual void clusterBins();
        virtual void clusterNames();
        
@@ -29,8 +33,10 @@ protected:
        int smallRow;
        int smallCol;
        float smallDist;
-       vector<MatData> rowCells;
-       vector<MatData> colCells;
+       
+       vector<MatVec> seqVec;          // contains vectors of cells related to a certain sequence\r
+       MatVec rowCells;
+       MatVec colCells;
        ull nRowCells;
        ull nColCells;
 };
@@ -40,7 +46,8 @@ protected:
 class CompleteLinkage : public Cluster {
 public:
        CompleteLinkage(RAbundVector*, ListVector*, SparseMatrix*);
-       void update();
+       bool updateDistance(MatData& colCell, MatData& rowCell);
+       string getTag();
        
 private:
                
@@ -51,7 +58,9 @@ private:
 class SingleLinkage : public Cluster {
 public:
        SingleLinkage(RAbundVector*, ListVector*, SparseMatrix*);
-       void update();
+    void update();
+       bool updateDistance(MatData& colCell, MatData& rowCell);
+       string getTag();
        
 private:
                
@@ -62,10 +71,16 @@ private:
 class AverageLinkage : public Cluster {
 public:
        AverageLinkage(RAbundVector*, ListVector*, SparseMatrix*);
-       void update();
+       bool updateDistance(MatData& colCell, MatData& rowCell);
+       string getTag();
        
 private:
-               
+       int saveRow;
+       int saveCol;
+       int rowBin;
+       int colBin;
+       int totalBin;
+
 };
 
 /***********************************************************************/
index 593f2629ff9f1107ddaf6bd10c568047118c1070..d3b9bbf5a45d182e24b3ff8cc8bc45f6fcbc896a 100644 (file)
@@ -21,7 +21,7 @@ ClusterCommand::ClusterCommand(string option){
                
                else {
                        //valid paramters for this command
-                       string Array[] =  {"cutoff","precision","method"};
+                       string Array[] =  {"cutoff","precision","method","showabund","timing"};
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -31,33 +31,43 @@ ClusterCommand::ClusterCommand(string option){
                
                        //check to make sure all parameters are valid for command
                        for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
-                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
+                                       abort = true;
+                               }
                        }
                        
                        //error checking to make sure they read a distance file
                        if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
-                               mothurOut("Before you use the cluster command, you first need to read in a distance matrix."); mothurOutEndLine(); abort = true;
+                               mothurOut("Before you use the cluster command, you first need to read in a distance matrix."); mothurOutEndLine();
+                               abort = true;
                        } 
                
                        //check for optional parameter and set defaults
                        // ...at some point should added some additional type checking...
                        //get user cutoff and precision or use defaults
                        string temp;
-                       temp = validParameter.validFile(parameters, "precision", false);                if (temp == "not found") { temp = "100"; }
+                       temp = validParameter.validFile(parameters, "precision", false);
+                       if (temp == "not found") { temp = "100"; }
                        //saves precision legnth for formatting below
                        length = temp.length();
                        convert(temp, precision); 
                        
-                       temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10"; }
+                       temp = validParameter.validFile(parameters, "cutoff", false);
+                       if (temp == "not found") { temp = "10"; }
                        convert(temp, cutoff); 
                        cutoff += (5 / (precision * 10.0));
                        
-                       method = validParameter.validFile(parameters, "method", false);                 if (method == "not found") { method = "furthest"; }
-
+                       method = validParameter.validFile(parameters, "method", false);
+                       if (method == "not found") { method = "furthest"; }
                        
                        if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
                        else { mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
 
+                       showabund = validParameter.validFile(parameters, "showabund", false);
+                       if (showabund == "not found") { showabund = "T"; }
+
+                       timing = validParameter.validFile(parameters, "timing", false);
+                       if (timing == "not found") { timing = "F"; }
                        
                        if (abort == false) {
                        
@@ -70,25 +80,21 @@ ClusterCommand::ClusterCommand(string option){
                                }
                                
                                //create cluster
-                               if(method == "furthest")        {       cluster = new CompleteLinkage(rabund, list, matrix);    tag = "fn";     }
-                               else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix);              tag = "nn";     }
-                               else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix);             tag = "an";     }
-                               else                                            {       mothurOut("error - not recognized method"); mothurOutEndLine(); abort = true;   }       
-                               
+                               if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix); }
+                               else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix); }
+                               else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix);     }
+                               tag = cluster->getTag();
+
                                fileroot = getRootName(globaldata->inputFileName);
                        
                                openOutputFile(fileroot+ tag + ".sabund",       sabundFile);
                                openOutputFile(fileroot+ tag + ".rabund",       rabundFile);
                                openOutputFile(fileroot+ tag + ".list",         listFile);
-                               
-                               
                        }
-
                }
-               
        }
        catch(exception& e) {
-               errorOut(e, "ClusterCommand", "ClusterCommand");                
+               errorOut(e, "ClusterCommand", "ClusterCommand");
                exit(1);
        }
 }
@@ -97,11 +103,11 @@ ClusterCommand::ClusterCommand(string option){
 
 void ClusterCommand::help(){
        try {
-                mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
-                mothurOut("The cluster command parameter options are method, cuttoff and precision. No parameters are required.\n");
-                mothurOut("The cluster command should be in the following format: \n");
-                mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
-                mothurOut("The acceptable cluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");
+               mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
+               mothurOut("The cluster command parameter options are method, cuttoff, precision, showabund and timing. No parameters are required.\n");
+               mothurOut("The cluster command should be in the following format: \n");
+               mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
+               mothurOut("The acceptable cluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n"); 
        }
        catch(exception& e) {
                errorOut(e, "ClusterCommand", "help");
@@ -125,16 +131,28 @@ int ClusterCommand::execute(){
        
                if (abort == true) {    return 0;       }
                
+               time_t estart = time(NULL);
+               int ndist = matrix->getNNodes();
                float previousDist = 0.00000;
                float rndPreviousDist = 0.00000;
                oldRAbund = *rabund;
                oldList = *list;
+
+               print_start = true;
+               start = time(NULL);
+               loops = 0;
                
-               float x;
-               x=0.1;
-               toString(x, 2);
-       
-               while(matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
+               while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
+                       if (print_start && isTrue(timing)) {
+                               mothurOut("Clustering (" + tag + ") dist " + toString(matrix->getSmallDist()) + "/" 
+                                       + toString(roundDist(matrix->getSmallDist(), precision)) 
+                                       + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
+                               cout.flush();
+                               print_start = false;
+                       }
+
+                       loops++;
+
                        cluster->update();
                        float dist = matrix->getSmallDist();
                        float rndDist = roundDist(dist, precision);
@@ -151,6 +169,13 @@ int ClusterCommand::execute(){
                        oldRAbund = *rabund;
                        oldList = *list;
                }
+
+               if (print_start && isTrue(timing)) {
+                       mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist) 
+                                        + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
+                       cout.flush();
+                       print_start = false;
+               }
        
                if(previousDist <= 0.0000){
                        printData("unique");
@@ -174,7 +199,9 @@ int ClusterCommand::execute(){
                sabundFile.close();
                rabundFile.close();
                listFile.close();
-               
+               if (isTrue(timing)) {
+                       mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster " + toString(ndist) + " distances"); mothurOutEndLine();
+               }
                return 0;
        }
        catch(exception& e) {
@@ -187,8 +214,18 @@ int ClusterCommand::execute(){
 
 void ClusterCommand::printData(string label){
        try {
+               if (isTrue(timing)) {
+                       mothurOut("\tTime: " + toString(time(NULL) - start) + "\tsecs for " + toString(oldRAbund.getNumBins()) 
+                    + "\tclusters. Updates: " + toString(loops)); mothurOutEndLine();
+               }
+               print_start = true;
+               loops = 0;
+               start = time(NULL);
+
                oldRAbund.setLabel(label);
-               oldRAbund.getSAbundVector().print(cout);
+               if (isTrue(showabund)) {
+                       oldRAbund.getSAbundVector().print(cout);
+               }
                oldRAbund.print(rabundFile);
                oldRAbund.getSAbundVector().print(sabundFile);
        
@@ -199,5 +236,7 @@ void ClusterCommand::printData(string label){
                errorOut(e, "ClusterCommand", "printData");
                exit(1);
        }
+
+
 }
 //**********************************************************************************************************************
index 44ce6f9f6833a8fd5181bcff64880bfe8f4d4f88..ad9fe85d9977f4c3ffb74d65bfa58ecc995f710c 100644 (file)
@@ -26,8 +26,6 @@
        The cluster() command outputs three files *.list, *.rabund, and *.sabund.   */
 
 
-class GlobalData;
-
 class ClusterCommand : public Command {
        
 public:
@@ -49,8 +47,13 @@ private:
 
        string method, fileroot, tag;
        double cutoff;
+       string showabund, timing;
        int precision, length;
        ofstream sabundFile, rabundFile, listFile;
+
+       bool print_start;
+       time_t start;
+       unsigned long loops;
        
        void printData(string label);
 };
index f69753b34a3ecdfb74fbd9e5fe068954c1cd493f..09ba9639fdbf4146fc955a1c2ac23e696bb62cd6 100644 (file)
@@ -1,64 +1,32 @@
 
 #include "cluster.hpp"
-#include "mothur.h"
 
 /***********************************************************************/
 
 CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) :
-Cluster(rav, lv, dm)
+       Cluster(rav, lv, dm)
 {}
 
 /***********************************************************************/
-//This function clusters based on the furthest neighbor method.
-void CompleteLinkage::update(){
+//This function returns the tag of the method.
+string CompleteLinkage::getTag() {
+       return("fn");
+}
+
+
+/***********************************************************************/
+//This function updates the distance based on the furthest neighbor method.
+bool CompleteLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
        try {
-               getRowColCells();       
-       
-               vector<int> found(nColCells, 0);
-       
-               for(int i=1;i<nRowCells;i++){
-               
-                       int search;
-               
-                       if(rowCells[i]->row == smallRow){
-                               search = rowCells[i]->column;
-                       }
-                       else{
-                               search = rowCells[i]->row;
-                       }
-               
-                       for(int j=1;j<nColCells;j++){
-                       
-                               if(colCells[j]->row == search || colCells[j]->column == search){
-                               
-                                       if(colCells[j]->dist < rowCells[i]->dist){
-                                               colCells[j]->dist = rowCells[i]->dist;
-                                       
-                                               if(colCells[j]->vectorMap != NULL){
-                                                       *(colCells[j]->vectorMap) = NULL;
-                                                       colCells[j]->vectorMap = NULL;
-                                               }
-                                       
-                                       }
-                               
-                                       found[j] = 1;
-                                       break;
-                               }
-                       }
-                       dMatrix->rmCell(rowCells[i]);
-               
+               bool changed = false;
+               if (colCell->dist < rowCell->dist) {
+                       colCell->dist = rowCell->dist;
+                       changed = true;
                }       
-               clusterBins();
-               clusterNames();
-       
-               for(int i=0;i<nColCells;i++){
-                       if(found[i] == 0){
-                               dMatrix->rmCell(colCells[i]);
-                       }
-               }
+               return(changed);
        }
        catch(exception& e) {
-               errorOut(e, "CompleteLinkage", "update");
+               errorOut(e, "CompleteLinkage", "updateDistance");
                exit(1);
        }
 }
index 1a7349492a99879dc3bb334893ff8ee7f36ae0bd..203482f0819ca5658656d582a4412ed4e0b48632 100644 (file)
@@ -59,11 +59,13 @@ void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& t
                
                int front = 0;
                for (int i = 0; i < q.length(); i++) {
+//cout << "query = " << q[i] << " subject = " << s[i] << endl;
                        if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
                }
-               
+//cout << endl << endl;                
                int back = 0;           
                for (int i = q.length(); i >= 0; i--) {
+//cout << "query = " << q[i] << " subject = " << s[i] << endl;
                        if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
                }
                
@@ -97,10 +99,10 @@ vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int
                //count bases
                int numBases = 0;
                for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
-                       
+//cout << "num Bases = " << numBases << endl;                  
                //save start of seq
                win.push_back(front);
-               
+//cout << front << '\t';               
                //move ahead increment bases at a time until all bases are in a window
                int countBases = 0;
                int totalBases = 0;  //used to eliminate window of blanks at end of sequence
@@ -109,18 +111,27 @@ vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int
                for (int m = front; m < (back - size) ; m++) {
                                
                        //count number of bases you see
-                       if (isalpha(seq[m])) { countBases++; totalBases++;  }
+                       if (isalpha(seq[m])) { countBases++;  }
                                
                        //if you have seen enough bases to make a new window
                        if (countBases >= increment) {
+                               //total bases is the number of bases in a window already.
+                               totalBases += countBases;
+//cout << "total bases = " << totalBases << endl;
                                win.push_back(m);  //save spot in alignment
+//cout << m << '\t';
                                countBases = 0;                         //reset bases you've seen in this window
                        }
                                
                        //no need to continue if all your bases are in a window
                        if (totalBases == numBases) {   break;  }
                }
-                       
+       
+
+               //get last window if needed
+               if (totalBases < numBases) {   win.push_back(back-size);  cout << back-size << endl;}
+//cout << endl << endl;
+               
                return win;
        
        }
@@ -239,6 +250,12 @@ vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
                
                openOutputFile(freqfile, outFreq);
                
+               string length = toString(seqs.size());  //if there are 5000 seqs in the template then set precision to 3
+               int precision = length.length() - 1;
+               
+               //format output
+               outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
+               
                //at each position in the sequence
                for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
                        
@@ -261,11 +278,8 @@ vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
                        int highest = 0;
                        for (int m = 0; m < freq.size(); m++) {   if (freq[m] > highest) {  highest = freq[m];  }               }
                        
-                       float highFreq;
-                       //subtract gaps to "ignore them"
-                       if ( (seqs.size() - gaps) == 0 ) {  highFreq = 1.0;  }                  
-                       else { highFreq = highest / (float) (seqs.size() - gaps);        }
-                                               
+                       float highFreq = highest / (float) (seqs.size());       
+                       
                        float Pi;
                        Pi =  (highFreq - 0.25) / 0.75; 
                        
@@ -273,10 +287,9 @@ vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
                        if (Pi < 0) { Pi = 0.0; }
                        
                        //saves this for later
-                       outFreq << i+1 << '\t' << highFreq << endl;
-                       
+                       outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
+       
                        if (h.count(i) > 0) {
-       cout << i+1 << '\t' << highFreq << endl;
                                prob.push_back(Pi); 
                        }
                }
@@ -387,6 +400,52 @@ vector< vector<float> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vecto
                exit(1);
        }
 }
+//***************************************************************************************************************
+void DeCalculator::removeObviousOutliers(vector< vector<float> >& quantiles) {
+       try {
+               
+       
+               for (int i = 0; i < quantiles.size(); i++) {
+               
+                       //find mean of this quantile score
+                       float average = findAverage(quantiles[i]);
+                       
+                       vector<float> newQuanI;
+                       //look at each value in quantiles to see if it is an outlier
+                       for (int j = 0; j < quantiles[i].size(); j++) {
+                               
+                               float highCutoff, lowCutOff;
+                               
+                               
+                       
+                       }
+               
+               }
+
+       }
+       catch(exception& e) {
+               errorOut(e, "DeCalculator", "removeObviousOutliers");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+float DeCalculator::findAverage(vector<float> myVector) {
+       try{
+               
+               float total = 0.0;
+               for (int i = 0; i < myVector.size(); i++)  {  total += myVector[i];  }
+               
+               float average = total / (float) myVector.size();
+               
+               return average;
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "DeCalculator", "findAverage");
+               exit(1);
+       }
+}
 
 //***************************************************************************************************************
 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
@@ -402,7 +461,8 @@ float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
                for (int j = 0; j < obs.size(); j++) {   obsAverage += obs[j];  }
                obsAverage = obsAverage / (float) obs.size();
 //cout << "sum ai / m = " << probAverage << endl;              
-//cout << "sum oi / m = " << obsAverage << endl;
+//cout << "sum oi / m = " << obsAverage << endl;       
+       
                float coef = obsAverage / probAverage;
                                                
                return coef;
index 9309c37378d8c08c0fc7e1c270f272a153e555f4..1579d86cb8950c2cbbc857407233e1014283344f 100644 (file)
--- a/decalc.h
+++ b/decalc.h
@@ -31,6 +31,7 @@ class DeCalculator {
                void setMask(string m); 
                void runMask(Sequence*);
                void trimSeqs(Sequence*, Sequence*, map<int, int>&);
+               void removeObviousOutliers(vector< vector<float> >&);
                vector<float> calcFreq(vector<Sequence*>, string);
                vector<int> findWindows(Sequence*, int, int, int&, int);
                vector<float> calcObserved(Sequence*, Sequence*, vector<int>, int);
@@ -42,6 +43,7 @@ class DeCalculator {
                vector< vector<float> > getQuantiles(vector<Sequence*>, vector<int>, int, vector<float>, int, int, int);
                
        private:
+               float findAverage(vector<float> myVector);
                string seqMask;
                set<int> h;
 
index b7e5b5fb137fa45b977766c713efafb9bee03cef..bca7df96deabdd28a8f9e83423b6c1608073215e 100644 (file)
@@ -34,6 +34,11 @@ void Pintail::print(ostream& out) {
                for (int i = 0; i < querySeqs.size(); i++) {
                        
                        int index = ceil(deviation[i]);
+float quan = 2.64 * log10(deviation[i]) + 1.46;
+cout << "dist = " << index << endl;
+cout << "de = "  << DE[i] << endl;
+cout << "mallard quantile = " << quan << endl;
+cout << "my quantile = " << quantiles[index][4] << endl;
                        
                        //is your DE value higher than the 95%
                        string chimera;
@@ -158,7 +163,7 @@ void Pintail::getChimeras() {
                }else                           {   probabilityProfile = readFreq();                      }
 
                //make P into Q
-               for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];       cout << i << '\t' << probabilityProfile[i] << endl;  }
+               for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];   }  //cout << i << '\t' << probabilityProfile[i] << endl;
                mothurOut("Done."); mothurOutEndLine();
                
                //mask querys
@@ -270,6 +275,9 @@ cout << querySeqs[i]->getName() << '\t' << "dist value = " << dist << endl;
                                quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
                        }else {         createProcessesQuan();          }
                        
+                       
+                       decalc->removeObviousOutliers(quantiles);
+                       
                        ofstream out4;
                        string o = getRootName(templateFile) + "quan";
                        
@@ -344,7 +352,7 @@ vector<float> Pintail::readFreq() {
                        
                        in >> pos >> num;
                        
-                       if (h.count(pos-1) > 0) {
+                       if (h.count(pos) > 0) {
                                float Pi;
                                Pi =  (num - 0.25) / 0.75; 
                        
index 088f711e0d6dd6f191493bbaf802db3a7bae1fbb..579f848bdb75b10b1350d1eb61b589de8f543ee0 100644 (file)
@@ -70,7 +70,7 @@ ReadDistCommand::ReadDistCommand(string option){
                        else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a read.dist command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
                
                        if (columnfile != "") {
-                               if (namefile == "") {  mothurOut("You need to provide a namefile if you are going to use the column format."); mothurOutEndLine(); abort = true; }
+                               if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
                        }
                
                        //check for optional parameter and set defaults
@@ -129,7 +129,7 @@ void ReadDistCommand::help(){
                mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
                mothurOut("The second way to use the read.dist command is to read a phylip or column and a group, so you can use the libshuff command.\n");
                mothurOut("For this use the read.dist command should be in the following format: \n");
-               mothurOut("read.dist(phylip=yourPhylipfile, group=yourGroupFile). The cutoff and precision parameters are not valid with this use. \n");
+               mothurOut("read.dist(phylip=yourPhylipfile, group=yourGroupFile). The cutoff and precision parameters are not valid with this use.  \n");
                mothurOut("Note: No spaces between parameter labels (i.e. phylip), '=' and parameters (i.e.yourPhylipfile).\n\n");
        }
        catch(exception& e) {
@@ -141,6 +141,9 @@ void ReadDistCommand::help(){
 //**********************************************************************************************************************
 
 ReadDistCommand::~ReadDistCommand(){
+       if (abort == false) {
+               if (format != "matrix") { delete read; delete nameMap; }
+       }
 }
 
 //**********************************************************************************************************************
@@ -148,6 +151,9 @@ int ReadDistCommand::execute(){
        try {
                
                if (abort == true) {    return 0;       }
+
+               time_t start = time(NULL);
+               size_t numDists = 0;
                
                if (format == "matrix") {
                        ifstream in;
@@ -157,7 +163,8 @@ int ReadDistCommand::execute(){
                        //memory leak prevention
                        if (globaldata->gMatrix != NULL) { delete globaldata->gMatrix;  }
                        globaldata->gMatrix = matrix; //save matrix for coverage commands
-               }else {
+                       numDists = matrix->getSizes()[1];
+               } else {
                        read->read(nameMap);
                        //to prevent memory leak
 
@@ -166,10 +173,36 @@ int ReadDistCommand::execute(){
 
                        if (globaldata->gSparseMatrix != NULL) { delete globaldata->gSparseMatrix;  }
                        globaldata->gSparseMatrix = read->getMatrix();
-                       delete read; delete nameMap;
+                       numDists = globaldata->gSparseMatrix->getNNodes();
+      int lines = cutoff / (1.0/precision);
+      vector<float> dist_cutoff(lines+1,0);
+                       for (int i = 0; i <= lines;i++) {       
+       dist_cutoff[i] = (i + 0.5) / precision; 
+      } 
+      vector<int> dist_count(lines+1,0);
+      list<PCell>::iterator currentCell;
+      SparseMatrix* smatrix = globaldata->gSparseMatrix;
+               for (currentCell = smatrix->begin(); currentCell != smatrix->end(); currentCell++) {
+                               for (int i = 0; i <= lines;i++) {       
+                                       if (currentCell->dist < dist_cutoff[i]) {
+                                               dist_count[i]++;
+            break;
+          }
+        }
+                       }
 
+      string dist_string = "Dist:";
+      string count_string = "Count: ";
+                       for (int i = 0; i <= lines;i++) {       
+       dist_string = dist_string.append("\t").append(toString(dist_cutoff[i]));
+       count_string = count_string.append("\t").append(toString(dist_count[i]));
+                       }
+      mothurOut(dist_string); mothurOutEndLine(); mothurOut(count_string); mothurOutEndLine();
                }
+               mothurOut("It took " + toString(time(NULL) - start) + " secs to read " + toString(numDists) + " distances (cutoff: " + toString(cutoff) + ")"); mothurOutEndLine();
                return 0;
+               
        }
        catch(exception& e) {
                errorOut(e, "ReadDistCommand", "execute");
index 5332e907d27b86d13f40c411baa4a96cbf879f40..c27b14e727057c6dcd14f2b0268cd250c4d52559 100644 (file)
@@ -9,54 +9,75 @@ SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm
 Cluster(rav, lv, dm)
 {}
 
+
 /***********************************************************************/
-//This function clusters based on nearest method.
-void SingleLinkage::update(){
+//This function returns the tag of the method.
+string SingleLinkage::getTag() {
+       return("nn");
+}
+
+/***********************************************************************/
+//This function clusters based on the single linkage method.
+void  SingleLinkage::update(){
        try {
-               getRowColCells();               
+               getRowColCells();       
        
-               for(int i=1;i<nRowCells;i++){
-               
-                       int search;
-               
-                       if(rowCells[i]->row == smallRow){
-                               search = rowCells[i]->column;
-                       }
-                       else{
-                               search = rowCells[i]->row;
-                       }
+               vector<bool> deleted(nRowCells, false);
+               int rowInd;
+               int search;
+               bool changed;
+
+               // The vector has to be traversed in reverse order to preserve the index
+               // for faster removal in removeCell()
+               for (int i=nRowCells-1;i>=0;i--) {
+                       if ((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol)) {
+                               rowInd = i;   // The index of the smallest distance cell in rowCells
+                       } else {
+                               if (rowCells[i]->row == smallRow) {
+                                       search = rowCells[i]->column;
+                               } else {
+                                       search = rowCells[i]->row;
+                               }
                
-                       for(int j=1;j<nColCells;j++){
-                       
-                               if(colCells[j]->row == search || colCells[j]->column == search){
-                               
-                                       if(colCells[j]->dist > rowCells[i]->dist){
-                                               colCells[j]->dist = rowCells[i]->dist;
-                                       
-                                               if(colCells[j]->vectorMap != NULL){
-                                                       *(colCells[j]->vectorMap) = NULL;
-                                                       colCells[j]->vectorMap = NULL;
+                               for (int j=0;j<nColCells;j++) {
+                                       if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) {
+                                               if (colCells[j]->row == search || colCells[j]->column == search) {
+                                                       changed = updateDistance(colCells[j], rowCells[i]);
+                                                       // If the cell's distance changed and it had the same distance as 
+                                                       // the smallest distance, invalidate the mins vector in SparseMatrix
+                                                       if (changed) {
+                                                               if (colCells[j]->vectorMap != NULL) {
+                                                                       *(colCells[j]->vectorMap) = NULL;
+                                                                       colCells[j]->vectorMap = NULL;
+                                                               }
+                                                       }
+                                                       removeCell(rowCells[i], i , -1);
+                                                       deleted[i] = true;
+                                                       break;
                                                }
-                                       
                                        }
-                                       dMatrix->rmCell(rowCells[i]);
-                                       break;
                                }
-                       }                       
-               
-                       if(search < smallCol){
-                               rowCells[i]->row = smallCol;
-                               rowCells[i]->column = search;
-                       }
-                       else{
-                               rowCells[i]->row = search;
-                               rowCells[i]->column = smallCol;
-                       }
-               
-               }       
+                               if (!deleted[i]) {
+                                       // Assign the cell to the new cluster 
+                                       // remove the old cell from seqVec and add the cell
+                                       // with the new row and column assignment again
+                                       removeCell(rowCells[i], i , -1, false);
+                                       if (search < smallCol){
+                                               rowCells[i]->row = smallCol;
+                                               rowCells[i]->column = search;
+                                       } else {
+                                               rowCells[i]->row = search;
+                                               rowCells[i]->column = smallCol;
+                                       }
+                                       seqVec[rowCells[i]->row].push_back(rowCells[i]);
+                                       seqVec[rowCells[i]->column].push_back(rowCells[i]);
+                               }
+                       }       
+               }
                clusterBins();
                clusterNames();
-               dMatrix->rmCell(rowCells[0]);
+               // remove also the cell with the smallest distance
+               removeCell(rowCells[rowInd], -1 , -1);
        }
        catch(exception& e) {
                errorOut(e, "SingleLinkage", "update");
@@ -64,4 +85,20 @@ void SingleLinkage::update(){
        }
 }
 
+
+/***********************************************************************/
+//This function updates the distance based on the nearest neighbor method.
+bool SingleLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
+       try {
+               bool changed = false;
+               if (colCell->dist > rowCell->dist) {
+                       colCell->dist = rowCell->dist;
+               }
+               return(changed);
+       }
+       catch(exception& e) {
+               errorOut(e, "SingleLinkage", "updateDistance");
+               exit(1);
+       }
+}
 /***********************************************************************/
index 7f1c3c8564d29c6b1c551f7ee9bd16b58636af0a..8a873f74c2441c4dd50f20450ebe4882227ce32b 100644 (file)
@@ -2,7 +2,6 @@
 #include "sparsematrix.hpp"
 #include "listvector.hpp"
 
-typedef list<PCell>::iterator MatData;
 
 /***********************************************************************/
 
@@ -22,15 +21,15 @@ float SparseMatrix::getSmallDist(){
 
 /***********************************************************************/
 
-void SparseMatrix::rmCell(MatData data){
+MatData SparseMatrix::rmCell(MatData data){
        try {
                if(data->vectorMap != NULL ){
                        *(data->vectorMap) = NULL;
                        data->vectorMap = NULL;
                }
-               matrix.erase(data);             
+               data = matrix.erase(data);              
                numNodes--;
-       
+               return(data);
        //  seems like i should be updating smallDist here, but the only time we remove cells is when
        //  clustering and the clustering algorithm updates smallDist
        }
@@ -89,13 +88,11 @@ MatData SparseMatrix::end(){
 void SparseMatrix::print(){
        try {
                int index = 0;
-               
-               mothurOutEndLine();
-               mothurOut("Index\tRow\tColumn\tDistance");
-               mothurOutEndLine();
+       
+               cout << endl << "Index\tRow\tColumn\tDistance" << endl;
        
                for(MatData currentCell=matrix.begin();currentCell!=matrix.end();currentCell++){
-                       mothurOut(toString(index) + "\t" + toString(currentCell->row)  + "\t" + toString(currentCell->column) + "\t" + toString(currentCell->dist)); mothurOutEndLine();
+                       cout << index << '\t' << currentCell->row  << '\t' << currentCell->column << '\t' << currentCell->dist << endl;
                        index++;
                }
        }
@@ -111,10 +108,7 @@ void SparseMatrix::print(ListVector* list){
        try {
                int index = 0;
        
-               mothurOutEndLine();
-               mothurOut("Index\tRow\tColumn\tDistance");
-               mothurOutEndLine();
-
+               mothurOutEndLine(); mothurOut("Index\tRow\tColumn\tDistance"); mothurOutEndLine();
        
                for(MatData currentCell=matrix.begin();currentCell!=matrix.end();currentCell++){
                        mothurOut(toString(index) + "\t" + toString(list->get(currentCell->row))  + "\t" + toString(list->get(currentCell->column)) + "\t" + toString(currentCell->dist)); mothurOutEndLine();
@@ -159,7 +153,7 @@ PCell* SparseMatrix::getSmallestCell(){
                                }
 
                        }
-                       random_shuffle(mins.begin(), mins.end());  //randomize the order of the iterators in the mins vector
+//                     random_shuffle(mins.begin(), mins.end());  //randomize the order of the iterators in the mins vector
 
                        for(int i=0;i<mins.size();i++){
                                mins[i]->vectorMap = &mins[i];  //assign vectorMap to the address for the container
index d65771e75a4d8a2785535fd2ae9cd493f2c1aa1f..8eeda7ff2f9899748bde34f92fac5a25dcfd3964 100644 (file)
@@ -20,21 +20,23 @@ class ListVector;
 
 /***********************************************************************/
 
+typedef list<PCell>::iterator MatData;
+
 class SparseMatrix {
        
 public:
        SparseMatrix();
        int getNNodes();
        void print();                                   //Print the contents of the matrix
-       void print(ListVector*);                                        //Print the contents of the matrix
+       void print(ListVector*);                //Print the contents of the matrix
        PCell* getSmallestCell();               //Return the cell with the smallest distance
        float getSmallDist();
        
-       void rmCell(list<PCell>::iterator);
+       MatData rmCell(MatData);
        void addCell(PCell);
        void clear();
-       list<PCell>::iterator begin();
-       list<PCell>::iterator end();
+       MatData begin();
+       MatData end();
 
 private:
        PCell* smallCell;                               //The cell with the smallest distance