]> git.donarmstrong.com Git - mothur.git/blobdiff - clusterclassic.cpp
fixes while testing 1.33.0
[mothur.git] / clusterclassic.cpp
index d0a63b1e2e4d47782839844595727d4420a9e873..32a9341613d07274d397c1ea9cc9d6b8d9383826 100644 (file)
 #include "progress.hpp"
 
 /***********************************************************************/
-ClusterClassic::ClusterClassic(float c, string f) : method(f), smallDist(1e6), nseqs(0) {
+ClusterClassic::ClusterClassic(float c, string f, bool s) : method(f), smallDist(1e6), nseqs(0), sim(s) {
        try {
                mapWanted = false;  //set to true by mgcluster to speed up overlap merge
        
                //save so you can modify as it changes in average neighbor
                cutoff = c;
-               aboveCutoff = cutoff + 1.0;
+               aboveCutoff = cutoff + 10000.0;
                m = MothurOut::getInstance();
-               globaldata = GlobalData::getInstance();
+        if(method == "furthest")        {   tag = "fn";   }
+        else if (method == "average")   {   tag = "an";   }
+        else if (method == "weighted")  {   tag = "wn";   }        
+        else if (method == "nearest")   {   tag = "nn";   }
        }
        catch(exception& e) {
                m->errorOut(e, "ClusterClassic", "ClusterClassic");
@@ -37,7 +40,12 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
                ifstream fileHandle;
                m->openInputFile(filename, fileHandle);
                
-               fileHandle >> nseqs >> name;
+        string numTest;
+               fileHandle >> numTest >> name;
+        
+        if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
+        else { convert(numTest, nseqs); }
+
 
                matrixNames.push_back(name);
 
@@ -102,14 +110,14 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
                                                                                fileHandle >> distance;
                                                        
                                                                                if (distance == -1) { distance = 1000000; }
-                                                                               else if (globaldata->sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
+                                                                               else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
                                                                
-                                                                               if(distance < cutoff){
+                                                                               //if(distance < cutoff){
                                                                                        dMatrix[i][j] = distance;
                                                                                        if (distance < smallDist) { smallDist = distance; }
                                                                                        //if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
                                                                                        //if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
-                                                                               }
+                                                                               //}
                                                                                index++;
                                                                                reading->update(index);
                                                                }
@@ -124,9 +132,9 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
                                                                                if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
                                
                                                                                if (distance == -1) { distance = 1000000; }
-                                                                               else if (globaldata->sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
+                                                                               else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
                                                                                
-                                                                               if(distance < cutoff){
+                                                                               //if(distance < cutoff){
                                                                                        if (distance < smallDist) { smallDist = distance; }
                                                                                        
                                                                                        int row = nameMap->get(matrixNames[i]);
@@ -137,7 +145,7 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
                                                                                        
                                                                                        //if (rowSmallDists[row].dist > distance) {  rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
                                                                                        //if (rowSmallDists[col].dist > distance) {  rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
-                                                                               }
+                                                                               //}
                                                                                index++;
                                                                                reading->update(index);
                                                                }
@@ -162,9 +170,9 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
                                                                                if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
                                                                                
                                                                                if (distance == -1) { distance = 1000000; }
-                                                                               else if (globaldata->sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
+                                                                               else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
                                                                                
-                                                                               if(distance < cutoff && j < i){
+                                                                               if(j < i){
                                                                                        if (distance < smallDist) { smallDist = distance; }
                                                                                        
                                                                                        dMatrix[i][j] = distance;
@@ -185,9 +193,9 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
                                                                                if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
                                                                                
                                                                           if (distance == -1) { distance = 1000000; }
-                                                                               else if (globaldata->sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.                                                        
+                                                                               else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.                                                        
                                                                                
-                                                                               if(distance < cutoff && j < i){
+                                                                               if(j < i){
                                                                                        if (distance < smallDist) { smallDist = distance; }
                                                                                        
                                                                                        int row = nameMap->get(matrixNames[i]);
@@ -215,18 +223,219 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
                rabund = new RAbundVector(list->getRAbundVector());
                
                fileHandle.close();
+               
+               return 0;
        }
        catch(exception& e) {
                m->errorOut(e, "ClusterClassic", "readPhylipFile");
                exit(1);
        }
 
+}
+/***********************************************************************/
+int ClusterClassic::readPhylipFile(string filename, CountTable* countTable) {
+       try {
+               double distance;
+               int square;
+               string name;
+               vector<string> matrixNames;
+               
+               ifstream fileHandle;
+               m->openInputFile(filename, fileHandle);
+               
+        string numTest;
+               fileHandle >> numTest >> name;
+        
+        if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
+        else { convert(numTest, nseqs); }
+        
+        
+               matrixNames.push_back(name);
+        
+               if(countTable == NULL){
+            list = new ListVector(nseqs);
+            list->set(0, name);
+        }
+        else{  list = new ListVector(countTable->getListVector()); }
+
+               
+               //initialize distance matrix to cutoff
+               dMatrix.resize(nseqs);
+               //rowSmallDists.resize(nseqs, temp);
+               for (int i = 1; i < nseqs; i++) {                       
+                       dMatrix[i].resize(i, aboveCutoff);              
+               }                                                                                               
+        
+               
+               char d;
+               while((d=fileHandle.get()) != EOF){
+            
+            if(isalnum(d)){
+                square = 1;
+                fileHandle.putback(d);
+                for(int i=0;i<nseqs;i++){
+                    fileHandle >> distance;
+                }
+                break;
+            }
+            if(d == '\n'){
+                square = 0;
+                break;
+            }
+               }
+        
+               Progress* reading;
+        
+               if(square == 0){
+            
+            reading = new Progress("Reading matrix:     ", nseqs * (nseqs - 1) / 2);
+            
+            int        index = 0;
+            
+            for(int i=1;i<nseqs;i++){
+                if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
+                
+                fileHandle >> name;
+                matrixNames.push_back(name);
+                
+                
+                //there's A LOT of repeated code throughout this method...
+                 if(countTable == NULL){
+                    list->set(i, name);
+                    
+                    for(int j=0;j<i;j++){
+                        
+                        if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
+                        
+                        fileHandle >> distance;
+                        
+                        if (distance == -1) { distance = 1000000; }
+                        else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
+                        
+                        //if(distance < cutoff){
+                        dMatrix[i][j] = distance;
+                        if (distance < smallDist) { smallDist = distance; }
+                        //if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
+                        //if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
+                        //}
+                        index++;
+                        reading->update(index);
+                    }
+                    
+                }
+                else{
+                    for(int j=0;j<i;j++){
+                        fileHandle >> distance;
+                        
+                        if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
+                        
+                        if (distance == -1) { distance = 1000000; }
+                        else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
+                        
+                        if (distance < smallDist) { smallDist = distance; }
+                        
+                        int row = countTable->get(matrixNames[i]);
+                        int col = countTable->get(matrixNames[j]);
+                       
+                        if (row < col) {  dMatrix[col][row] = distance; }
+                        else { dMatrix[row][col] = distance; }
+                        
+                        index++;
+                        reading->update(index);
+                    }
+                }
+            }
+               }
+               else{
+            
+            reading = new Progress("Reading matrix:     ", nseqs * nseqs);
+            
+            int index = nseqs;
+            
+            for(int i=1;i<nseqs;i++){
+                fileHandle >> name;                
+                matrixNames.push_back(name);
+                
+                if(countTable == NULL){
+                    list->set(i, name);
+                    for(int j=0;j<nseqs;j++){
+                        fileHandle >> distance;
+                        
+                        if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
+                        
+                        if (distance == -1) { distance = 1000000; }
+                        else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
+                        
+                        if(j < i){
+                            if (distance < smallDist) { smallDist = distance; }
+                            
+                            dMatrix[i][j] = distance;
+                        }
+                        index++;
+                        reading->update(index);
+                    }
+                    
+                }
+                else{
+                    
+                    for(int j=0;j<nseqs;j++){
+                        fileHandle >> distance;
+                        
+                        if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
+                        
+                        if (distance == -1) { distance = 1000000; }
+                        else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.                                                        
+                        
+                        if(j < i){
+                            if (distance < smallDist) { smallDist = distance; }
+                            
+                            int row = countTable->get(matrixNames[i]);
+                            int col = countTable->get(matrixNames[j]);
+                            
+                            if (row < col) {  dMatrix[col][row] = distance; }
+                            else { dMatrix[row][col] = distance; }
+                        }
+                        index++;
+                        reading->update(index);
+                    }
+                }
+            }
+               }
+               
+               if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
+               
+               reading->finish();
+               delete reading;
+        
+               list->setLabel("0");
+               rabund = new RAbundVector();
+        rabund->setLabel(list->getLabel());  
+        
+        for(int i = 0; i < list->getNumBins(); i++) { 
+            if (m->control_pressed) { break; }
+            vector<string> binNames;
+            string bin = list->get(i);
+            m->splitAtComma(bin, binNames);
+            int total = 0;
+            for (int j = 0; j < binNames.size(); j++) { total += countTable->getNumSeqs(binNames[j]);  }
+            rabund->push_back(total);   
+        }
+               
+               fileHandle.close();
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClusterClassic", "readPhylipFile");
+               exit(1);
+       }
+    
 }
 /***********************************************************************/
 //sets smallCol and smallRow, returns distance
 double ClusterClassic::getSmallCell() {
        try {
-               
+                       
                smallDist = aboveCutoff;
                smallRow = 1;
                smallCol = 0;
@@ -279,6 +488,10 @@ void ClusterClassic::clusterBins(){
 
                rabund->set(smallRow, rabund->get(smallRow)+rabund->get(smallCol));     
                rabund->set(smallCol, 0);       
+               /*for (int i = smallCol+1; i < rabund->size(); i++) {
+                       rabund->set((i-1), rabund->get(i));
+               }
+               rabund->resize((rabund->size()-1));*/
                rabund->setLabel(toString(smallDist));
 
        //      cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
@@ -296,6 +509,10 @@ void ClusterClassic::clusterNames(){
                
                list->set(smallRow, list->get(smallRow)+','+list->get(smallCol));
                list->set(smallCol, "");        
+               /*for (int i = smallCol+1; i < list->size(); i++) {
+                       list->set((i-1), list->get(i));
+               }
+               list->resize((list->size()-1));*/
                list->setLabel(toString(smallDist));
        
        //      cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
@@ -308,103 +525,62 @@ void ClusterClassic::clusterNames(){
 /***********************************************************************/
 void ClusterClassic::update(double& cutOFF){
        try {
-//cout << "before update " << endl;
 //print();             
                getSmallCell();
                
                int r, c;
                r = smallRow; c = smallCol;
-               //because we only store lt, we need to make sure we grab the right location
-               //if (smallRow < smallCol) { c = smallRow; r = smallCol; } //smallRow is really our column value
-               //else { r = smallRow; c = smallCol; } //smallRow is the row value
-               
-               //reset rows smallest distance
-               //rowSmallDists[r].dist = aboveCutoff; rowSmallDists[r].row = 0; rowSmallDists[r].col = 0;
-               //rowSmallDists[c].dist = aboveCutoff; rowSmallDists[c].row = 0; rowSmallDists[c].col = 0;
-               
-               //if your rows smallest distance is from smallRow or smallCol, reset
-               //for(int i=0;i<nseqs;i++){
-                       //if ((rowSmallDists[i].row == r) || (rowSmallDists[i].row == c) || (rowSmallDists[i].col == r) || (rowSmallDists[i].col == c)) {
-                       //      rowSmallDists[i].dist = aboveCutoff; rowSmallDists[i].row = 0; rowSmallDists[i].col = 0;
-                       //}
-               //}
-               
+                               
                for(int i=0;i<nseqs;i++){
                        if(i != r && i != c){
                                double distRow, distCol, newDist;
                                if (i > r) { distRow = dMatrix[i][r]; }
                                else { distRow =  dMatrix[r][i]; }
-                               
+
                                if (i > c) { distCol = dMatrix[i][c]; dMatrix[i][c] = aboveCutoff; } //like removeCell
                                else { distCol =  dMatrix[c][i]; dMatrix[c][i] = aboveCutoff; }
-                                       
+                               
                                if(method == "furthest"){
                                        newDist = max(distRow, distCol);
                                }
                                else if (method == "average"){
-                                       if ((distRow == aboveCutoff) && (distCol == aboveCutoff)) { //you are merging with a value above cutoff
-                                               newDist = aboveCutoff; //eliminate value
-                                       }else if ((distRow == aboveCutoff) && (distCol != aboveCutoff)) { //you are merging with a value above cutoff
-                                               newDist = aboveCutoff; //eliminate value
-                                               if (cutOFF > distCol) { cutOFF = distCol; }
-                                       }else if ((distRow != aboveCutoff) && (distCol == aboveCutoff)) { //you are merging with a value above cutoff
-                                               newDist = aboveCutoff; //eliminate value
-                                               if (cutOFF > distRow) { cutOFF = distRow; }
-                                       }else {
-                                               int rowBin = rabund->get(r);
-                                               int colBin = rabund->get(c);
-                                               newDist = (colBin * distCol + rowBin * distRow) / (rowBin + colBin);
-                                       }
+                                       int rowBin = rabund->get(r);
+                                       int colBin = rabund->get(c);
+                                       newDist = (colBin * distCol + rowBin * distRow) / (rowBin + colBin);
                                }
                                else if (method == "weighted"){
-                                       if ((distRow == aboveCutoff) && (distCol == aboveCutoff)) { //you are merging with a value above cutoff
-                                               newDist = aboveCutoff; //eliminate value
-                                       }else if ((distRow == aboveCutoff) && (distCol != aboveCutoff)) { //you are merging with a value above cutoff
-                                               newDist = aboveCutoff; //eliminate value
-                                               if (cutOFF > distCol) { cutOFF = distCol; }
-                                       }else if ((distRow != aboveCutoff) && (distCol == aboveCutoff)) { //you are merging with a value above cutoff
-                                               newDist = aboveCutoff; //eliminate value
-                                               if (cutOFF > distRow) { cutOFF = distRow; }
-                                       }else {
-                                               newDist = (distCol + distRow) / 2.0;
-                                       }
+                                       newDist = (distCol + distRow) / 2.0;
                                }
                                else if (method == "nearest"){
                                        newDist = min(distRow, distCol);
                                }
-                               
+                               //cout << "newDist = " << newDist << endl;      
                                if (i > r) { dMatrix[i][r] = newDist; }
                                else { dMatrix[r][i] = newDist; }
                                
-                               //if (newDist < rowSmallDists[i].dist) {  rowSmallDists[i].dist = newDist; rowSmallDists[i].col = r; rowSmallDists[i].row = i;  }
                        }
-                       //cout << "rowsmall = " << i << '\t' << rowSmallDists[i].dist << endl;  
                }
                        
                clusterBins();
                clusterNames();
-       
-               //find new small for 2 rows we just merged
-               //colDist temp(0,0,100.0);
-               //rowSmallDists[r] = temp;
+               
+               //resize each row
+               /*for(int i=0;i<nseqs;i++){
+                       for(int j=c+1;j<dMatrix[i].size();j++){
+                               dMatrix[i][j-1]=dMatrix[i][j];
+                       }
+               }                       
+               
+               //resize each col
+               for(int i=c+1;i<nseqs;i++){
+                       for(int j=0;j < dMatrix[i-1].size();j++){
+                               dMatrix[i-1][j]=dMatrix[i][j];
+                       }
+               }       
+               
+               nseqs--;
+               dMatrix.pop_back();*/
 
-               //for (int i = 0; i < dMatrix[r].size(); i++) {
-               //      if (dMatrix[r][i] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[r][i]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; }
-               //}
-               //for (int i = dMatrix[r].size()+1; i < dMatrix.size(); i++) {
-               //      if (dMatrix[i][dMatrix[r].size()] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[i][dMatrix[r].size()]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; }
-               //}
-               
-               //rowSmallDists[c] = temp;
-               //for (int i = 0; i < dMatrix[c].size(); i++) {
-               //      if (dMatrix[c][i] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[c][i]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; }
-               //}
-               //for (int i = dMatrix[c].size()+1; i < dMatrix.size(); i++) {
-               //      if (dMatrix[i][dMatrix[c].size()] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[i][dMatrix[c].size()]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; }
-               //}
-               
-               //cout << "after update " << endl;
-               //print();
        }
        catch(exception& e) {
                m->errorOut(e, "ClusterClassic", "update");
@@ -421,16 +597,12 @@ void ClusterClassic::setMapWanted(bool f)  {
                        
                        //parse bin 
                        string names = list->get(i);
-                       while (names.find_first_of(',') != -1) { 
-                               //get name from bin
-                               string name = names.substr(0,names.find_first_of(','));
+                       vector<string> binnames;
+            m->splitAtComma(names, binnames);
+            for (int j = 0; j < binnames.size(); j++) {
                                //save name and bin number
-                               seq2Bin[name] = i;
-                               names = names.substr(names.find_first_of(',')+1, names.length());
+                               seq2Bin[binnames[j]] = i;
                        }
-                       
-                       //get last name
-                       seq2Bin[names] = i;
                }
                
        }
@@ -443,17 +615,13 @@ void ClusterClassic::setMapWanted(bool f)  {
 void ClusterClassic::updateMap() {
 try {
                //update location of seqs in smallRow since they move to smallCol now
-               string names = list->get(smallRow);
-               while (names.find_first_of(',') != -1) { 
-                       //get name from bin
-                       string name = names.substr(0,names.find_first_of(','));
-                       //save name and bin number
-                       seq2Bin[name] = smallCol;
-                       names = names.substr(names.find_first_of(',')+1, names.length());
-               }
-                       
-               //get last name
-               seq2Bin[names] = smallCol;
+        string names = list->get(smallRow);
+        vector<string> binnames;
+        m->splitAtComma(names, binnames);
+        for (int j = 0; j < binnames.size(); j++) {
+            //save name and bin number
+            seq2Bin[binnames[j]] = smallCol;
+        }
                
        }
        catch(exception& e) {
@@ -466,11 +634,11 @@ void ClusterClassic::print() {
 try {
                //update location of seqs in smallRow since they move to smallCol now
                for (int i = 0; i < dMatrix.size(); i++) {
-                       cout << "row = " << i << '\t';
+                       m->mothurOut("row = " + toString(i) + "\t");
                        for (int j = 0; j < dMatrix[i].size(); j++) {
-                               cout << dMatrix[i][j] << '\t';
+                               m->mothurOut(toString(dMatrix[i][j]) + "\t");
                        }
-                       cout << endl;
+                       m->mothurOutEndLine();
                }
        }
        catch(exception& e) {