]> git.donarmstrong.com Git - mothur.git/blobdiff - hcluster.cpp
changes while testing
[mothur.git] / hcluster.cpp
index 259036f0f02a42beb95926f9ce59a4140164840e..f8f48095b2ec55a8dce91d55881271f2f1837218 100644 (file)
 #include "hcluster.h"
 #include "rabundvector.hpp"
 #include "listvector.hpp"
-#include "sparsematrix.hpp"
 
 /***********************************************************************/
-HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(m), distfile(d), nameMap(n), cutoff(c) {
+HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) {
        try {
+               m = MothurOut::getInstance();
                mapWanted = false;
                exitedBreak = false; 
                numSeqs = list->getNumSeqs();
@@ -25,14 +25,14 @@ HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAs
                        clusterArray.push_back(temp);
                }
                
-               if (method != "average") {
-                       openInputFile(distfile, filehandle);
+               if ((method == "furthest") || (method == "nearest")) {
+                       m->openInputFile(distfile, filehandle);
                }else{  
                        processFile();  
                }
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "HCluster");
+               m->errorOut(e, "HCluster", "HCluster");
                exit(1);
        }
 }
@@ -49,7 +49,7 @@ void HCluster::clusterBins(){
                //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "clusterBins");
+               m->errorOut(e, "HCluster", "clusterBins");
                exit(1);
        }
 
@@ -71,7 +71,7 @@ void HCluster::clusterNames(){
 
     }
        catch(exception& e) {
-               errorOut(e, "HCluster", "clusterNames");
+               m->errorOut(e, "HCluster", "clusterNames");
                exit(1);
        }
 
@@ -86,7 +86,7 @@ int HCluster::getUpmostParent(int node){
                return node;
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "getUpmostParent");
+               m->errorOut(e, "HCluster", "getUpmostParent");
                exit(1);
        }
 }
@@ -116,7 +116,7 @@ void HCluster::printInfo(){
                
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "getUpmostParent");
+               m->errorOut(e, "HCluster", "getUpmostParent");
                exit(1);
        }
 }
@@ -184,7 +184,7 @@ int HCluster::makeActive() {
                return linkValue;
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "makeActive");
+               m->errorOut(e, "HCluster", "makeActive");
                exit(1);
        }
 }
@@ -253,12 +253,12 @@ void HCluster::updateArrayandLinkTable() {
                }
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "updateArrayandLinkTable");
+               m->errorOut(e, "HCluster", "updateArrayandLinkTable");
                exit(1);
        }
 }
 /***********************************************************************/
-bool HCluster::update(int row, int col, float distance){
+double HCluster::update(int row, int col, float distance){
        try {
                bool cluster = false;
                smallRow = row;
@@ -272,7 +272,7 @@ bool HCluster::update(int row, int col, float distance){
                //you don't want to cluster with yourself
                if (smallRow != smallCol) {
                        
-                       if (method != "average") {
+                       if ((method == "furthest") || (method == "nearest")) {
                                //can we cluster???
                                if (method == "nearest") { cluster = true;  }
                                else{ //assume furthest
@@ -295,18 +295,18 @@ bool HCluster::update(int row, int col, float distance){
                        }
                }
                
-               return cluster;
+               return cutoff;
                //printInfo();
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "update");
+               m->errorOut(e, "HCluster", "update");
                exit(1);
        }
 }
 /***********************************************************************/
-void HCluster::setMapWanted(bool m)  {  
+void HCluster::setMapWanted(bool ms)  {  
        try {
-               mapWanted = m;
+               mapWanted = ms;
                
                //initialize map
                for (int i = 0; i < list->getNumBins(); i++) {
@@ -327,7 +327,7 @@ void HCluster::setMapWanted(bool m)  {
                
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "setMapWanted");
+               m->errorOut(e, "HCluster", "setMapWanted");
                exit(1);
        }
 }
@@ -348,7 +348,7 @@ try {
                seq2Bin[names] = clusterArray[smallCol].smallChild;
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "updateMap");
+               m->errorOut(e, "HCluster", "updateMap");
                exit(1);
        }
 }
@@ -357,7 +357,7 @@ vector<seqDist> HCluster::getSeqs(){
        try {
                vector<seqDist> sameSeqs;
                
-               if(method != "average") {
+               if ((method == "furthest") || (method == "nearest")) {
                        sameSeqs = getSeqsFNNN();
                }else{
                        sameSeqs = getSeqsAN(); 
@@ -366,7 +366,7 @@ vector<seqDist> HCluster::getSeqs(){
                return sameSeqs;
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "getSeqs");
+               m->errorOut(e, "HCluster", "getSeqs");
                exit(1);
        }
 }
@@ -388,15 +388,15 @@ vector<seqDist> HCluster::getSeqsFNNN(){
                //get entry
                while (!filehandle.eof()) {
                        
-                       filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
+                       filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle); 
        
                        //save first one
                        if (prevDistance == -1) { prevDistance = distance; }
                        
                        map<string,int>::iterator itA = nameMap->find(firstName);
                        map<string,int>::iterator itB = nameMap->find(secondName);
-                       if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
-                       if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+                       if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+                       if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
                
                        //using cutoff
                        if (distance > cutoff) { break; }
@@ -424,12 +424,11 @@ vector<seqDist> HCluster::getSeqsFNNN(){
                return sameSeqs;
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "getSeqsFNNN");
+               m->errorOut(e, "HCluster", "getSeqsFNNN");
                exit(1);
        }
 }
 //**********************************************************************************************************************
-//don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
 vector<seqDist> HCluster::getSeqsAN(){
        try {
                int firstName, secondName;
@@ -437,7 +436,7 @@ vector<seqDist> HCluster::getSeqsAN(){
                vector<seqDist> sameSeqs;
                prevDistance = -1;
                
-               openInputFile(distfile, filehandle, "no error"); 
+               m->openInputFile(distfile, filehandle, "no error"); 
                
                //is the smallest value in mergedMin or the distfile?
                float mergedMinDist = 10000;
@@ -445,13 +444,13 @@ vector<seqDist> HCluster::getSeqsAN(){
                if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist;  }
                        
                if (!filehandle.eof()) {  
-                       filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
+                       filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle);
                        //save first one
                        if (prevDistance == -1) { prevDistance = distance; } 
                        if (distance != -1) { //-1 means skip me
                                seqDist temp(firstName, secondName, distance);
                                sameSeqs.push_back(temp);
-                       }
+                       }else{ distance = 10000; }
                }
                
                if (mergedMinDist < distance) { //get minimum distance from mergedMin
@@ -468,7 +467,7 @@ vector<seqDist> HCluster::getSeqsAN(){
                        //get entry
                        while (!filehandle.eof()) {
                                
-                               filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
+                               filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle); 
                                
                                if (prevDistance == -1) { prevDistance = distance; }
                                
@@ -495,26 +494,29 @@ vector<seqDist> HCluster::getSeqsAN(){
                return temp;
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "getSeqsAN");
+               m->errorOut(e, "HCluster", "getSeqsAN");
                exit(1);
        }
 }
 
 /***********************************************************************/
-void HCluster::combineFile() {
+int HCluster::combineFile() {
        try {
-               int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
-               char* inputBuffer;
-               inputBuffer = new char[bufferSize];
-               size_t numRead;
+               //int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
+               //char* inputBuffer;
+               //inputBuffer = new char[bufferSize];
+               //size_t numRead;
                
                string tempDistFile = distfile + ".temp";
                ofstream out;
-               openOutputFile(tempDistFile, out);
+               m->openOutputFile(tempDistFile, out);
                
-               FILE* in;
-               in = fopen(distfile.c_str(), "rb");
+               //FILE* in;
+               //in = fopen(distfile.c_str(), "rb");
        
+               ifstream in;
+               m->openInputFile(distfile, in, "no error");
+               
                int first, second;
                float dist;
                
@@ -524,28 +526,32 @@ void HCluster::combineFile() {
                                
                //go through file pulling out distances related to rows merging
                //if mergedMin contains distances add those back into file
-               bool done = false;
-               partialDist = "";
-               while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
+               //bool done = false;
+               //partialDist = "";
+               //while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
 //cout << "number of char read = " << numRead << endl;
 //cout << inputBuffer << endl;
-                       if (numRead < bufferSize) { done = true; }
+                       //if (numRead < bufferSize) { done = true; }
                        
                        //parse input into individual distances
-                       int spot = 0;
-                       string outputString = "";
-                       while(spot < numRead) {
+                       //int spot = 0;
+                       //string outputString = "";
+                       //while(spot < numRead) {
        //cout << "spot = " << spot << endl;
-                          seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
+                        //  seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
                           
                           //you read a partial distance
-                          if (nextDist.seq1 == -1) { break;  }
-                          
-                          first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
+                         // if (nextDist.seq1 == -1) { break;  }
+                       while (!in.eof()) {
+                               //first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
        //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;                   
                           //since file is sorted and mergedMin is sorted 
                           //you can put the smallest distance from each through the code below and keep the file sorted
                           
+                          in >> first >> second >> dist; m->gobble(in);
+                          
+                          if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempDistFile); return 0; }
+                          
                           //while there are still values in mergedMin that are smaller than the distance read from file
                           while (count < mergedMin.size())  {
                           
@@ -563,7 +569,10 @@ void HCluster::combineFile() {
                                                }else if (mergedMin[count].seq2 == smallRow) {
                                                        smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
                                                }else { //if no, write to temp file
-                                                       outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
+                                                       //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
+                                                       //if (mergedMin[count].dist < cutoff) { 
+                                                               out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+                                                       //}
                                                }
                                                count++;
                                        }else{   break; }
@@ -582,14 +591,18 @@ void HCluster::combineFile() {
                                        smallRowColValues[0][first] = dist;
                           
                           }else { //if no, write to temp file
-                                       outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
+                                       //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
+                                  //if (dist < cutoff) {
+                                          out << first << '\t' << second << '\t' << dist << endl;
+                                  //}
                           }
                        }
                        
-                       out << outputString;
-                       if(done) { break; }
-               }
-               fclose(in);
+                       //out << outputString;
+                       //if(done) { break; }
+               //}
+               //fclose(in);
+               in.close();
                
                //if values in mergedMin are larger than the the largest in file then
                while (count < mergedMin.size())  {  
@@ -606,7 +619,9 @@ void HCluster::combineFile() {
                                smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
                                
                        }else { //if no, write to temp file
-                               out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+                               //if (mergedMin[count].dist < cutoff) {
+                                       out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+                               //}
                        }
                        count++;
                }
@@ -614,9 +629,10 @@ void HCluster::combineFile() {
                mergedMin.clear();
                        
                //rename tempfile to distfile
-               remove(distfile.c_str());
+               m->mothurRemove(distfile);
                rename(tempDistFile.c_str(), distfile.c_str());
-               
+//cout << "remove = "<< renameOK << " rename = " << ok << endl;        
+
                //merge clustered rows averaging the distances
                map<int, float>::iterator itMerge;
                map<int, float>::iterator it2Merge;
@@ -626,25 +642,40 @@ void HCluster::combineFile() {
                        
                        float average;
                        if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
-                               //weighted average
-                               int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
-                               average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
+                               //average
+                               if (method == "average") {
+                                       int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
+                                       average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
+                               }else { //weighted
+                                       average = ((itMerge->second * 1.0) + (it2Merge->second * 1.0)) / (float) 2.0;                           
+                               }
+                               
                                smallRowColValues[1].erase(it2Merge);
                                
                                seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
                                mergedMin.push_back(temp);
+                       }else {  
+                               //can't find value so update cutoff
+                               if (cutoff > itMerge->second) { cutoff = itMerge->second; }
                        }
                }
-
+               
+               //update cutoff
+               for(itMerge = smallRowColValues[1].begin(); itMerge != smallRowColValues[1].end(); itMerge++) { 
+                       if (cutoff > itMerge->second) { cutoff = itMerge->second; }
+               }
+               
                //sort merged values
                sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);      
+               
+               return 0;
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "combineFile");
+               m->errorOut(e, "HCluster", "combineFile");
                exit(1);
        }
 }
-/***********************************************************************/
+/***********************************************************************
 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
        try {
                seqDist next;
@@ -674,7 +705,7 @@ seqDist HCluster::getNextDist(char* buffer, int& index, int size){
                        if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
                                gotDist = true;
                                
-                               //gobble space
+                               //m->gobble space
                                while (index < size) {          
                                        if (isspace(buffer[index])) { index++; }
                                        else { break; }         
@@ -716,32 +747,33 @@ seqDist HCluster::getNextDist(char* buffer, int& index, int size){
                return next;
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "getNextDist");
+               m->errorOut(e, "HCluster", "getNextDist");
                exit(1);
        }
 }
-/***********************************************************************/
-void HCluster::processFile() {
+***********************************************************************/
+int HCluster::processFile() {
        try {
                string firstName, secondName;
                float distance;
                
                ifstream in;
-               openInputFile(distfile, in);
+               m->openInputFile(distfile, in, "no error");
                
                ofstream out;
                string outTemp = distfile + ".temp";
-               openOutputFile(outTemp, out);
+               m->openOutputFile(outTemp, out);
        
                //get entry
                while (!in.eof()) {
+                       if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outTemp); return 0; }
                        
-                       in >> firstName >> secondName >> distance;    gobble(in);               
+                       in >> firstName >> secondName >> distance;    m->gobble(in);            
                        
                        map<string,int>::iterator itA = nameMap->find(firstName);
                        map<string,int>::iterator itB = nameMap->find(secondName);
-                       if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
-                       if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+                       if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+                       if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
                
                        //using cutoff
                        if (distance > cutoff) { break; }
@@ -754,11 +786,13 @@ void HCluster::processFile() {
                in.close();
                out.close();
                
-               remove(distfile.c_str());
+               m->mothurRemove(distfile);
                rename(outTemp.c_str(), distfile.c_str());
+               
+               return 0;
        }
        catch(exception& e) {
-               errorOut(e, "HCluster", "processFile");
+               m->errorOut(e, "HCluster", "processFile");
                exit(1);
        }
 }