changed groupfile in classify.seqs to reflect multiple fasta files

[mothur.git] / hcluster.cpp
diff --git a/hcluster.cpp b/hcluster.cpp

index 4e83777751fa50686903758f41dd61fcdd53c52f..9466483c779ce0b32698c8469c6224ea3c63b724 100644 (file)
--- a/hcluster.cpp
+++ b/hcluster.cpp
@@ -13,10 +13,11 @@
  #include "sparsematrix.hpp"
  
  /***********************************************************************/
  #include "sparsematrix.hpp"
  
  /***********************************************************************/
-
-HCluster::HCluster(RAbundVector* rav, ListVector* lv) :  rabund(rav), list(lv){
+HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) {
         try {
         try {
-       
+               m = MothurOut::getInstance();
+               mapWanted = false;
+               exitedBreak = false; 
                 numSeqs = list->getNumSeqs();
                 
                 //initialize cluster array
                 numSeqs = list->getNumSeqs();
                 
                 //initialize cluster array
@@ -25,9 +26,14 @@ HCluster::HCluster(RAbundVector* rav, ListVector* lv) :  rabund(rav), list(lv){
                         clusterArray.push_back(temp);
                 }
                 
                         clusterArray.push_back(temp);
                 }
                 
+               if (method != "average") {
+                       openInputFile(distfile, filehandle);
+               }else{  
+                       processFile();  
+               }
         }
         catch(exception& e) {
         }
         catch(exception& e) {
-               errorOut(e, "HCluster", "HCluster");
+               m->errorOut(e, "HCluster", "HCluster");
                 exit(1);
         }
  }
                 exit(1);
         }
  }
@@ -44,7 +50,7 @@ void HCluster::clusterBins(){
                 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
         }
         catch(exception& e) {
                 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
         }
         catch(exception& e) {
-               errorOut(e, "HCluster", "clusterBins");
+               m->errorOut(e, "HCluster", "clusterBins");
                 exit(1);
         }
  
                 exit(1);
         }
  
@@ -56,7 +62,8 @@ void HCluster::clusterBins(){
  void HCluster::clusterNames(){
         try {
                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
  void HCluster::clusterNames(){
         try {
                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
-
+               if (mapWanted) {  updateMap();  }
+               
                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
                 list->set(clusterArray[smallRow].smallChild, "");       
                 list->setLabel(toString(smallDist));
                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
                 list->set(clusterArray[smallRow].smallChild, "");       
                 list->setLabel(toString(smallDist));
@@ -65,7 +72,7 @@ void HCluster::clusterNames(){
  
      }
         catch(exception& e) {
  
      }
         catch(exception& e) {
-               errorOut(e, "HCluster", "clusterNames");
+               m->errorOut(e, "HCluster", "clusterNames");
                 exit(1);
         }
  
                 exit(1);
         }
  
@@ -73,7 +80,6 @@ void HCluster::clusterNames(){
  /***********************************************************************/
  int HCluster::getUpmostParent(int node){
         try {
  /***********************************************************************/
  int HCluster::getUpmostParent(int node){
         try {
-               
                 while (clusterArray[node].parent != -1) {
                         node = clusterArray[node].parent;
                 }
                 while (clusterArray[node].parent != -1) {
                         node = clusterArray[node].parent;
                 }
@@ -81,7 +87,7 @@ int HCluster::getUpmostParent(int node){
                 return node;
         }
         catch(exception& e) {
                 return node;
         }
         catch(exception& e) {
-               errorOut(e, "HCluster", "getUpmostParent");
+               m->errorOut(e, "HCluster", "getUpmostParent");
                 exit(1);
         }
  }
                 exit(1);
         }
  }
@@ -90,14 +96,14 @@ void HCluster::printInfo(){
         try {
                 
                 cout << "link table" << endl;
         try {
                 
                 cout << "link table" << endl;
-               for (it = activeLinks.begin(); it!= activeLinks.end(); it++) {
-                       cout << it->first << " = " << it->second << endl;
+               for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
+                       cout << itActive->first << " = " << itActive->second << endl;
                 }
                 cout << endl;
                 for (int i = 0; i < linkTable.size(); i++) {
                         cout << i << '\t';
                         for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
                 }
                 cout << endl;
                 for (int i = 0; i < linkTable.size(); i++) {
                         cout << i << '\t';
                         for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
-                               cout << it->first << '-' << it->second << '\t';
+                               cout << it->first << '-' << it->second << '\t' ;
                         }
                         cout << endl;
                 }
                         }
                         cout << endl;
                 }
@@ -111,98 +117,98 @@ void HCluster::printInfo(){
                 
         }
         catch(exception& e) {
                 
         }
         catch(exception& e) {
-               errorOut(e, "HCluster", "getUpmostParent");
+               m->errorOut(e, "HCluster", "getUpmostParent");
                 exit(1);
         }
  }
  /***********************************************************************/
  int HCluster::makeActive() {
         try {
                 exit(1);
         }
  }
  /***********************************************************************/
  int HCluster::makeActive() {
         try {
-       
                 int linkValue = 1; 
                 int linkValue = 1; 
-//cout << "active - here" << endl;             
-               it = activeLinks.find(smallRow);
-               it2 = activeLinks.find(smallCol);
+
+               itActive = activeLinks.find(smallRow);
+               it2Active = activeLinks.find(smallCol);
                 
                 
-               if ((it == activeLinks.end()) && (it2 == activeLinks.end())) { //both are not active so add them
+               if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
                         int size = linkTable.size();
                         map<int, int> temp; map<int, int> temp2;
                         
                         //add link to eachother
                         int size = linkTable.size();
                         map<int, int> temp; map<int, int> temp2;
                         
                         //add link to eachother
-                       temp[smallRow] = 1;                                                     //         1    2
-                       temp2[smallCol] = 1;                                            // 1   0        1
-                                                                                                               // 2   1        0
+                       temp[smallRow] = 1;                                                     //         1    2                                                               
+                       temp2[smallCol] = 1;                                            // 1   0        1 
+                                                                                                               // 2   1        0                               
                         linkTable.push_back(temp);
                         linkTable.push_back(temp2);
                         
                         //add to activeLinks
                         activeLinks[smallRow] = size;
                         activeLinks[smallCol] = size+1;
                         linkTable.push_back(temp);
                         linkTable.push_back(temp2);
                         
                         //add to activeLinks
                         activeLinks[smallRow] = size;
                         activeLinks[smallCol] = size+1;
-//cout << "active - here1" << endl;
-               }else if ((it != activeLinks.end()) && (it2 == activeLinks.end())) {  //smallRow is active, smallCol is not
+
+               }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) {  //smallRow is active, smallCol is not
                          int size = linkTable.size();
                          int size = linkTable.size();
-                        int alreadyActiveRow = it->second;
+                        int alreadyActiveRow = itActive->second;
                          map<int, int> temp; 
                         
                         //add link to eachother
                          map<int, int> temp; 
                         
                         //add link to eachother
-                       temp[smallRow] = 1;                                                     //         6    2       3       5
-                       linkTable.push_back(temp);                                      // 6   0        1       2       0
-                       linkTable[alreadyActiveRow][smallCol] = 1;      // 2   1        0       1       1
-                                                                                                               // 3   2        1       0       0
-                                                                                                               // 5   0    1   0   0   
+                       temp[smallRow] = 1;                                                                     //         6    2       3       5
+                       linkTable.push_back(temp);                                                      // 6   0        1       2       0
+                       linkTable[alreadyActiveRow][smallCol] = 1;                      // 2   1        0       1       1
+                                                                                                                               // 3   2        1       0       0
+                                                                                                                               // 5   0    1   0   0   
                         //add to activeLinks
                         activeLinks[smallCol] = size;
                         //add to activeLinks
                         activeLinks[smallCol] = size;
-//cout << "active - here2" << endl;                    
-               }else if ((it == activeLinks.end()) && (it2 != activeLinks.end())) {  //smallCol is active, smallRow is not
+       
+               }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) {  //smallCol is active, smallRow is not
                          int size = linkTable.size();
                          int size = linkTable.size();
-                        int alreadyActiveCol = it2->second;
+                        int alreadyActiveCol = it2Active->second;
                          map<int, int> temp; 
                         
                         //add link to eachother
                          map<int, int> temp; 
                         
                         //add link to eachother
-                       temp[smallCol] = 1;                                                     //         6    2       3       5
-                       linkTable.push_back(temp);                                      // 6   0        1       2       0
-                       linkTable[alreadyActiveCol][smallRow] = 1;      // 2   1        0       1       1
-                                                                                                               // 3   2        1       0       0
-                                                                                                               // 5   0    1   0   0   
+                       temp[smallCol] = 1;                                                                     //         6    2       3       5
+                       linkTable.push_back(temp);                                                      // 6   0        1       2       0
+                       linkTable[alreadyActiveCol][smallRow] = 1;                      // 2   1        0       1       1
+                                                                                                                               // 3   2        1       0       0
+                                                                                                                               // 5   0    1   0   0   
                         //add to activeLinks
                         activeLinks[smallRow] = size;
                         //add to activeLinks
                         activeLinks[smallRow] = size;
-//cout << "active - here3" << endl;
+
                 }else { //both are active so add one
                 }else { //both are active so add one
-                       int row = it->second;
-                       int col = it2->second;
-//cout << row << '\t' << col << endl;                  
+                       int row = itActive->second;
+                       int col = it2Active->second;
+                       
                         
                         linkTable[row][smallCol]++;
                         linkTable[col][smallRow]++;
                         linkValue = linkTable[row][smallCol];
                         
                         linkTable[row][smallCol]++;
                         linkTable[col][smallRow]++;
                         linkValue = linkTable[row][smallCol];
-//cout << "active - here4" << endl;
                 }
                 
                 return linkValue;
         }
         catch(exception& e) {
                 }
                 
                 return linkValue;
         }
         catch(exception& e) {
-               errorOut(e, "HCluster", "makeActive");
+               m->errorOut(e, "HCluster", "makeActive");
                 exit(1);
         }
  }
  /***********************************************************************/
  void HCluster::updateArrayandLinkTable() {
         try {
                 exit(1);
         }
  }
  /***********************************************************************/
  void HCluster::updateArrayandLinkTable() {
         try {
-               //if cluster was made update clusterArray and linkTable
-                       int size = clusterArray.size();
-                       
-                       //add new node
-                       clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
-                       clusterArray.push_back(temp);
-                       
-                       //update child nodes
-                       clusterArray[smallRow].parent = size;
-                       clusterArray[smallCol].parent = size;
+               //if cluster was made update clusterArray and linkTable
+               int size = clusterArray.size();
+               
+               //add new node
+               clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
+               clusterArray.push_back(temp);
+               
+               //update child nodes
+               clusterArray[smallRow].parent = size;
+               clusterArray[smallCol].parent = size;
+               
+               if (method == "furthest") {
                         
                         //update linkTable by merging clustered rows and columns
                         int rowSpot = activeLinks[smallRow];
                         int colSpot = activeLinks[smallCol];
                         
                         //update linkTable by merging clustered rows and columns
                         int rowSpot = activeLinks[smallRow];
                         int colSpot = activeLinks[smallCol];
-       //cout << "here" << endl;               
+                       
                         //fix old rows
                         for (int i = 0; i < linkTable.size(); i++) {
                                 //check if they are in map
                         //fix old rows
                         for (int i = 0; i < linkTable.size(); i++) {
                                 //check if they are in map
@@ -221,8 +227,7 @@ void HCluster::updateArrayandLinkTable() {
                                         linkTable[i].erase(smallRow); //delete col 
                                 }
                         }
                                         linkTable[i].erase(smallRow); //delete col 
                                 }
                         }
-       //printInfo();
-//cout << "here2" << endl;
+                       
                         //merge their values
                         for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
                                 it2 = linkTable[colSpot].find(it->first);  //does the col also have this
                         //merge their values
                         for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
                                 it2 = linkTable[colSpot].find(it->first);  //does the col also have this
@@ -230,71 +235,556 @@ void HCluster::updateArrayandLinkTable() {
                                 if (it2 == linkTable[colSpot].end()) { //not there so add it
                                         linkTable[colSpot][it->first] = it->second;
                                 }else { //merge them
                                 if (it2 == linkTable[colSpot].end()) { //not there so add it
                                         linkTable[colSpot][it->first] = it->second;
                                 }else { //merge them
-                                       linkTable[colSpot][it->first] = it->second+it2->second;
+                                       linkTable[colSpot][it->first] = it->second + it2->second;
                                 }
                         }
                                 }
                         }
-//cout << "here3" << endl;                     
+                       
                         linkTable[colSpot].erase(size);
                         linkTable[colSpot].erase(size);
-                       linkTable.erase(linkTable.begin()+rowSpot);  //delete col
-       //printInfo();          
+                       linkTable.erase(linkTable.begin()+rowSpot);  //delete row
+                       
                         //update activerows
                         activeLinks.erase(smallRow);
                         activeLinks.erase(smallCol);
                         //update activerows
                         activeLinks.erase(smallRow);
                         activeLinks.erase(smallCol);
-                       
-                       if(rowSpot>colSpot) {   activeLinks[size] = colSpot;    }
-                       else{   activeLinks[size] = colSpot;  }
+                       activeLinks[size] = colSpot;
                         
                         //adjust everybody elses spot since you deleted - time vs. space
                         
                         //adjust everybody elses spot since you deleted - time vs. space
-                       for (it = activeLinks.begin(); it != activeLinks.end(); it++) {
-                               if (it->second > rowSpot) {  activeLinks[it->first]--;  }
+                       for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
+                               if (itActive->second > rowSpot) {  activeLinks[itActive->first]--;      }
                         }
                         }
-                       
-//cout << "here4" << endl;
-       
+               }
         }
         catch(exception& e) {
         }
         catch(exception& e) {
-               errorOut(e, "HCluster", "updateArrayandLinkTable");
+               m->errorOut(e, "HCluster", "updateArrayandLinkTable");
                 exit(1);
         }
  }
  /***********************************************************************/
  bool HCluster::update(int row, int col, float distance){
         try {
                 exit(1);
         }
  }
  /***********************************************************************/
  bool HCluster::update(int row, int col, float distance){
         try {
-               
+               bool cluster = false;
                 smallRow = row;
                 smallCol = col;
                 smallDist = distance;
                 smallRow = row;
                 smallCol = col;
                 smallDist = distance;
-               bool clustered = false;
                 
                 //find upmost parent of row and col
                 smallRow = getUpmostParent(smallRow);
                 smallCol = getUpmostParent(smallCol);
                 
                 //find upmost parent of row and col
                 smallRow = getUpmostParent(smallRow);
                 smallCol = getUpmostParent(smallCol);
-       //cout << "row = " << row << " smallRow = " << smallRow <<  " col = " << col << " smallCol = " << smallCol << " dist = " << distance << endl;
-               
-               //are they active in the link table
-               int linkValue = makeActive(); //after this point this nodes info is active in linkTable
-       //printInfo();                  
-//cout << "linkValue = " << linkValue << " times = " << (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq) << endl;
-               //can we cluster???
-               if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {
-                       //printInfo();
-                       updateArrayandLinkTable();
-                       clusterBins();
-                       clusterNames();
-                       clustered = true;
-                       //printInfo();
+
+               //you don't want to cluster with yourself
+               if (smallRow != smallCol) {
+                       
+                       if (method != "average") {
+                               //can we cluster???
+                               if (method == "nearest") { cluster = true;  }
+                               else{ //assume furthest
+                                       //are they active in the link table
+                                       int linkValue = makeActive(); //after this point this nodes info is active in linkTable
+                                       if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {             cluster = true;         }
+                               }
+                               
+                               if (cluster) { 
+                                       updateArrayandLinkTable();
+                                       clusterBins();
+                                       clusterNames();
+                               }
+                       }else {
+                               cluster = true;
+                               updateArrayandLinkTable();
+                               clusterBins();
+                               clusterNames();
+                               combineFile();
+                       }
                 }
                 
                 }
                 
+               return cluster;
                 //printInfo();
                 //printInfo();
-               return clustered;
         }
         catch(exception& e) {
         }
         catch(exception& e) {
-               errorOut(e, "HCluster", "update");
+               m->errorOut(e, "HCluster", "update");
                 exit(1);
         }
  }
                 exit(1);
         }
  }
+/***********************************************************************/
+void HCluster::setMapWanted(bool ms)  {  
+       try {
+               mapWanted = ms;
+               
+               //initialize map
+               for (int i = 0; i < list->getNumBins(); i++) {
+                       
+                       //parse bin 
+                       string names = list->get(i);
+                       while (names.find_first_of(',') != -1) { 
+                               //get name from bin
+                               string name = names.substr(0,names.find_first_of(','));
+                               //save name and bin number
+                               seq2Bin[name] = i;
+                               names = names.substr(names.find_first_of(',')+1, names.length());
+                       }
+                       
+                       //get last name
+                       seq2Bin[names] = i;
+               }
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "HCluster", "setMapWanted");
+               exit(1);
+       }
+}
+/***********************************************************************/
+void HCluster::updateMap() {
+try {
+               //update location of seqs in smallRow since they move to smallCol now
+               string names = list->get(clusterArray[smallRow].smallChild);
+               while (names.find_first_of(',') != -1) { 
+                       //get name from bin
+                       string name = names.substr(0,names.find_first_of(','));
+                       //save name and bin number
+                       seq2Bin[name] = clusterArray[smallCol].smallChild;
+                       names = names.substr(names.find_first_of(',')+1, names.length());
+               }
+                       
+               //get last name
+               seq2Bin[names] = clusterArray[smallCol].smallChild;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "HCluster", "updateMap");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+vector<seqDist> HCluster::getSeqs(){
+       try {
+               vector<seqDist> sameSeqs;
+               
+               if(method != "average") {
+                       sameSeqs = getSeqsFNNN();
+               }else{
+                       sameSeqs = getSeqsAN(); 
+               }
+                               
+               return sameSeqs;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "HCluster", "getSeqs");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+vector<seqDist> HCluster::getSeqsFNNN(){
+       try {
+               string firstName, secondName;
+               float distance, prevDistance;
+               vector<seqDist> sameSeqs;
+               prevDistance = -1;
+       
+               //if you are not at the beginning of the file
+               if (exitedBreak) { 
+                       sameSeqs.push_back(next);
+                       prevDistance = next.dist;
+                       exitedBreak = false;
+               }
+       
+               //get entry
+               while (!filehandle.eof()) {
+                       
+                       filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
+       
+                       //save first one
+                       if (prevDistance == -1) { prevDistance = distance; }
+                       
+                       map<string,int>::iterator itA = nameMap->find(firstName);
+                       map<string,int>::iterator itB = nameMap->find(secondName);
+                       if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
+                       if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+               
+                       //using cutoff
+                       if (distance > cutoff) { break; }
+               
+                       if (distance != -1) { //-1 means skip me
+                       
+                               //are the distances the same
+                               if (distance == prevDistance) { //save in vector
+                                       seqDist temp(itA->second, itB->second, distance);
+                                       sameSeqs.push_back(temp);
+                                       exitedBreak = false;
+                               }else{ 
+                                       next.seq1 = itA->second;
+                                       next.seq2 = itB->second;
+                                       next.dist = distance;
+                                       exitedBreak = true;
+                                       break;
+                               }
+                       }
+               }
+               
+               //rndomize matching dists
+               random_shuffle(sameSeqs.begin(), sameSeqs.end());
+               
+               return sameSeqs;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "HCluster", "getSeqsFNNN");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+//don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
+vector<seqDist> HCluster::getSeqsAN(){
+       try {
+               int firstName, secondName;
+               float prevDistance;
+               vector<seqDist> sameSeqs;
+               prevDistance = -1;
+               
+               openInputFile(distfile, filehandle, "no error"); 
+               
+               //is the smallest value in mergedMin or the distfile?
+               float mergedMinDist = 10000;
+               float distance = 10000;
+               if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist;  }
+                       
+               if (!filehandle.eof()) {  
+                       filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
+                       //save first one
+                       if (prevDistance == -1) { prevDistance = distance; } 
+                       if (distance != -1) { //-1 means skip me
+                               seqDist temp(firstName, secondName, distance);
+                               sameSeqs.push_back(temp);
+                       }else{ distance = 10000; }
+               }
+               
+               if (mergedMinDist < distance) { //get minimum distance from mergedMin
+                       //remove distance we saved from file
+                       sameSeqs.clear();
+                       prevDistance = mergedMinDist;
+                       
+                       for (int i = 0; i < mergedMin.size(); i++) {
+                               if (mergedMin[i].dist == prevDistance) {
+                                       sameSeqs.push_back(mergedMin[i]);
+                               }else { break; }
+                       }
+               }else{ //get minimum from file
+                       //get entry
+                       while (!filehandle.eof()) {
+                               
+                               filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
+                               
+                               if (prevDistance == -1) { prevDistance = distance; }
+                               
+                               if (distance != -1) { //-1 means skip me
+                                       //are the distances the same
+                                       if (distance == prevDistance) { //save in vector
+                                               seqDist temp(firstName, secondName, distance);
+                                               sameSeqs.push_back(temp);
+                                       }else{  
+                                               break;
+                                       }
+                               }
+                       }
+               }
+               filehandle.close();
+               
+               //randomize matching dists
+               random_shuffle(sameSeqs.begin(), sameSeqs.end());
+               
+               //can only return one value since once these are merged the other distances in sameSeqs may have changed
+               vector<seqDist> temp;
+               if (sameSeqs.size() > 0) {  temp.push_back(sameSeqs[0]);  }
+               
+               return temp;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "HCluster", "getSeqsAN");
+               exit(1);
+       }
+}
+
+/***********************************************************************/
+int HCluster::combineFile() {
+       try {
+               //int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
+               //char* inputBuffer;
+               //inputBuffer = new char[bufferSize];
+               //size_t numRead;
+               
+               string tempDistFile = distfile + ".temp";
+               ofstream out;
+               openOutputFile(tempDistFile, out);
+               
+               //FILE* in;
+               //in = fopen(distfile.c_str(), "rb");
+       
+               ifstream in;
+               openInputFile(distfile, in);
+               
+               int first, second;
+               float dist;
+               
+               vector< map<int, float> > smallRowColValues;
+               smallRowColValues.resize(2);  //0 = row, 1 = col
+               int count = 0;
+                               
+               //go through file pulling out distances related to rows merging
+               //if mergedMin contains distances add those back into file
+               //bool done = false;
+               //partialDist = "";
+               //while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
+//cout << "number of char read = " << numRead << endl;
+//cout << inputBuffer << endl;
+                       //if (numRead < bufferSize) { done = true; }
+                       
+                       //parse input into individual distances
+                       //int spot = 0;
+                       //string outputString = "";
+                       //while(spot < numRead) {
+       //cout << "spot = " << spot << endl;
+                        //  seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
+                          
+                          //you read a partial distance
+                         // if (nextDist.seq1 == -1) { break;  }
+                       while (!in.eof()) {
+                               //first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
+       //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;                   
+                          //since file is sorted and mergedMin is sorted 
+                          //you can put the smallest distance from each through the code below and keep the file sorted
+                          
+                          in >> first >> second >> dist; gobble(in);
+                          
+                          if (m->control_pressed) { in.close(); out.close(); remove(tempDistFile.c_str()); return 0; }
+                          
+                          //while there are still values in mergedMin that are smaller than the distance read from file
+                          while (count < mergedMin.size())  {
+                          
+                                       //is the distance in mergedMin smaller than from the file
+                                       if (mergedMin[count].dist < dist) {
+                                       //is this a distance related to the columns merging?
+                                       //if yes, save in memory
+                                               if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
+                                               }else if (mergedMin[count].seq1 == smallCol) {
+                                                       smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
+                                               }else if (mergedMin[count].seq2 == smallCol) {
+                                                       smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
+                                               }else if (mergedMin[count].seq1 == smallRow) {
+                                                       smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
+                                               }else if (mergedMin[count].seq2 == smallRow) {
+                                                       smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
+                                               }else { //if no, write to temp file
+                                                       //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
+                                                       out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+                                               }
+                                               count++;
+                                       }else{   break; }
+                          }
+                          
+                          //is this a distance related to the columns merging?
+                          //if yes, save in memory
+                          if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
+                          }else if (first == smallCol) {
+                                       smallRowColValues[1][second] = dist;
+                          }else if (second == smallCol) {
+                                       smallRowColValues[1][first] = dist;
+                          }else if (first == smallRow) {
+                                       smallRowColValues[0][second] = dist;
+                          }else if (second == smallRow) {
+                                       smallRowColValues[0][first] = dist;
+                          
+                          }else { //if no, write to temp file
+                                       //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
+                                       out << first << '\t' << second << '\t' << dist << endl;
+                          }
+                       }
+                       
+                       //out << outputString;
+                       //if(done) { break; }
+               //}
+               //fclose(in);
+               in.close();
+               
+               //if values in mergedMin are larger than the the largest in file then
+               while (count < mergedMin.size())  {  
+                       //is this a distance related to the columns merging?
+                       //if yes, save in memory
+                       if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
+                       }else if (mergedMin[count].seq1 == smallCol) {
+                               smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
+                       }else if (mergedMin[count].seq2 == smallCol) {
+                               smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
+                       }else if (mergedMin[count].seq1 == smallRow) {
+                               smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
+                       }else if (mergedMin[count].seq2 == smallRow) {
+                               smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
+                               
+                       }else { //if no, write to temp file
+                               out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+                       }
+                       count++;
+               }
+               out.close();
+               mergedMin.clear();
+                       
+               //rename tempfile to distfile
+               remove(distfile.c_str());
+               rename(tempDistFile.c_str(), distfile.c_str());
+//cout << "remove = "<< renameOK << " rename = " << ok << endl;        
  
  
+               //merge clustered rows averaging the distances
+               map<int, float>::iterator itMerge;
+               map<int, float>::iterator it2Merge;
+               for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {                 
+                       //does smallRowColValues[1] have a distance to this seq too?
+                       it2Merge = smallRowColValues[1].find(itMerge->first);
+                       
+                       float average;
+                       if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
+                               //weighted average
+                               int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
+                               average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
+                               smallRowColValues[1].erase(it2Merge);
+                               
+                               seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
+                               mergedMin.push_back(temp);
+                       }
+               }
  
  
+               //sort merged values
+               sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);      
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "HCluster", "combineFile");
+               exit(1);
+       }
+}
+/***********************************************************************
+seqDist HCluster::getNextDist(char* buffer, int& index, int size){
+       try {
+               seqDist next;
+               int indexBefore = index;
+               string first, second, distance;
+               first = ""; second = ""; distance = "";
+               int tabCount = 0;
+//cout << "partial = " << partialDist << endl;         
+               if (partialDist != "") { //read what you can, you know it is less than a whole distance.
+                       for (int i = 0; i < partialDist.size(); i++) {
+                               if (tabCount == 0) {
+                                       if (partialDist[i] == '\t') { tabCount++; }
+                                       else {  first += partialDist[i];        }
+                               }else if (tabCount == 1) {
+                                       if (partialDist[i] == '\t') { tabCount++; }
+                                       else {  second += partialDist[i];       }
+                               }else if (tabCount == 2) {
+                                       distance +=  partialDist[i];
+                               }
+                       }
+                       partialDist = "";
+               }
+       
+               //try to get another distance
+               bool gotDist = false;
+               while (index < size) {
+                       if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
+                               gotDist = true;
+                               
+                               //gobble space
+                               while (index < size) {          
+                                       if (isspace(buffer[index])) { index++; }
+                                       else { break; }         
+                               }
+                               break;
+                       }else{
+                               if (tabCount == 0) {
+                                       if (buffer[index] == '\t') { tabCount++; }
+                                       else {  first += buffer[index]; }
+                               }else if (tabCount == 1) {
+                                       if (buffer[index] == '\t') { tabCount++; }
+                                       else {  second += buffer[index];        }
+                               }else if (tabCount == 2) {
+                                       distance +=  buffer[index];
+                               }
+                               index++;
+                       }
+               }
+               
+               //there was not a whole distance in the buffer, ie. buffer = "1 2       0.01    2       3       0."
+               //then you want to save the partial distance.
+               if (!gotDist) {
+                       for (int i = indexBefore; i < size; i++) {
+                               partialDist += buffer[i];
+                       }
+                       index = size + 1;
+                       next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
+               }else{
+                       int firstname, secondname;
+                       float dist;
+                       
+                       convert(first, firstname);
+                       convert(second, secondname);
+                       convert(distance, dist);
+                       
+                       next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
+               }
+                                               
+               return next;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "HCluster", "getNextDist");
+               exit(1);
+       }
+}
+/***********************************************************************/
+int HCluster::processFile() {
+       try {
+               string firstName, secondName;
+               float distance;
+               
+               ifstream in;
+               openInputFile(distfile, in);
+               
+               ofstream out;
+               string outTemp = distfile + ".temp";
+               openOutputFile(outTemp, out);
+       
+               //get entry
+               while (!in.eof()) {
+                       if (m->control_pressed) { in.close(); out.close(); remove(outTemp.c_str()); return 0; }
+                       
+                       in >> firstName >> secondName >> distance;    gobble(in);               
+                       
+                       map<string,int>::iterator itA = nameMap->find(firstName);
+                       map<string,int>::iterator itB = nameMap->find(secondName);
+                       if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
+                       if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+               
+                       //using cutoff
+                       if (distance > cutoff) { break; }
+               
+                       if (distance != -1) { //-1 means skip me
+                               out << itA->second << '\t' << itB->second << '\t' << distance << endl;
+                       }
+               }
+               
+               in.close();
+               out.close();
+               
+               remove(distfile.c_str());
+               rename(outTemp.c_str(), distfile.c_str());
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "HCluster", "processFile");
+               exit(1);
+       }
+}
  /***********************************************************************/
  
  
  /***********************************************************************/
  
  
+
+
+
+
+
+