hcluster.cpp

   1 /*
   2  *  hcluster.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 10/13/09.
   6  *  Copyright 2009 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "hcluster.h"
  11 #include "rabundvector.hpp"
  12 #include "listvector.hpp"
  13 #include "sparsematrix.hpp"
  14
  15 /***********************************************************************/
  16 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(m), distfile(d), nameMap(n), cutoff(c) {
  17         try {
  18                 mapWanted = false;
  19                 exitedBreak = false;
  20                 numSeqs = list->getNumSeqs();
  21
  22                 //initialize cluster array
  23                 for (int i = 0; i < numSeqs; i++) {
  24                         clusterNode temp(1, -1, i);
  25                         clusterArray.push_back(temp);
  26                 }
  27
  28                 if (method != "average") {
  29                         openInputFile(distfile, filehandle);
  30                 }else{
  31                         processFile();
  32                 }
  33         }
  34         catch(exception& e) {
  35                 errorOut(e, "HCluster", "HCluster");
  36                 exit(1);
  37         }
  38 }
  39 /***********************************************************************/
  40
  41 void HCluster::clusterBins(){
  42         try {
  43                 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
  44
  45                 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));
  46                 rabund->set(clusterArray[smallRow].smallChild, 0);
  47                 rabund->setLabel(toString(smallDist));
  48
  49                 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
  50         }
  51         catch(exception& e) {
  52                 errorOut(e, "HCluster", "clusterBins");
  53                 exit(1);
  54         }
  55
  56
  57 }
  58
  59 /***********************************************************************/
  60
  61 void HCluster::clusterNames(){
  62         try {
  63                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
  64                 if (mapWanted) {  updateMap();  }
  65
  66                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
  67                 list->set(clusterArray[smallRow].smallChild, "");
  68                 list->setLabel(toString(smallDist));
  69
  70                 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
  71
  72     }
  73         catch(exception& e) {
  74                 errorOut(e, "HCluster", "clusterNames");
  75                 exit(1);
  76         }
  77
  78 }
  79 /***********************************************************************/
  80 int HCluster::getUpmostParent(int node){
  81         try {
  82                 while (clusterArray[node].parent != -1) {
  83                         node = clusterArray[node].parent;
  84                 }
  85
  86                 return node;
  87         }
  88         catch(exception& e) {
  89                 errorOut(e, "HCluster", "getUpmostParent");
  90                 exit(1);
  91         }
  92 }
  93 /***********************************************************************/
  94 void HCluster::printInfo(){
  95         try {
  96
  97                 cout << "link table" << endl;
  98                 for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
  99                         cout << itActive->first << " = " << itActive->second << endl;
 100                 }
 101                 cout << endl;
 102                 for (int i = 0; i < linkTable.size(); i++) {
 103                         cout << i << '\t';
 104                         for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
 105                                 cout << it->first << '-' << it->second << '\t' ;
 106                         }
 107                         cout << endl;
 108                 }
 109                 cout << endl << "clusterArray" << endl;
 110
 111                 for (int i = 0; i < clusterArray.size(); i++) {
 112                         cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
 113                 }
 114                 cout << endl;
 115
 116
 117         }
 118         catch(exception& e) {
 119                 errorOut(e, "HCluster", "getUpmostParent");
 120                 exit(1);
 121         }
 122 }
 123 /***********************************************************************/
 124 int HCluster::makeActive() {
 125         try {
 126                 int linkValue = 1;
 127
 128                 itActive = activeLinks.find(smallRow);
 129                 it2Active = activeLinks.find(smallCol);
 130
 131                 if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
 132                         int size = linkTable.size();
 133                         map<int, int> temp; map<int, int> temp2;
 134
 135                         //add link to eachother
 136                         temp[smallRow] = 1;                                                     //         1    2
 137                         temp2[smallCol] = 1;                                            // 1   0        1
 138                                                                                                                 // 2   1        0
 139                         linkTable.push_back(temp);
 140                         linkTable.push_back(temp2);
 141
 142                         //add to activeLinks
 143                         activeLinks[smallRow] = size;
 144                         activeLinks[smallCol] = size+1;
 145
 146                 }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) {  //smallRow is active, smallCol is not
 147                          int size = linkTable.size();
 148                          int alreadyActiveRow = itActive->second;
 149                          map<int, int> temp;
 150
 151                         //add link to eachother
 152                         temp[smallRow] = 1;                                                                     //         6    2       3       5
 153                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
 154                         linkTable[alreadyActiveRow][smallCol] = 1;                      // 2   1        0       1       1
 155                                                                                                                                 // 3   2        1       0       0
 156                                                                                                                                 // 5   0    1   0   0
 157                         //add to activeLinks
 158                         activeLinks[smallCol] = size;
 159
 160                 }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) {  //smallCol is active, smallRow is not
 161                          int size = linkTable.size();
 162                          int alreadyActiveCol = it2Active->second;
 163                          map<int, int> temp;
 164
 165                         //add link to eachother
 166                         temp[smallCol] = 1;                                                                     //         6    2       3       5
 167                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
 168                         linkTable[alreadyActiveCol][smallRow] = 1;                      // 2   1        0       1       1
 169                                                                                                                                 // 3   2        1       0       0
 170                                                                                                                                 // 5   0    1   0   0
 171                         //add to activeLinks
 172                         activeLinks[smallRow] = size;
 173
 174                 }else { //both are active so add one
 175                         int row = itActive->second;
 176                         int col = it2Active->second;
 177
 178
 179                         linkTable[row][smallCol]++;
 180                         linkTable[col][smallRow]++;
 181                         linkValue = linkTable[row][smallCol];
 182                 }
 183
 184                 return linkValue;
 185         }
 186         catch(exception& e) {
 187                 errorOut(e, "HCluster", "makeActive");
 188                 exit(1);
 189         }
 190 }
 191 /***********************************************************************/
 192 void HCluster::updateArrayandLinkTable() {
 193         try {
 194                 //if cluster was made update clusterArray and linkTable
 195                 int size = clusterArray.size();
 196
 197                 //add new node
 198                 clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
 199                 clusterArray.push_back(temp);
 200
 201                 //update child nodes
 202                 clusterArray[smallRow].parent = size;
 203                 clusterArray[smallCol].parent = size;
 204
 205                 if (method == "furthest") {
 206
 207                         //update linkTable by merging clustered rows and columns
 208                         int rowSpot = activeLinks[smallRow];
 209                         int colSpot = activeLinks[smallCol];
 210
 211                         //fix old rows
 212                         for (int i = 0; i < linkTable.size(); i++) {
 213                                 //check if they are in map
 214                                 it = linkTable[i].find(smallRow);
 215                                 it2 = linkTable[i].find(smallCol);
 216
 217                                 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
 218                                         linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
 219                                         linkTable[i].erase(smallCol); //delete col row
 220                                         linkTable[i].erase(smallRow); //delete col row
 221                                 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
 222                                         linkTable[i][size] = linkTable[i][smallCol];
 223                                         linkTable[i].erase(smallCol); //delete col
 224                                 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
 225                                         linkTable[i][size] = linkTable[i][smallRow];
 226                                         linkTable[i].erase(smallRow); //delete col
 227                                 }
 228                         }
 229
 230                         //merge their values
 231                         for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
 232                                 it2 = linkTable[colSpot].find(it->first);  //does the col also have this
 233
 234                                 if (it2 == linkTable[colSpot].end()) { //not there so add it
 235                                         linkTable[colSpot][it->first] = it->second;
 236                                 }else { //merge them
 237                                         linkTable[colSpot][it->first] = it->second + it2->second;
 238                                 }
 239                         }
 240
 241                         linkTable[colSpot].erase(size);
 242                         linkTable.erase(linkTable.begin()+rowSpot);  //delete row
 243
 244                         //update activerows
 245                         activeLinks.erase(smallRow);
 246                         activeLinks.erase(smallCol);
 247                         activeLinks[size] = colSpot;
 248
 249                         //adjust everybody elses spot since you deleted - time vs. space
 250                         for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
 251                                 if (itActive->second > rowSpot) {  activeLinks[itActive->first]--;      }
 252                         }
 253                 }
 254         }
 255         catch(exception& e) {
 256                 errorOut(e, "HCluster", "updateArrayandLinkTable");
 257                 exit(1);
 258         }
 259 }
 260 /***********************************************************************/
 261 bool HCluster::update(int row, int col, float distance){
 262         try {
 263                 bool cluster = false;
 264                 smallRow = row;
 265                 smallCol = col;
 266                 smallDist = distance;
 267
 268                 //find upmost parent of row and col
 269                 smallRow = getUpmostParent(smallRow);
 270                 smallCol = getUpmostParent(smallCol);
 271
 272                 //you don't want to cluster with yourself
 273                 if (smallRow != smallCol) {
 274
 275                         if (method != "average") {
 276                                 //can we cluster???
 277                                 if (method == "nearest") { cluster = true;  }
 278                                 else{ //assume furthest
 279                                         //are they active in the link table
 280                                         int linkValue = makeActive(); //after this point this nodes info is active in linkTable
 281                                         if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {             cluster = true;         }
 282                                 }
 283
 284                                 if (cluster) {
 285                                         updateArrayandLinkTable();
 286                                         clusterBins();
 287                                         clusterNames();
 288                                 }
 289                         }else {
 290                                 cluster = true;
 291                                 updateArrayandLinkTable();
 292                                 clusterBins();
 293                                 clusterNames();
 294                                 combineFile();
 295                         }
 296                 }
 297
 298                 return cluster;
 299                 //printInfo();
 300         }
 301         catch(exception& e) {
 302                 errorOut(e, "HCluster", "update");
 303                 exit(1);
 304         }
 305 }
 306 /***********************************************************************/
 307 void HCluster::setMapWanted(bool m)  {
 308         try {
 309                 mapWanted = m;
 310
 311                 //initialize map
 312                 for (int i = 0; i < list->getNumBins(); i++) {
 313
 314                         //parse bin
 315                         string names = list->get(i);
 316                         while (names.find_first_of(',') != -1) {
 317                                 //get name from bin
 318                                 string name = names.substr(0,names.find_first_of(','));
 319                                 //save name and bin number
 320                                 seq2Bin[name] = i;
 321                                 names = names.substr(names.find_first_of(',')+1, names.length());
 322                         }
 323
 324                         //get last name
 325                         seq2Bin[names] = i;
 326                 }
 327
 328         }
 329         catch(exception& e) {
 330                 errorOut(e, "HCluster", "setMapWanted");
 331                 exit(1);
 332         }
 333 }
 334 /***********************************************************************/
 335 void HCluster::updateMap() {
 336 try {
 337                 //update location of seqs in smallRow since they move to smallCol now
 338                 string names = list->get(clusterArray[smallRow].smallChild);
 339                 while (names.find_first_of(',') != -1) {
 340                         //get name from bin
 341                         string name = names.substr(0,names.find_first_of(','));
 342                         //save name and bin number
 343                         seq2Bin[name] = clusterArray[smallCol].smallChild;
 344                         names = names.substr(names.find_first_of(',')+1, names.length());
 345                 }
 346
 347                 //get last name
 348                 seq2Bin[names] = clusterArray[smallCol].smallChild;
 349         }
 350         catch(exception& e) {
 351                 errorOut(e, "HCluster", "updateMap");
 352                 exit(1);
 353         }
 354 }
 355 //**********************************************************************************************************************
 356 vector<seqDist> HCluster::getSeqs(){
 357         try {
 358                 vector<seqDist> sameSeqs;
 359
 360                 if(method != "average") {
 361                         sameSeqs = getSeqsFNNN();
 362                 }else{
 363                         sameSeqs = getSeqsAN();
 364                 }
 365
 366                 return sameSeqs;
 367         }
 368         catch(exception& e) {
 369                 errorOut(e, "HCluster", "getSeqs");
 370                 exit(1);
 371         }
 372 }
 373 //**********************************************************************************************************************
 374 vector<seqDist> HCluster::getSeqsFNNN(){
 375         try {
 376                 string firstName, secondName;
 377                 float distance, prevDistance;
 378                 vector<seqDist> sameSeqs;
 379                 prevDistance = -1;
 380
 381                 //if you are not at the beginning of the file
 382                 if (exitedBreak) {
 383                         sameSeqs.push_back(next);
 384                         prevDistance = next.dist;
 385                         exitedBreak = false;
 386                 }
 387
 388                 //get entry
 389                 while (!filehandle.eof()) {
 390
 391                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
 392
 393                         //save first one
 394                         if (prevDistance == -1) { prevDistance = distance; }
 395
 396                         map<string,int>::iterator itA = nameMap->find(firstName);
 397                         map<string,int>::iterator itB = nameMap->find(secondName);
 398                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
 399                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
 400
 401                         //using cutoff
 402                         if (distance > cutoff) { break; }
 403
 404                         if (distance != -1) { //-1 means skip me
 405
 406                                 //are the distances the same
 407                                 if (distance == prevDistance) { //save in vector
 408                                         seqDist temp(itA->second, itB->second, distance);
 409                                         sameSeqs.push_back(temp);
 410                                         exitedBreak = false;
 411                                 }else{
 412                                         next.seq1 = itA->second;
 413                                         next.seq2 = itB->second;
 414                                         next.dist = distance;
 415                                         exitedBreak = true;
 416                                         break;
 417                                 }
 418                         }
 419                 }
 420
 421                 //rndomize matching dists
 422                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
 423
 424                 return sameSeqs;
 425         }
 426         catch(exception& e) {
 427                 errorOut(e, "HCluster", "getSeqsFNNN");
 428                 exit(1);
 429         }
 430 }
 431 //**********************************************************************************************************************
 432 //don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
 433 vector<seqDist> HCluster::getSeqsAN(){
 434         try {
 435                 int firstName, secondName;
 436                 float prevDistance;
 437                 vector<seqDist> sameSeqs;
 438                 prevDistance = -1;
 439
 440                 openInputFile(distfile, filehandle, "no error");
 441
 442                 //is the smallest value in mergedMin or the distfile?
 443                 float mergedMinDist = 10000;
 444                 float distance = 10000;
 445                 if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist;  }
 446
 447                 if (!filehandle.eof()) {
 448                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
 449                         //save first one
 450                         if (prevDistance == -1) { prevDistance = distance; }
 451                         if (distance != -1) { //-1 means skip me
 452                                 seqDist temp(firstName, secondName, distance);
 453                                 sameSeqs.push_back(temp);
 454                         }
 455                 }
 456
 457                 if (mergedMinDist < distance) { //get minimum distance from mergedMin
 458                         //remove distance we saved from file
 459                         sameSeqs.clear();
 460                         prevDistance = mergedMinDist;
 461
 462                         for (int i = 0; i < mergedMin.size(); i++) {
 463                                 if (mergedMin[i].dist == prevDistance) {
 464                                         sameSeqs.push_back(mergedMin[i]);
 465                                 }else { break; }
 466                         }
 467                 }else{ //get minimum from file
 468                         //get entry
 469                         while (!filehandle.eof()) {
 470
 471                                 filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
 472
 473                                 if (prevDistance == -1) { prevDistance = distance; }
 474
 475                                 if (distance != -1) { //-1 means skip me
 476                                         //are the distances the same
 477                                         if (distance == prevDistance) { //save in vector
 478                                                 seqDist temp(firstName, secondName, distance);
 479                                                 sameSeqs.push_back(temp);
 480                                         }else{
 481                                                 break;
 482                                         }
 483                                 }
 484                         }
 485                 }
 486                 filehandle.close();
 487
 488                 //randomize matching dists
 489                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
 490
 491                 //can only return one value since once these are merged the other distances in sameSeqs may have changed
 492                 vector<seqDist> temp;
 493                 if (sameSeqs.size() > 0) {  temp.push_back(sameSeqs[0]);  }
 494
 495                 return temp;
 496         }
 497         catch(exception& e) {
 498                 errorOut(e, "HCluster", "getSeqsAN");
 499                 exit(1);
 500         }
 501 }
 502
 503 /***********************************************************************/
 504 void HCluster::combineFile() {
 505         try {
 506                 int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
 507                 char* inputBuffer;
 508                 inputBuffer = new char[bufferSize];
 509                 size_t numRead;
 510
 511                 string tempDistFile = distfile + ".temp";
 512                 ofstream out;
 513                 openOutputFile(tempDistFile, out);
 514
 515                 FILE* in;
 516                 in = fopen(distfile.c_str(), "rb");
 517
 518                 int first, second;
 519                 float dist;
 520
 521                 vector< map<int, float> > smallRowColValues;
 522                 smallRowColValues.resize(2);  //0 = row, 1 = col
 523                 int count = 0;
 524
 525                 //go through file pulling out distances related to rows merging
 526                 //if mergedMin contains distances add those back into file
 527                 bool done = false;
 528                 partialDist = "";
 529                 while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
 530 //cout << "number of char read = " << numRead << endl;
 531 //cout << inputBuffer << endl;
 532                         if (numRead < bufferSize) { done = true; }
 533
 534                         //parse input into individual distances
 535                         int spot = 0;
 536                         string outputString = "";
 537                         while(spot < numRead) {
 538         //cout << "spot = " << spot << endl;
 539                            seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
 540
 541                            //you read a partial distance
 542                            if (nextDist.seq1 == -1) { break;  }
 543
 544                            first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
 545         //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;
 546                            //since file is sorted and mergedMin is sorted
 547                            //you can put the smallest distance from each through the code below and keep the file sorted
 548
 549                            //while there are still values in mergedMin that are smaller than the distance read from file
 550                            while (count < mergedMin.size())  {
 551
 552                                         //is the distance in mergedMin smaller than from the file
 553                                         if (mergedMin[count].dist < dist) {
 554                                         //is this a distance related to the columns merging?
 555                                         //if yes, save in memory
 556                                                 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
 557                                                 }else if (mergedMin[count].seq1 == smallCol) {
 558                                                         smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
 559                                                 }else if (mergedMin[count].seq2 == smallCol) {
 560                                                         smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
 561                                                 }else if (mergedMin[count].seq1 == smallRow) {
 562                                                         smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
 563                                                 }else if (mergedMin[count].seq2 == smallRow) {
 564                                                         smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
 565                                                 }else { //if no, write to temp file
 566                                                         outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
 567                                                 }
 568                                                 count++;
 569                                         }else{   break; }
 570                            }
 571
 572                            //is this a distance related to the columns merging?
 573                            //if yes, save in memory
 574                            if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
 575                            }else if (first == smallCol) {
 576                                         smallRowColValues[1][second] = dist;
 577                            }else if (second == smallCol) {
 578                                         smallRowColValues[1][first] = dist;
 579                            }else if (first == smallRow) {
 580                                         smallRowColValues[0][second] = dist;
 581                            }else if (second == smallRow) {
 582                                         smallRowColValues[0][first] = dist;
 583
 584                            }else { //if no, write to temp file
 585                                         outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
 586                            }
 587                         }
 588
 589                         out << outputString;
 590                         if(done) { break; }
 591                 }
 592                 fclose(in);
 593
 594                 //if values in mergedMin are larger than the the largest in file then
 595                 while (count < mergedMin.size())  {
 596                         //is this a distance related to the columns merging?
 597                         //if yes, save in memory
 598                         if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
 599                         }else if (mergedMin[count].seq1 == smallCol) {
 600                                 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
 601                         }else if (mergedMin[count].seq2 == smallCol) {
 602                                 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
 603                         }else if (mergedMin[count].seq1 == smallRow) {
 604                                 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
 605                         }else if (mergedMin[count].seq2 == smallRow) {
 606                                 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
 607
 608                         }else { //if no, write to temp file
 609                                 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
 610                         }
 611                         count++;
 612                 }
 613                 out.close();
 614                 mergedMin.clear();
 615
 616                 //rename tempfile to distfile
 617                 remove(distfile.c_str());
 618                 rename(tempDistFile.c_str(), distfile.c_str());
 619
 620                 //merge clustered rows averaging the distances
 621                 map<int, float>::iterator itMerge;
 622                 map<int, float>::iterator it2Merge;
 623                 for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {
 624                         //does smallRowColValues[1] have a distance to this seq too?
 625                         it2Merge = smallRowColValues[1].find(itMerge->first);
 626
 627                         float average;
 628                         if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
 629                                 //weighted average
 630                                 int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
 631                                 average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
 632                                 smallRowColValues[1].erase(it2Merge);
 633
 634                                 seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
 635                                 mergedMin.push_back(temp);
 636                         }
 637                 }
 638
 639                 //sort merged values
 640                 sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);
 641         }
 642         catch(exception& e) {
 643                 errorOut(e, "HCluster", "combineFile");
 644                 exit(1);
 645         }
 646 }
 647 /***********************************************************************/
 648 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
 649         try {
 650                 seqDist next;
 651                 int indexBefore = index;
 652                 string first, second, distance;
 653                 first = ""; second = ""; distance = "";
 654                 int tabCount = 0;
 655 //cout << "partial = " << partialDist << endl;
 656                 if (partialDist != "") { //read what you can, you know it is less than a whole distance.
 657                         for (int i = 0; i < partialDist.size(); i++) {
 658                                 if (tabCount == 0) {
 659                                         if (partialDist[i] == '\t') { tabCount++; }
 660                                         else {  first += partialDist[i];        }
 661                                 }else if (tabCount == 1) {
 662                                         if (partialDist[i] == '\t') { tabCount++; }
 663                                         else {  second += partialDist[i];       }
 664                                 }else if (tabCount == 2) {
 665                                         distance +=  partialDist[i];
 666                                 }
 667                         }
 668                         partialDist = "";
 669                 }
 670
 671                 //try to get another distance
 672                 bool gotDist = false;
 673                 while (index < size) {
 674                         if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
 675                                 gotDist = true;
 676
 677                                 //gobble space
 678                                 while (index < size) {
 679                                         if (isspace(buffer[index])) { index++; }
 680                                         else { break; }
 681                                 }
 682                                 break;
 683                         }else{
 684                                 if (tabCount == 0) {
 685                                         if (buffer[index] == '\t') { tabCount++; }
 686                                         else {  first += buffer[index]; }
 687                                 }else if (tabCount == 1) {
 688                                         if (buffer[index] == '\t') { tabCount++; }
 689                                         else {  second += buffer[index];        }
 690                                 }else if (tabCount == 2) {
 691                                         distance +=  buffer[index];
 692                                 }
 693                                 index++;
 694                         }
 695                 }
 696
 697                 //there was not a whole distance in the buffer, ie. buffer = "1 2       0.01    2       3       0."
 698                 //then you want to save the partial distance.
 699                 if (!gotDist) {
 700                         for (int i = indexBefore; i < size; i++) {
 701                                 partialDist += buffer[i];
 702                         }
 703                         index = size + 1;
 704                         next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
 705                 }else{
 706                         int firstname, secondname;
 707                         float dist;
 708
 709                         convert(first, firstname);
 710                         convert(second, secondname);
 711                         convert(distance, dist);
 712
 713                         next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
 714                 }
 715
 716                 return next;
 717         }
 718         catch(exception& e) {
 719                 errorOut(e, "HCluster", "getNextDist");
 720                 exit(1);
 721         }
 722 }
 723 /***********************************************************************/
 724 void HCluster::processFile() {
 725         try {
 726                 string firstName, secondName;
 727                 float distance;
 728
 729                 ifstream in;
 730                 openInputFile(distfile, in);
 731
 732                 ofstream out;
 733                 string outTemp = distfile + ".temp";
 734                 openOutputFile(outTemp, out);
 735
 736                 //get entry
 737                 while (!in.eof()) {
 738
 739                         in >> firstName >> secondName >> distance;    gobble(in);
 740
 741                         map<string,int>::iterator itA = nameMap->find(firstName);
 742                         map<string,int>::iterator itB = nameMap->find(secondName);
 743                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
 744                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
 745
 746                         //using cutoff
 747                         if (distance > cutoff) { break; }
 748
 749                         if (distance != -1) { //-1 means skip me
 750                                 out << itA->second << '\t' << itB->second << '\t' << distance << endl;
 751                         }
 752                 }
 753
 754                 in.close();
 755                 out.close();
 756
 757                 remove(distfile.c_str());
 758                 rename(outTemp.c_str(), distfile.c_str());
 759         }
 760         catch(exception& e) {
 761                 errorOut(e, "HCluster", "processFile");
 762                 exit(1);
 763         }
 764 }
 765 /***********************************************************************/
 766
 767
 768
 769
 770
 771
 772
 773