hcluster.cpp

   1 /*
   2  *  hcluster.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 10/13/09.
   6  *  Copyright 2009 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "hcluster.h"
  11 #include "rabundvector.hpp"
  12 #include "listvector.hpp"
  13 #include "sparsematrix.hpp"
  14
  15 /***********************************************************************/
  16 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(m), distfile(d), nameMap(n), cutoff(c) {
  17         try {
  18                 mapWanted = false;
  19                 exitedBreak = false;
  20                 numSeqs = list->getNumSeqs();
  21
  22                 //initialize cluster array
  23                 for (int i = 0; i < numSeqs; i++) {
  24                         clusterNode temp(1, -1, i);
  25                         clusterArray.push_back(temp);
  26                 }
  27
  28                 if (method != "average") {
  29                         openInputFile(distfile, filehandle);
  30                 }else{ firstRead = true; }
  31         }
  32         catch(exception& e) {
  33                 errorOut(e, "HCluster", "HCluster");
  34                 exit(1);
  35         }
  36 }
  37 /***********************************************************************/
  38
  39 void HCluster::clusterBins(){
  40         try {
  41                 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
  42
  43                 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));
  44                 rabund->set(clusterArray[smallRow].smallChild, 0);
  45                 rabund->setLabel(toString(smallDist));
  46
  47                 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
  48         }
  49         catch(exception& e) {
  50                 errorOut(e, "HCluster", "clusterBins");
  51                 exit(1);
  52         }
  53
  54
  55 }
  56
  57 /***********************************************************************/
  58
  59 void HCluster::clusterNames(){
  60         try {
  61                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
  62                 if (mapWanted) {  updateMap();  }
  63
  64                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
  65                 list->set(clusterArray[smallRow].smallChild, "");
  66                 list->setLabel(toString(smallDist));
  67
  68                 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
  69
  70     }
  71         catch(exception& e) {
  72                 errorOut(e, "HCluster", "clusterNames");
  73                 exit(1);
  74         }
  75
  76 }
  77 /***********************************************************************/
  78 int HCluster::getUpmostParent(int node){
  79         try {
  80                 while (clusterArray[node].parent != -1) {
  81                         node = clusterArray[node].parent;
  82                 }
  83
  84                 return node;
  85         }
  86         catch(exception& e) {
  87                 errorOut(e, "HCluster", "getUpmostParent");
  88                 exit(1);
  89         }
  90 }
  91 /***********************************************************************/
  92 void HCluster::printInfo(){
  93         try {
  94
  95                 cout << "link table" << endl;
  96                 for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
  97                         cout << itActive->first << " = " << itActive->second << endl;
  98                 }
  99                 cout << endl;
 100                 for (int i = 0; i < linkTable.size(); i++) {
 101                         cout << i << '\t';
 102                         for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
 103                                 cout << it->first << '-' << it->second << '\t' ;
 104                         }
 105                         cout << endl;
 106                 }
 107                 cout << endl << "clusterArray" << endl;
 108
 109                 for (int i = 0; i < clusterArray.size(); i++) {
 110                         cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
 111                 }
 112                 cout << endl;
 113
 114
 115         }
 116         catch(exception& e) {
 117                 errorOut(e, "HCluster", "getUpmostParent");
 118                 exit(1);
 119         }
 120 }
 121 /***********************************************************************/
 122 int HCluster::makeActive() {
 123         try {
 124                 int linkValue = 1;
 125
 126                 itActive = activeLinks.find(smallRow);
 127                 it2Active = activeLinks.find(smallCol);
 128
 129                 if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
 130                         int size = linkTable.size();
 131                         map<int, int> temp; map<int, int> temp2;
 132
 133                         //add link to eachother
 134                         temp[smallRow] = 1;                                                     //         1    2
 135                         temp2[smallCol] = 1;                                            // 1   0        1
 136                                                                                                                 // 2   1        0
 137                         linkTable.push_back(temp);
 138                         linkTable.push_back(temp2);
 139
 140                         //add to activeLinks
 141                         activeLinks[smallRow] = size;
 142                         activeLinks[smallCol] = size+1;
 143
 144                 }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) {  //smallRow is active, smallCol is not
 145                          int size = linkTable.size();
 146                          int alreadyActiveRow = itActive->second;
 147                          map<int, int> temp;
 148
 149                         //add link to eachother
 150                         temp[smallRow] = 1;                                                                     //         6    2       3       5
 151                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
 152                         linkTable[alreadyActiveRow][smallCol] = 1;                      // 2   1        0       1       1
 153                                                                                                                                 // 3   2        1       0       0
 154                                                                                                                                 // 5   0    1   0   0
 155                         //add to activeLinks
 156                         activeLinks[smallCol] = size;
 157
 158                 }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) {  //smallCol is active, smallRow is not
 159                          int size = linkTable.size();
 160                          int alreadyActiveCol = it2Active->second;
 161                          map<int, int> temp;
 162
 163                         //add link to eachother
 164                         temp[smallCol] = 1;                                                                     //         6    2       3       5
 165                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
 166                         linkTable[alreadyActiveCol][smallRow] = 1;                      // 2   1        0       1       1
 167                                                                                                                                 // 3   2        1       0       0
 168                                                                                                                                 // 5   0    1   0   0
 169                         //add to activeLinks
 170                         activeLinks[smallRow] = size;
 171
 172                 }else { //both are active so add one
 173                         int row = itActive->second;
 174                         int col = it2Active->second;
 175
 176
 177                         linkTable[row][smallCol]++;
 178                         linkTable[col][smallRow]++;
 179                         linkValue = linkTable[row][smallCol];
 180                 }
 181
 182                 return linkValue;
 183         }
 184         catch(exception& e) {
 185                 errorOut(e, "HCluster", "makeActive");
 186                 exit(1);
 187         }
 188 }
 189 /***********************************************************************/
 190 void HCluster::updateArrayandLinkTable() {
 191         try {
 192                 //if cluster was made update clusterArray and linkTable
 193                 int size = clusterArray.size();
 194
 195                 //add new node
 196                 clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
 197                 clusterArray.push_back(temp);
 198
 199                 //update child nodes
 200                 clusterArray[smallRow].parent = size;
 201                 clusterArray[smallCol].parent = size;
 202
 203                 if (method == "furthest") {
 204
 205                         //update linkTable by merging clustered rows and columns
 206                         int rowSpot = activeLinks[smallRow];
 207                         int colSpot = activeLinks[smallCol];
 208
 209                         //fix old rows
 210                         for (int i = 0; i < linkTable.size(); i++) {
 211                                 //check if they are in map
 212                                 it = linkTable[i].find(smallRow);
 213                                 it2 = linkTable[i].find(smallCol);
 214
 215                                 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
 216                                         linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
 217                                         linkTable[i].erase(smallCol); //delete col row
 218                                         linkTable[i].erase(smallRow); //delete col row
 219                                 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
 220                                         linkTable[i][size] = linkTable[i][smallCol];
 221                                         linkTable[i].erase(smallCol); //delete col
 222                                 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
 223                                         linkTable[i][size] = linkTable[i][smallRow];
 224                                         linkTable[i].erase(smallRow); //delete col
 225                                 }
 226                         }
 227
 228                         //merge their values
 229                         for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
 230                                 it2 = linkTable[colSpot].find(it->first);  //does the col also have this
 231
 232                                 if (it2 == linkTable[colSpot].end()) { //not there so add it
 233                                         linkTable[colSpot][it->first] = it->second;
 234                                 }else { //merge them
 235                                         linkTable[colSpot][it->first] = it->second + it2->second;
 236                                 }
 237                         }
 238
 239                         linkTable[colSpot].erase(size);
 240                         linkTable.erase(linkTable.begin()+rowSpot);  //delete row
 241
 242                         //update activerows
 243                         activeLinks.erase(smallRow);
 244                         activeLinks.erase(smallCol);
 245                         activeLinks[size] = colSpot;
 246
 247                         //adjust everybody elses spot since you deleted - time vs. space
 248                         for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
 249                                 if (itActive->second > rowSpot) {  activeLinks[itActive->first]--;      }
 250                         }
 251                 }
 252         }
 253         catch(exception& e) {
 254                 errorOut(e, "HCluster", "updateArrayandLinkTable");
 255                 exit(1);
 256         }
 257 }
 258 /***********************************************************************/
 259 bool HCluster::update(int row, int col, float distance){
 260         try {
 261                 bool cluster = false;
 262                 smallRow = row;
 263                 smallCol = col;
 264                 smallDist = distance;
 265
 266                 //find upmost parent of row and col
 267                 smallRow = getUpmostParent(smallRow);
 268                 smallCol = getUpmostParent(smallCol);
 269
 270                 //you don't want to cluster with yourself
 271                 if (smallRow != smallCol) {
 272
 273                         if (method != "average") {
 274                                 //can we cluster???
 275                                 if (method == "nearest") { cluster = true;  }
 276                                 else{ //assume furthest
 277                                         //are they active in the link table
 278                                         int linkValue = makeActive(); //after this point this nodes info is active in linkTable
 279                                         if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {             cluster = true;         }
 280                                 }
 281
 282                                 if (cluster) {
 283                                         updateArrayandLinkTable();
 284                                         clusterBins();
 285                                         clusterNames();
 286                                 }
 287                         }else {
 288                                 cluster = true;
 289                                 updateArrayandLinkTable();
 290                                 clusterBins();
 291                                 clusterNames();
 292                                 combineFile();
 293                         }
 294                 }
 295
 296                 return cluster;
 297                 //printInfo();
 298         }
 299         catch(exception& e) {
 300                 errorOut(e, "HCluster", "update");
 301                 exit(1);
 302         }
 303 }
 304 /***********************************************************************/
 305 void HCluster::setMapWanted(bool m)  {
 306         try {
 307                 mapWanted = m;
 308
 309                 //initialize map
 310                 for (int i = 0; i < list->getNumBins(); i++) {
 311
 312                         //parse bin
 313                         string names = list->get(i);
 314                         while (names.find_first_of(',') != -1) {
 315                                 //get name from bin
 316                                 string name = names.substr(0,names.find_first_of(','));
 317                                 //save name and bin number
 318                                 seq2Bin[name] = i;
 319                                 names = names.substr(names.find_first_of(',')+1, names.length());
 320                         }
 321
 322                         //get last name
 323                         seq2Bin[names] = i;
 324                 }
 325
 326         }
 327         catch(exception& e) {
 328                 errorOut(e, "HCluster", "setMapWanted");
 329                 exit(1);
 330         }
 331 }
 332 /***********************************************************************/
 333 void HCluster::updateMap() {
 334 try {
 335                 //update location of seqs in smallRow since they move to smallCol now
 336                 string names = list->get(clusterArray[smallRow].smallChild);
 337                 while (names.find_first_of(',') != -1) {
 338                         //get name from bin
 339                         string name = names.substr(0,names.find_first_of(','));
 340                         //save name and bin number
 341                         seq2Bin[name] = clusterArray[smallCol].smallChild;
 342                         names = names.substr(names.find_first_of(',')+1, names.length());
 343                 }
 344
 345                 //get last name
 346                 seq2Bin[names] = clusterArray[smallCol].smallChild;
 347         }
 348         catch(exception& e) {
 349                 errorOut(e, "HCluster", "updateMap");
 350                 exit(1);
 351         }
 352 }
 353 //**********************************************************************************************************************
 354 vector<seqDist> HCluster::getSeqs(){
 355         try {
 356                 vector<seqDist> sameSeqs;
 357
 358                 if(method != "average") {
 359                         sameSeqs = getSeqsFNNN();
 360                 }else{
 361                         if (firstRead) {        processFile();   }
 362                         sameSeqs = getSeqsAN();
 363                 }
 364
 365                 return sameSeqs;
 366         }
 367         catch(exception& e) {
 368                 errorOut(e, "HCluster", "getSeqs");
 369                 exit(1);
 370         }
 371 }
 372 //**********************************************************************************************************************
 373 vector<seqDist> HCluster::getSeqsFNNN(){
 374         try {
 375                 string firstName, secondName;
 376                 float distance, prevDistance;
 377                 vector<seqDist> sameSeqs;
 378                 prevDistance = -1;
 379
 380                 //if you are not at the beginning of the file
 381                 if (exitedBreak) {
 382                         sameSeqs.push_back(next);
 383                         prevDistance = next.dist;
 384                         exitedBreak = false;
 385                 }
 386
 387                 //get entry
 388                 while (!filehandle.eof()) {
 389
 390                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
 391
 392                         //save first one
 393                         if (prevDistance == -1) { prevDistance = distance; }
 394
 395                         map<string,int>::iterator itA = nameMap->find(firstName);
 396                         map<string,int>::iterator itB = nameMap->find(secondName);
 397                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
 398                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
 399
 400                         //using cutoff
 401                         if (distance > cutoff) { break; }
 402
 403                         if (distance != -1) { //-1 means skip me
 404
 405                                 //are the distances the same
 406                                 if (distance == prevDistance) { //save in vector
 407                                         seqDist temp(itA->second, itB->second, distance);
 408                                         sameSeqs.push_back(temp);
 409                                         exitedBreak = false;
 410                                 }else{
 411                                         next.seq1 = itA->second;
 412                                         next.seq2 = itB->second;
 413                                         next.dist = distance;
 414                                         exitedBreak = true;
 415                                         break;
 416                                 }
 417                         }
 418                 }
 419
 420                 //rndomize matching dists
 421                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
 422
 423                 return sameSeqs;
 424         }
 425         catch(exception& e) {
 426                 errorOut(e, "HCluster", "getSeqsFNNN");
 427                 exit(1);
 428         }
 429 }
 430 //**********************************************************************************************************************
 431 //don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
 432 vector<seqDist> HCluster::getSeqsAN(){
 433         try {
 434                 int firstName, secondName;
 435                 float prevDistance;
 436                 vector<seqDist> sameSeqs;
 437                 prevDistance = -1;
 438
 439                 openInputFile(distfile, filehandle, "no error");
 440
 441                 //is the smallest value in mergedMin or the distfile?
 442                 float mergedMinDist = 10000;
 443                 float distance = 10000;
 444                 if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist;  }
 445
 446                 if (!filehandle.eof()) {
 447                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
 448                         //save first one
 449                         if (prevDistance == -1) { prevDistance = distance; }
 450                         if (distance != -1) { //-1 means skip me
 451                                 seqDist temp(firstName, secondName, distance);
 452                                 sameSeqs.push_back(temp);
 453                         }
 454                 }
 455
 456                 if (mergedMinDist < distance) { //get minimum distance from mergedMin
 457                         //remove distance we saved from file
 458                         sameSeqs.clear();
 459                         prevDistance = mergedMinDist;
 460
 461                         for (int i = 0; i < mergedMin.size(); i++) {
 462                                 if (mergedMin[i].dist == prevDistance) {
 463                                         sameSeqs.push_back(mergedMin[i]);
 464                                 }else { break; }
 465                         }
 466                 }else{ //get minimum from file
 467                         //get entry
 468                         while (!filehandle.eof()) {
 469
 470                                 filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
 471
 472                                 if (prevDistance == -1) { prevDistance = distance; }
 473
 474                                 if (distance != -1) { //-1 means skip me
 475                                         //are the distances the same
 476                                         if (distance == prevDistance) { //save in vector
 477                                                 seqDist temp(firstName, secondName, distance);
 478                                                 sameSeqs.push_back(temp);
 479                                         }else{
 480                                                 break;
 481                                         }
 482                                 }
 483                         }
 484                 }
 485                 filehandle.close();
 486
 487                 //randomize matching dists
 488                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
 489
 490                 //can only return one value since once these are merged the other distances in sameSeqs may have changed
 491                 vector<seqDist> temp;
 492                 if (sameSeqs.size() > 0) {  temp.push_back(sameSeqs[0]);  }
 493
 494                 return temp;
 495         }
 496         catch(exception& e) {
 497                 errorOut(e, "HCluster", "getSeqsAN");
 498                 exit(1);
 499         }
 500 }
 501
 502 /***********************************************************************/
 503 void HCluster::combineFile() {
 504         try {
 505                 int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
 506                 char* inputBuffer;
 507                 inputBuffer = new char[bufferSize];
 508                 size_t numRead;
 509
 510                 string tempDistFile = distfile + ".temp";
 511                 ofstream out;
 512                 openOutputFile(tempDistFile, out);
 513
 514                 FILE* in;
 515                 in = fopen(distfile.c_str(), "rb");
 516
 517                 int first, second;
 518                 float dist;
 519
 520                 vector< map<int, float> > smallRowColValues;
 521                 smallRowColValues.resize(2);  //0 = row, 1 = col
 522                 int count = 0;
 523
 524                 //go through file pulling out distances related to rows merging
 525                 //if mergedMin contains distances add those back into file
 526                 bool done = false;
 527                 partialDist = "";
 528                 while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
 529 //cout << "number of char read = " << numRead << endl;
 530 //cout << inputBuffer << endl;
 531                         if (numRead < bufferSize) { done = true; }
 532
 533                         //parse input into individual distances
 534                         int spot = 0;
 535                         string outputString = "";
 536                         while(spot < numRead) {
 537         //cout << "spot = " << spot << endl;
 538                            seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
 539
 540                            //you read a partial distance
 541                            if (nextDist.seq1 == -1) { break;  }
 542
 543                            first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
 544         //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;
 545                            //since file is sorted and mergedMin is sorted
 546                            //you can put the smallest distance from each through the code below and keep the file sorted
 547
 548                            //while there are still values in mergedMin that are smaller than the distance read from file
 549                            while (count < mergedMin.size())  {
 550
 551                                         //is the distance in mergedMin smaller than from the file
 552                                         if (mergedMin[count].dist < dist) {
 553                                         //is this a distance related to the columns merging?
 554                                         //if yes, save in memory
 555                                                 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
 556                                                 }else if (mergedMin[count].seq1 == smallCol) {
 557                                                         smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
 558                                                 }else if (mergedMin[count].seq2 == smallCol) {
 559                                                         smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
 560                                                 }else if (mergedMin[count].seq1 == smallRow) {
 561                                                         smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
 562                                                 }else if (mergedMin[count].seq2 == smallRow) {
 563                                                         smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
 564                                                 }else { //if no, write to temp file
 565                                                         outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
 566                                                 }
 567                                                 count++;
 568                                         }else{   break; }
 569                            }
 570
 571                            //is this a distance related to the columns merging?
 572                            //if yes, save in memory
 573                            if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
 574                            }else if (first == smallCol) {
 575                                         smallRowColValues[1][second] = dist;
 576                            }else if (second == smallCol) {
 577                                         smallRowColValues[1][first] = dist;
 578                            }else if (first == smallRow) {
 579                                         smallRowColValues[0][second] = dist;
 580                            }else if (second == smallRow) {
 581                                         smallRowColValues[0][first] = dist;
 582
 583                            }else { //if no, write to temp file
 584                                         outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
 585                            }
 586                         }
 587
 588                         out << outputString;
 589                         if(done) { break; }
 590                 }
 591                 fclose(in);
 592
 593                 //if values in mergedMin are larger than the the largest in file then
 594                 while (count < mergedMin.size())  {
 595                         //is this a distance related to the columns merging?
 596                         //if yes, save in memory
 597                         if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
 598                         }else if (mergedMin[count].seq1 == smallCol) {
 599                                 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
 600                         }else if (mergedMin[count].seq2 == smallCol) {
 601                                 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
 602                         }else if (mergedMin[count].seq1 == smallRow) {
 603                                 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
 604                         }else if (mergedMin[count].seq2 == smallRow) {
 605                                 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
 606
 607                         }else { //if no, write to temp file
 608                                 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
 609                         }
 610                         count++;
 611                 }
 612                 out.close();
 613                 mergedMin.clear();
 614
 615                 //rename tempfile to distfile
 616                 remove(distfile.c_str());
 617                 rename(tempDistFile.c_str(), distfile.c_str());
 618
 619                 //merge clustered rows averaging the distances
 620                 map<int, float>::iterator itMerge;
 621                 map<int, float>::iterator it2Merge;
 622                 for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {
 623                         //does smallRowColValues[1] have a distance to this seq too?
 624                         it2Merge = smallRowColValues[1].find(itMerge->first);
 625
 626                         float average;
 627                         if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
 628                                 //weighted average
 629                                 int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
 630                                 average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
 631                                 smallRowColValues[1].erase(it2Merge);
 632
 633                                 seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
 634                                 mergedMin.push_back(temp);
 635                         }
 636                 }
 637
 638                 //sort merged values
 639                 sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);
 640         }
 641         catch(exception& e) {
 642                 errorOut(e, "HCluster", "combineFile");
 643                 exit(1);
 644         }
 645 }
 646 /***********************************************************************/
 647 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
 648         try {
 649                 seqDist next;
 650                 int indexBefore = index;
 651                 string first, second, distance;
 652                 first = ""; second = ""; distance = "";
 653                 int tabCount = 0;
 654 //cout << "partial = " << partialDist << endl;
 655                 if (partialDist != "") { //read what you can, you know it is less than a whole distance.
 656                         for (int i = 0; i < partialDist.size(); i++) {
 657                                 if (tabCount == 0) {
 658                                         if (partialDist[i] == '\t') { tabCount++; }
 659                                         else {  first += partialDist[i];        }
 660                                 }else if (tabCount == 1) {
 661                                         if (partialDist[i] == '\t') { tabCount++; }
 662                                         else {  second += partialDist[i];       }
 663                                 }else if (tabCount == 2) {
 664                                         distance +=  partialDist[i];
 665                                 }
 666                         }
 667                         partialDist = "";
 668                 }
 669
 670                 //try to get another distance
 671                 bool gotDist = false;
 672                 while (index < size) {
 673                         if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
 674                                 gotDist = true;
 675
 676                                 //gobble space
 677                                 while (index < size) {
 678                                         if (isspace(buffer[index])) { index++; }
 679                                         else { break; }
 680                                 }
 681                                 break;
 682                         }else{
 683                                 if (tabCount == 0) {
 684                                         if (buffer[index] == '\t') { tabCount++; }
 685                                         else {  first += buffer[index]; }
 686                                 }else if (tabCount == 1) {
 687                                         if (buffer[index] == '\t') { tabCount++; }
 688                                         else {  second += buffer[index];        }
 689                                 }else if (tabCount == 2) {
 690                                         distance +=  buffer[index];
 691                                 }
 692                                 index++;
 693                         }
 694                 }
 695
 696                 //there was not a whole distance in the buffer, ie. buffer = "1 2       0.01    2       3       0."
 697                 //then you want to save the partial distance.
 698                 if (!gotDist) {
 699                         for (int i = indexBefore; i < size; i++) {
 700                                 partialDist += buffer[i];
 701                         }
 702                         index = size + 1;
 703                         next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
 704                 }else{
 705                         int firstname, secondname;
 706                         float dist;
 707
 708                         convert(first, firstname);
 709                         convert(second, secondname);
 710                         convert(distance, dist);
 711
 712                         next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
 713                 }
 714
 715                 return next;
 716         }
 717         catch(exception& e) {
 718                 errorOut(e, "HCluster", "getNextDist");
 719                 exit(1);
 720         }
 721 }
 722 /***********************************************************************/
 723 void HCluster::processFile() {
 724         try {
 725                 string firstName, secondName;
 726                 float distance;
 727
 728                 ifstream in;
 729                 openInputFile(distfile, in);
 730
 731                 ofstream out;
 732                 string outTemp = distfile + ".temp";
 733                 openOutputFile(outTemp, out);
 734
 735                 //get entry
 736                 while (!in.eof()) {
 737
 738                         in >> firstName >> secondName >> distance;    gobble(in);
 739
 740                         map<string,int>::iterator itA = nameMap->find(firstName);
 741                         map<string,int>::iterator itB = nameMap->find(secondName);
 742                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
 743                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
 744
 745                         //using cutoff
 746                         if (distance > cutoff) { break; }
 747
 748                         if (distance != -1) { //-1 means skip me
 749                                 out << itA->second << '\t' << itB->second << '\t' << distance << endl;
 750                         }
 751                 }
 752
 753                 in.close();
 754                 out.close();
 755
 756                 remove(distfile.c_str());
 757                 rename(outTemp.c_str(), distfile.c_str());
 758
 759                 firstRead = false;
 760         }
 761         catch(exception& e) {
 762                 errorOut(e, "HCluster", "processFile");
 763                 exit(1);
 764         }
 765 }
 766 /***********************************************************************/
 767
 768
 769
 770
 771
 772
 773
 774