hcluster.cpp

   1 /*
   2  *  hcluster.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 10/13/09.
   6  *  Copyright 2009 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "hcluster.h"
  11 #include "rabundvector.hpp"
  12 #include "listvector.hpp"
  13 #include "sparsematrix.hpp"
  14
  15 /***********************************************************************/
  16 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) {
  17         try {
  18                 m = MothurOut::getInstance();
  19                 mapWanted = false;
  20                 exitedBreak = false;
  21                 numSeqs = list->getNumSeqs();
  22
  23                 //initialize cluster array
  24                 for (int i = 0; i < numSeqs; i++) {
  25                         clusterNode temp(1, -1, i);
  26                         clusterArray.push_back(temp);
  27                 }
  28
  29                 if (method != "average") {
  30                         openInputFile(distfile, filehandle);
  31                 }else{
  32                         processFile();
  33                 }
  34         }
  35         catch(exception& e) {
  36                 m->errorOut(e, "HCluster", "HCluster");
  37                 exit(1);
  38         }
  39 }
  40 /***********************************************************************/
  41
  42 void HCluster::clusterBins(){
  43         try {
  44                 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
  45
  46                 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));
  47                 rabund->set(clusterArray[smallRow].smallChild, 0);
  48                 rabund->setLabel(toString(smallDist));
  49
  50                 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
  51         }
  52         catch(exception& e) {
  53                 m->errorOut(e, "HCluster", "clusterBins");
  54                 exit(1);
  55         }
  56
  57
  58 }
  59
  60 /***********************************************************************/
  61
  62 void HCluster::clusterNames(){
  63         try {
  64                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
  65                 if (mapWanted) {  updateMap();  }
  66
  67                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
  68                 list->set(clusterArray[smallRow].smallChild, "");
  69                 list->setLabel(toString(smallDist));
  70
  71                 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
  72
  73     }
  74         catch(exception& e) {
  75                 m->errorOut(e, "HCluster", "clusterNames");
  76                 exit(1);
  77         }
  78
  79 }
  80 /***********************************************************************/
  81 int HCluster::getUpmostParent(int node){
  82         try {
  83                 while (clusterArray[node].parent != -1) {
  84                         node = clusterArray[node].parent;
  85                 }
  86
  87                 return node;
  88         }
  89         catch(exception& e) {
  90                 m->errorOut(e, "HCluster", "getUpmostParent");
  91                 exit(1);
  92         }
  93 }
  94 /***********************************************************************/
  95 void HCluster::printInfo(){
  96         try {
  97
  98                 cout << "link table" << endl;
  99                 for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
 100                         cout << itActive->first << " = " << itActive->second << endl;
 101                 }
 102                 cout << endl;
 103                 for (int i = 0; i < linkTable.size(); i++) {
 104                         cout << i << '\t';
 105                         for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
 106                                 cout << it->first << '-' << it->second << '\t' ;
 107                         }
 108                         cout << endl;
 109                 }
 110                 cout << endl << "clusterArray" << endl;
 111
 112                 for (int i = 0; i < clusterArray.size(); i++) {
 113                         cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
 114                 }
 115                 cout << endl;
 116
 117
 118         }
 119         catch(exception& e) {
 120                 m->errorOut(e, "HCluster", "getUpmostParent");
 121                 exit(1);
 122         }
 123 }
 124 /***********************************************************************/
 125 int HCluster::makeActive() {
 126         try {
 127                 int linkValue = 1;
 128
 129                 itActive = activeLinks.find(smallRow);
 130                 it2Active = activeLinks.find(smallCol);
 131
 132                 if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
 133                         int size = linkTable.size();
 134                         map<int, int> temp; map<int, int> temp2;
 135
 136                         //add link to eachother
 137                         temp[smallRow] = 1;                                                     //         1    2
 138                         temp2[smallCol] = 1;                                            // 1   0        1
 139                                                                                                                 // 2   1        0
 140                         linkTable.push_back(temp);
 141                         linkTable.push_back(temp2);
 142
 143                         //add to activeLinks
 144                         activeLinks[smallRow] = size;
 145                         activeLinks[smallCol] = size+1;
 146
 147                 }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) {  //smallRow is active, smallCol is not
 148                          int size = linkTable.size();
 149                          int alreadyActiveRow = itActive->second;
 150                          map<int, int> temp;
 151
 152                         //add link to eachother
 153                         temp[smallRow] = 1;                                                                     //         6    2       3       5
 154                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
 155                         linkTable[alreadyActiveRow][smallCol] = 1;                      // 2   1        0       1       1
 156                                                                                                                                 // 3   2        1       0       0
 157                                                                                                                                 // 5   0    1   0   0
 158                         //add to activeLinks
 159                         activeLinks[smallCol] = size;
 160
 161                 }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) {  //smallCol is active, smallRow is not
 162                          int size = linkTable.size();
 163                          int alreadyActiveCol = it2Active->second;
 164                          map<int, int> temp;
 165
 166                         //add link to eachother
 167                         temp[smallCol] = 1;                                                                     //         6    2       3       5
 168                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
 169                         linkTable[alreadyActiveCol][smallRow] = 1;                      // 2   1        0       1       1
 170                                                                                                                                 // 3   2        1       0       0
 171                                                                                                                                 // 5   0    1   0   0
 172                         //add to activeLinks
 173                         activeLinks[smallRow] = size;
 174
 175                 }else { //both are active so add one
 176                         int row = itActive->second;
 177                         int col = it2Active->second;
 178
 179
 180                         linkTable[row][smallCol]++;
 181                         linkTable[col][smallRow]++;
 182                         linkValue = linkTable[row][smallCol];
 183                 }
 184
 185                 return linkValue;
 186         }
 187         catch(exception& e) {
 188                 m->errorOut(e, "HCluster", "makeActive");
 189                 exit(1);
 190         }
 191 }
 192 /***********************************************************************/
 193 void HCluster::updateArrayandLinkTable() {
 194         try {
 195                 //if cluster was made update clusterArray and linkTable
 196                 int size = clusterArray.size();
 197
 198                 //add new node
 199                 clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
 200                 clusterArray.push_back(temp);
 201
 202                 //update child nodes
 203                 clusterArray[smallRow].parent = size;
 204                 clusterArray[smallCol].parent = size;
 205
 206                 if (method == "furthest") {
 207
 208                         //update linkTable by merging clustered rows and columns
 209                         int rowSpot = activeLinks[smallRow];
 210                         int colSpot = activeLinks[smallCol];
 211
 212                         //fix old rows
 213                         for (int i = 0; i < linkTable.size(); i++) {
 214                                 //check if they are in map
 215                                 it = linkTable[i].find(smallRow);
 216                                 it2 = linkTable[i].find(smallCol);
 217
 218                                 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
 219                                         linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
 220                                         linkTable[i].erase(smallCol); //delete col row
 221                                         linkTable[i].erase(smallRow); //delete col row
 222                                 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
 223                                         linkTable[i][size] = linkTable[i][smallCol];
 224                                         linkTable[i].erase(smallCol); //delete col
 225                                 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
 226                                         linkTable[i][size] = linkTable[i][smallRow];
 227                                         linkTable[i].erase(smallRow); //delete col
 228                                 }
 229                         }
 230
 231                         //merge their values
 232                         for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
 233                                 it2 = linkTable[colSpot].find(it->first);  //does the col also have this
 234
 235                                 if (it2 == linkTable[colSpot].end()) { //not there so add it
 236                                         linkTable[colSpot][it->first] = it->second;
 237                                 }else { //merge them
 238                                         linkTable[colSpot][it->first] = it->second + it2->second;
 239                                 }
 240                         }
 241
 242                         linkTable[colSpot].erase(size);
 243                         linkTable.erase(linkTable.begin()+rowSpot);  //delete row
 244
 245                         //update activerows
 246                         activeLinks.erase(smallRow);
 247                         activeLinks.erase(smallCol);
 248                         activeLinks[size] = colSpot;
 249
 250                         //adjust everybody elses spot since you deleted - time vs. space
 251                         for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
 252                                 if (itActive->second > rowSpot) {  activeLinks[itActive->first]--;      }
 253                         }
 254                 }
 255         }
 256         catch(exception& e) {
 257                 m->errorOut(e, "HCluster", "updateArrayandLinkTable");
 258                 exit(1);
 259         }
 260 }
 261 /***********************************************************************/
 262 bool HCluster::update(int row, int col, float distance){
 263         try {
 264                 bool cluster = false;
 265                 smallRow = row;
 266                 smallCol = col;
 267                 smallDist = distance;
 268
 269                 //find upmost parent of row and col
 270                 smallRow = getUpmostParent(smallRow);
 271                 smallCol = getUpmostParent(smallCol);
 272
 273                 //you don't want to cluster with yourself
 274                 if (smallRow != smallCol) {
 275
 276                         if (method != "average") {
 277                                 //can we cluster???
 278                                 if (method == "nearest") { cluster = true;  }
 279                                 else{ //assume furthest
 280                                         //are they active in the link table
 281                                         int linkValue = makeActive(); //after this point this nodes info is active in linkTable
 282                                         if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {             cluster = true;         }
 283                                 }
 284
 285                                 if (cluster) {
 286                                         updateArrayandLinkTable();
 287                                         clusterBins();
 288                                         clusterNames();
 289                                 }
 290                         }else {
 291                                 cluster = true;
 292                                 updateArrayandLinkTable();
 293                                 clusterBins();
 294                                 clusterNames();
 295                                 combineFile();
 296                         }
 297                 }
 298
 299                 return cluster;
 300                 //printInfo();
 301         }
 302         catch(exception& e) {
 303                 m->errorOut(e, "HCluster", "update");
 304                 exit(1);
 305         }
 306 }
 307 /***********************************************************************/
 308 void HCluster::setMapWanted(bool ms)  {
 309         try {
 310                 mapWanted = ms;
 311
 312                 //initialize map
 313                 for (int i = 0; i < list->getNumBins(); i++) {
 314
 315                         //parse bin
 316                         string names = list->get(i);
 317                         while (names.find_first_of(',') != -1) {
 318                                 //get name from bin
 319                                 string name = names.substr(0,names.find_first_of(','));
 320                                 //save name and bin number
 321                                 seq2Bin[name] = i;
 322                                 names = names.substr(names.find_first_of(',')+1, names.length());
 323                         }
 324
 325                         //get last name
 326                         seq2Bin[names] = i;
 327                 }
 328
 329         }
 330         catch(exception& e) {
 331                 m->errorOut(e, "HCluster", "setMapWanted");
 332                 exit(1);
 333         }
 334 }
 335 /***********************************************************************/
 336 void HCluster::updateMap() {
 337 try {
 338                 //update location of seqs in smallRow since they move to smallCol now
 339                 string names = list->get(clusterArray[smallRow].smallChild);
 340                 while (names.find_first_of(',') != -1) {
 341                         //get name from bin
 342                         string name = names.substr(0,names.find_first_of(','));
 343                         //save name and bin number
 344                         seq2Bin[name] = clusterArray[smallCol].smallChild;
 345                         names = names.substr(names.find_first_of(',')+1, names.length());
 346                 }
 347
 348                 //get last name
 349                 seq2Bin[names] = clusterArray[smallCol].smallChild;
 350         }
 351         catch(exception& e) {
 352                 m->errorOut(e, "HCluster", "updateMap");
 353                 exit(1);
 354         }
 355 }
 356 //**********************************************************************************************************************
 357 vector<seqDist> HCluster::getSeqs(){
 358         try {
 359                 vector<seqDist> sameSeqs;
 360
 361                 if(method != "average") {
 362                         sameSeqs = getSeqsFNNN();
 363                 }else{
 364                         sameSeqs = getSeqsAN();
 365                 }
 366
 367                 return sameSeqs;
 368         }
 369         catch(exception& e) {
 370                 m->errorOut(e, "HCluster", "getSeqs");
 371                 exit(1);
 372         }
 373 }
 374 //**********************************************************************************************************************
 375 vector<seqDist> HCluster::getSeqsFNNN(){
 376         try {
 377                 string firstName, secondName;
 378                 float distance, prevDistance;
 379                 vector<seqDist> sameSeqs;
 380                 prevDistance = -1;
 381
 382                 //if you are not at the beginning of the file
 383                 if (exitedBreak) {
 384                         sameSeqs.push_back(next);
 385                         prevDistance = next.dist;
 386                         exitedBreak = false;
 387                 }
 388
 389                 //get entry
 390                 while (!filehandle.eof()) {
 391
 392                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
 393
 394                         //save first one
 395                         if (prevDistance == -1) { prevDistance = distance; }
 396
 397                         map<string,int>::iterator itA = nameMap->find(firstName);
 398                         map<string,int>::iterator itB = nameMap->find(secondName);
 399                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
 400                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
 401
 402                         //using cutoff
 403                         if (distance > cutoff) { break; }
 404
 405                         if (distance != -1) { //-1 means skip me
 406
 407                                 //are the distances the same
 408                                 if (distance == prevDistance) { //save in vector
 409                                         seqDist temp(itA->second, itB->second, distance);
 410                                         sameSeqs.push_back(temp);
 411                                         exitedBreak = false;
 412                                 }else{
 413                                         next.seq1 = itA->second;
 414                                         next.seq2 = itB->second;
 415                                         next.dist = distance;
 416                                         exitedBreak = true;
 417                                         break;
 418                                 }
 419                         }
 420                 }
 421
 422                 //rndomize matching dists
 423                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
 424
 425                 return sameSeqs;
 426         }
 427         catch(exception& e) {
 428                 m->errorOut(e, "HCluster", "getSeqsFNNN");
 429                 exit(1);
 430         }
 431 }
 432 //**********************************************************************************************************************
 433 //don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
 434 vector<seqDist> HCluster::getSeqsAN(){
 435         try {
 436                 int firstName, secondName;
 437                 float prevDistance;
 438                 vector<seqDist> sameSeqs;
 439                 prevDistance = -1;
 440
 441                 openInputFile(distfile, filehandle, "no error");
 442
 443                 //is the smallest value in mergedMin or the distfile?
 444                 float mergedMinDist = 10000;
 445                 float distance = 10000;
 446                 if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist;  }
 447
 448                 if (!filehandle.eof()) {
 449                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
 450                         //save first one
 451                         if (prevDistance == -1) { prevDistance = distance; }
 452                         if (distance != -1) { //-1 means skip me
 453                                 seqDist temp(firstName, secondName, distance);
 454                                 sameSeqs.push_back(temp);
 455                         }else{ distance = 10000; }
 456                 }
 457
 458                 if (mergedMinDist < distance) { //get minimum distance from mergedMin
 459                         //remove distance we saved from file
 460                         sameSeqs.clear();
 461                         prevDistance = mergedMinDist;
 462
 463                         for (int i = 0; i < mergedMin.size(); i++) {
 464                                 if (mergedMin[i].dist == prevDistance) {
 465                                         sameSeqs.push_back(mergedMin[i]);
 466                                 }else { break; }
 467                         }
 468                 }else{ //get minimum from file
 469                         //get entry
 470                         while (!filehandle.eof()) {
 471
 472                                 filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
 473
 474                                 if (prevDistance == -1) { prevDistance = distance; }
 475
 476                                 if (distance != -1) { //-1 means skip me
 477                                         //are the distances the same
 478                                         if (distance == prevDistance) { //save in vector
 479                                                 seqDist temp(firstName, secondName, distance);
 480                                                 sameSeqs.push_back(temp);
 481                                         }else{
 482                                                 break;
 483                                         }
 484                                 }
 485                         }
 486                 }
 487                 filehandle.close();
 488
 489                 //randomize matching dists
 490                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
 491
 492                 //can only return one value since once these are merged the other distances in sameSeqs may have changed
 493                 vector<seqDist> temp;
 494                 if (sameSeqs.size() > 0) {  temp.push_back(sameSeqs[0]);  }
 495
 496                 return temp;
 497         }
 498         catch(exception& e) {
 499                 m->errorOut(e, "HCluster", "getSeqsAN");
 500                 exit(1);
 501         }
 502 }
 503
 504 /***********************************************************************/
 505 int HCluster::combineFile() {
 506         try {
 507                 //int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
 508                 //char* inputBuffer;
 509                 //inputBuffer = new char[bufferSize];
 510                 //size_t numRead;
 511
 512                 string tempDistFile = distfile + ".temp";
 513                 ofstream out;
 514                 openOutputFile(tempDistFile, out);
 515
 516                 //FILE* in;
 517                 //in = fopen(distfile.c_str(), "rb");
 518
 519                 ifstream in;
 520                 openInputFile(distfile, in);
 521
 522                 int first, second;
 523                 float dist;
 524
 525                 vector< map<int, float> > smallRowColValues;
 526                 smallRowColValues.resize(2);  //0 = row, 1 = col
 527                 int count = 0;
 528
 529                 //go through file pulling out distances related to rows merging
 530                 //if mergedMin contains distances add those back into file
 531                 //bool done = false;
 532                 //partialDist = "";
 533                 //while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
 534 //cout << "number of char read = " << numRead << endl;
 535 //cout << inputBuffer << endl;
 536                         //if (numRead < bufferSize) { done = true; }
 537
 538                         //parse input into individual distances
 539                         //int spot = 0;
 540                         //string outputString = "";
 541                         //while(spot < numRead) {
 542         //cout << "spot = " << spot << endl;
 543                          //  seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
 544
 545                            //you read a partial distance
 546                           // if (nextDist.seq1 == -1) { break;  }
 547                         while (!in.eof()) {
 548                                 //first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
 549         //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;
 550                            //since file is sorted and mergedMin is sorted
 551                            //you can put the smallest distance from each through the code below and keep the file sorted
 552
 553                            in >> first >> second >> dist; gobble(in);
 554
 555                            if (m->control_pressed) { in.close(); out.close(); remove(tempDistFile.c_str()); return 0; }
 556
 557                            //while there are still values in mergedMin that are smaller than the distance read from file
 558                            while (count < mergedMin.size())  {
 559
 560                                         //is the distance in mergedMin smaller than from the file
 561                                         if (mergedMin[count].dist < dist) {
 562                                         //is this a distance related to the columns merging?
 563                                         //if yes, save in memory
 564                                                 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
 565                                                 }else if (mergedMin[count].seq1 == smallCol) {
 566                                                         smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
 567                                                 }else if (mergedMin[count].seq2 == smallCol) {
 568                                                         smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
 569                                                 }else if (mergedMin[count].seq1 == smallRow) {
 570                                                         smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
 571                                                 }else if (mergedMin[count].seq2 == smallRow) {
 572                                                         smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
 573                                                 }else { //if no, write to temp file
 574                                                         //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
 575                                                         out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
 576                                                 }
 577                                                 count++;
 578                                         }else{   break; }
 579                            }
 580
 581                            //is this a distance related to the columns merging?
 582                            //if yes, save in memory
 583                            if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
 584                            }else if (first == smallCol) {
 585                                         smallRowColValues[1][second] = dist;
 586                            }else if (second == smallCol) {
 587                                         smallRowColValues[1][first] = dist;
 588                            }else if (first == smallRow) {
 589                                         smallRowColValues[0][second] = dist;
 590                            }else if (second == smallRow) {
 591                                         smallRowColValues[0][first] = dist;
 592
 593                            }else { //if no, write to temp file
 594                                         //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
 595                                         out << first << '\t' << second << '\t' << dist << endl;
 596                            }
 597                         }
 598
 599                         //out << outputString;
 600                         //if(done) { break; }
 601                 //}
 602                 //fclose(in);
 603                 in.close();
 604
 605                 //if values in mergedMin are larger than the the largest in file then
 606                 while (count < mergedMin.size())  {
 607                         //is this a distance related to the columns merging?
 608                         //if yes, save in memory
 609                         if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
 610                         }else if (mergedMin[count].seq1 == smallCol) {
 611                                 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
 612                         }else if (mergedMin[count].seq2 == smallCol) {
 613                                 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
 614                         }else if (mergedMin[count].seq1 == smallRow) {
 615                                 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
 616                         }else if (mergedMin[count].seq2 == smallRow) {
 617                                 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
 618
 619                         }else { //if no, write to temp file
 620                                 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
 621                         }
 622                         count++;
 623                 }
 624                 out.close();
 625                 mergedMin.clear();
 626
 627                 //rename tempfile to distfile
 628                 remove(distfile.c_str());
 629                 rename(tempDistFile.c_str(), distfile.c_str());
 630 //cout << "remove = "<< renameOK << " rename = " << ok << endl;
 631
 632                 //merge clustered rows averaging the distances
 633                 map<int, float>::iterator itMerge;
 634                 map<int, float>::iterator it2Merge;
 635                 for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {
 636                         //does smallRowColValues[1] have a distance to this seq too?
 637                         it2Merge = smallRowColValues[1].find(itMerge->first);
 638
 639                         float average;
 640                         if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
 641                                 //weighted average
 642                                 int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
 643                                 average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
 644                                 smallRowColValues[1].erase(it2Merge);
 645
 646                                 seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
 647                                 mergedMin.push_back(temp);
 648                         }
 649                 }
 650
 651                 //sort merged values
 652                 sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);
 653
 654                 return 0;
 655         }
 656         catch(exception& e) {
 657                 m->errorOut(e, "HCluster", "combineFile");
 658                 exit(1);
 659         }
 660 }
 661 /***********************************************************************
 662 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
 663         try {
 664                 seqDist next;
 665                 int indexBefore = index;
 666                 string first, second, distance;
 667                 first = ""; second = ""; distance = "";
 668                 int tabCount = 0;
 669 //cout << "partial = " << partialDist << endl;
 670                 if (partialDist != "") { //read what you can, you know it is less than a whole distance.
 671                         for (int i = 0; i < partialDist.size(); i++) {
 672                                 if (tabCount == 0) {
 673                                         if (partialDist[i] == '\t') { tabCount++; }
 674                                         else {  first += partialDist[i];        }
 675                                 }else if (tabCount == 1) {
 676                                         if (partialDist[i] == '\t') { tabCount++; }
 677                                         else {  second += partialDist[i];       }
 678                                 }else if (tabCount == 2) {
 679                                         distance +=  partialDist[i];
 680                                 }
 681                         }
 682                         partialDist = "";
 683                 }
 684
 685                 //try to get another distance
 686                 bool gotDist = false;
 687                 while (index < size) {
 688                         if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
 689                                 gotDist = true;
 690
 691                                 //gobble space
 692                                 while (index < size) {
 693                                         if (isspace(buffer[index])) { index++; }
 694                                         else { break; }
 695                                 }
 696                                 break;
 697                         }else{
 698                                 if (tabCount == 0) {
 699                                         if (buffer[index] == '\t') { tabCount++; }
 700                                         else {  first += buffer[index]; }
 701                                 }else if (tabCount == 1) {
 702                                         if (buffer[index] == '\t') { tabCount++; }
 703                                         else {  second += buffer[index];        }
 704                                 }else if (tabCount == 2) {
 705                                         distance +=  buffer[index];
 706                                 }
 707                                 index++;
 708                         }
 709                 }
 710
 711                 //there was not a whole distance in the buffer, ie. buffer = "1 2       0.01    2       3       0."
 712                 //then you want to save the partial distance.
 713                 if (!gotDist) {
 714                         for (int i = indexBefore; i < size; i++) {
 715                                 partialDist += buffer[i];
 716                         }
 717                         index = size + 1;
 718                         next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
 719                 }else{
 720                         int firstname, secondname;
 721                         float dist;
 722
 723                         convert(first, firstname);
 724                         convert(second, secondname);
 725                         convert(distance, dist);
 726
 727                         next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
 728                 }
 729
 730                 return next;
 731         }
 732         catch(exception& e) {
 733                 m->errorOut(e, "HCluster", "getNextDist");
 734                 exit(1);
 735         }
 736 }
 737 /***********************************************************************/
 738 int HCluster::processFile() {
 739         try {
 740                 string firstName, secondName;
 741                 float distance;
 742
 743                 ifstream in;
 744                 openInputFile(distfile, in);
 745
 746                 ofstream out;
 747                 string outTemp = distfile + ".temp";
 748                 openOutputFile(outTemp, out);
 749
 750                 //get entry
 751                 while (!in.eof()) {
 752                         if (m->control_pressed) { in.close(); out.close(); remove(outTemp.c_str()); return 0; }
 753
 754                         in >> firstName >> secondName >> distance;    gobble(in);
 755
 756                         map<string,int>::iterator itA = nameMap->find(firstName);
 757                         map<string,int>::iterator itB = nameMap->find(secondName);
 758                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
 759                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
 760
 761                         //using cutoff
 762                         if (distance > cutoff) { break; }
 763
 764                         if (distance != -1) { //-1 means skip me
 765                                 out << itA->second << '\t' << itB->second << '\t' << distance << endl;
 766                         }
 767                 }
 768
 769                 in.close();
 770                 out.close();
 771
 772                 remove(distfile.c_str());
 773                 rename(outTemp.c_str(), distfile.c_str());
 774
 775                 return 0;
 776         }
 777         catch(exception& e) {
 778                 m->errorOut(e, "HCluster", "processFile");
 779                 exit(1);
 780         }
 781 }
 782 /***********************************************************************/
 783
 784
 785
 786
 787
 788
 789
 790