hcluster.cpp

   1 /*
   2  *  hcluster.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 10/13/09.
   6  *  Copyright 2009 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "hcluster.h"
  11 #include "rabundvector.hpp"
  12 #include "listvector.hpp"
  13 #include "sparsematrix.hpp"
  14
  15 /***********************************************************************/
  16 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) {
  17         try {
  18                 m = MothurOut::getInstance();
  19                 mapWanted = false;
  20                 exitedBreak = false;
  21                 numSeqs = list->getNumSeqs();
  22
  23                 //initialize cluster array
  24                 for (int i = 0; i < numSeqs; i++) {
  25                         clusterNode temp(1, -1, i);
  26                         clusterArray.push_back(temp);
  27                 }
  28
  29                 if ((method == "furthest") || (method == "nearest")) {
  30                         m->openInputFile(distfile, filehandle);
  31                 }else{
  32                         processFile();
  33                 }
  34         }
  35         catch(exception& e) {
  36                 m->errorOut(e, "HCluster", "HCluster");
  37                 exit(1);
  38         }
  39 }
  40 /***********************************************************************/
  41
  42 void HCluster::clusterBins(){
  43         try {
  44                 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
  45
  46                 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));
  47                 rabund->set(clusterArray[smallRow].smallChild, 0);
  48                 rabund->setLabel(toString(smallDist));
  49
  50                 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
  51         }
  52         catch(exception& e) {
  53                 m->errorOut(e, "HCluster", "clusterBins");
  54                 exit(1);
  55         }
  56
  57
  58 }
  59
  60 /***********************************************************************/
  61
  62 void HCluster::clusterNames(){
  63         try {
  64                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
  65                 if (mapWanted) {  updateMap();  }
  66
  67                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
  68                 list->set(clusterArray[smallRow].smallChild, "");
  69                 list->setLabel(toString(smallDist));
  70
  71                 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
  72
  73     }
  74         catch(exception& e) {
  75                 m->errorOut(e, "HCluster", "clusterNames");
  76                 exit(1);
  77         }
  78
  79 }
  80 /***********************************************************************/
  81 int HCluster::getUpmostParent(int node){
  82         try {
  83                 while (clusterArray[node].parent != -1) {
  84                         node = clusterArray[node].parent;
  85                 }
  86
  87                 return node;
  88         }
  89         catch(exception& e) {
  90                 m->errorOut(e, "HCluster", "getUpmostParent");
  91                 exit(1);
  92         }
  93 }
  94 /***********************************************************************/
  95 void HCluster::printInfo(){
  96         try {
  97
  98                 cout << "link table" << endl;
  99                 for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
 100                         cout << itActive->first << " = " << itActive->second << endl;
 101                 }
 102                 cout << endl;
 103                 for (int i = 0; i < linkTable.size(); i++) {
 104                         cout << i << '\t';
 105                         for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
 106                                 cout << it->first << '-' << it->second << '\t' ;
 107                         }
 108                         cout << endl;
 109                 }
 110                 cout << endl << "clusterArray" << endl;
 111
 112                 for (int i = 0; i < clusterArray.size(); i++) {
 113                         cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
 114                 }
 115                 cout << endl;
 116
 117
 118         }
 119         catch(exception& e) {
 120                 m->errorOut(e, "HCluster", "getUpmostParent");
 121                 exit(1);
 122         }
 123 }
 124 /***********************************************************************/
 125 int HCluster::makeActive() {
 126         try {
 127                 int linkValue = 1;
 128
 129                 itActive = activeLinks.find(smallRow);
 130                 it2Active = activeLinks.find(smallCol);
 131
 132                 if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
 133                         int size = linkTable.size();
 134                         map<int, int> temp; map<int, int> temp2;
 135
 136                         //add link to eachother
 137                         temp[smallRow] = 1;                                                     //         1    2
 138                         temp2[smallCol] = 1;                                            // 1   0        1
 139                                                                                                                 // 2   1        0
 140                         linkTable.push_back(temp);
 141                         linkTable.push_back(temp2);
 142
 143                         //add to activeLinks
 144                         activeLinks[smallRow] = size;
 145                         activeLinks[smallCol] = size+1;
 146
 147                 }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) {  //smallRow is active, smallCol is not
 148                          int size = linkTable.size();
 149                          int alreadyActiveRow = itActive->second;
 150                          map<int, int> temp;
 151
 152                         //add link to eachother
 153                         temp[smallRow] = 1;                                                                     //         6    2       3       5
 154                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
 155                         linkTable[alreadyActiveRow][smallCol] = 1;                      // 2   1        0       1       1
 156                                                                                                                                 // 3   2        1       0       0
 157                                                                                                                                 // 5   0    1   0   0
 158                         //add to activeLinks
 159                         activeLinks[smallCol] = size;
 160
 161                 }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) {  //smallCol is active, smallRow is not
 162                          int size = linkTable.size();
 163                          int alreadyActiveCol = it2Active->second;
 164                          map<int, int> temp;
 165
 166                         //add link to eachother
 167                         temp[smallCol] = 1;                                                                     //         6    2       3       5
 168                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
 169                         linkTable[alreadyActiveCol][smallRow] = 1;                      // 2   1        0       1       1
 170                                                                                                                                 // 3   2        1       0       0
 171                                                                                                                                 // 5   0    1   0   0
 172                         //add to activeLinks
 173                         activeLinks[smallRow] = size;
 174
 175                 }else { //both are active so add one
 176                         int row = itActive->second;
 177                         int col = it2Active->second;
 178
 179
 180                         linkTable[row][smallCol]++;
 181                         linkTable[col][smallRow]++;
 182                         linkValue = linkTable[row][smallCol];
 183                 }
 184
 185                 return linkValue;
 186         }
 187         catch(exception& e) {
 188                 m->errorOut(e, "HCluster", "makeActive");
 189                 exit(1);
 190         }
 191 }
 192 /***********************************************************************/
 193 void HCluster::updateArrayandLinkTable() {
 194         try {
 195                 //if cluster was made update clusterArray and linkTable
 196                 int size = clusterArray.size();
 197
 198                 //add new node
 199                 clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
 200                 clusterArray.push_back(temp);
 201
 202                 //update child nodes
 203                 clusterArray[smallRow].parent = size;
 204                 clusterArray[smallCol].parent = size;
 205
 206                 if (method == "furthest") {
 207
 208                         //update linkTable by merging clustered rows and columns
 209                         int rowSpot = activeLinks[smallRow];
 210                         int colSpot = activeLinks[smallCol];
 211
 212                         //fix old rows
 213                         for (int i = 0; i < linkTable.size(); i++) {
 214                                 //check if they are in map
 215                                 it = linkTable[i].find(smallRow);
 216                                 it2 = linkTable[i].find(smallCol);
 217
 218                                 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
 219                                         linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
 220                                         linkTable[i].erase(smallCol); //delete col row
 221                                         linkTable[i].erase(smallRow); //delete col row
 222                                 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
 223                                         linkTable[i][size] = linkTable[i][smallCol];
 224                                         linkTable[i].erase(smallCol); //delete col
 225                                 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
 226                                         linkTable[i][size] = linkTable[i][smallRow];
 227                                         linkTable[i].erase(smallRow); //delete col
 228                                 }
 229                         }
 230
 231                         //merge their values
 232                         for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
 233                                 it2 = linkTable[colSpot].find(it->first);  //does the col also have this
 234
 235                                 if (it2 == linkTable[colSpot].end()) { //not there so add it
 236                                         linkTable[colSpot][it->first] = it->second;
 237                                 }else { //merge them
 238                                         linkTable[colSpot][it->first] = it->second + it2->second;
 239                                 }
 240                         }
 241
 242                         linkTable[colSpot].erase(size);
 243                         linkTable.erase(linkTable.begin()+rowSpot);  //delete row
 244
 245                         //update activerows
 246                         activeLinks.erase(smallRow);
 247                         activeLinks.erase(smallCol);
 248                         activeLinks[size] = colSpot;
 249
 250                         //adjust everybody elses spot since you deleted - time vs. space
 251                         for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
 252                                 if (itActive->second > rowSpot) {  activeLinks[itActive->first]--;      }
 253                         }
 254                 }
 255         }
 256         catch(exception& e) {
 257                 m->errorOut(e, "HCluster", "updateArrayandLinkTable");
 258                 exit(1);
 259         }
 260 }
 261 /***********************************************************************/
 262 double HCluster::update(int row, int col, float distance){
 263         try {
 264                 bool cluster = false;
 265                 smallRow = row;
 266                 smallCol = col;
 267                 smallDist = distance;
 268
 269                 //find upmost parent of row and col
 270                 smallRow = getUpmostParent(smallRow);
 271                 smallCol = getUpmostParent(smallCol);
 272
 273                 //you don't want to cluster with yourself
 274                 if (smallRow != smallCol) {
 275
 276                         if ((method == "furthest") || (method == "nearest")) {
 277                                 //can we cluster???
 278                                 if (method == "nearest") { cluster = true;  }
 279                                 else{ //assume furthest
 280                                         //are they active in the link table
 281                                         int linkValue = makeActive(); //after this point this nodes info is active in linkTable
 282                                         if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {             cluster = true;         }
 283                                 }
 284
 285                                 if (cluster) {
 286                                         updateArrayandLinkTable();
 287                                         clusterBins();
 288                                         clusterNames();
 289                                 }
 290                         }else {
 291                                 cluster = true;
 292                                 updateArrayandLinkTable();
 293                                 clusterBins();
 294                                 clusterNames();
 295                                 combineFile();
 296                         }
 297                 }
 298
 299                 return cutoff;
 300                 //printInfo();
 301         }
 302         catch(exception& e) {
 303                 m->errorOut(e, "HCluster", "update");
 304                 exit(1);
 305         }
 306 }
 307 /***********************************************************************/
 308 void HCluster::setMapWanted(bool ms)  {
 309         try {
 310                 mapWanted = ms;
 311
 312                 //initialize map
 313                 for (int i = 0; i < list->getNumBins(); i++) {
 314
 315                         //parse bin
 316                         string names = list->get(i);
 317                         while (names.find_first_of(',') != -1) {
 318                                 //get name from bin
 319                                 string name = names.substr(0,names.find_first_of(','));
 320                                 //save name and bin number
 321                                 seq2Bin[name] = i;
 322                                 names = names.substr(names.find_first_of(',')+1, names.length());
 323                         }
 324
 325                         //get last name
 326                         seq2Bin[names] = i;
 327                 }
 328
 329         }
 330         catch(exception& e) {
 331                 m->errorOut(e, "HCluster", "setMapWanted");
 332                 exit(1);
 333         }
 334 }
 335 /***********************************************************************/
 336 void HCluster::updateMap() {
 337 try {
 338                 //update location of seqs in smallRow since they move to smallCol now
 339                 string names = list->get(clusterArray[smallRow].smallChild);
 340                 while (names.find_first_of(',') != -1) {
 341                         //get name from bin
 342                         string name = names.substr(0,names.find_first_of(','));
 343                         //save name and bin number
 344                         seq2Bin[name] = clusterArray[smallCol].smallChild;
 345                         names = names.substr(names.find_first_of(',')+1, names.length());
 346                 }
 347
 348                 //get last name
 349                 seq2Bin[names] = clusterArray[smallCol].smallChild;
 350         }
 351         catch(exception& e) {
 352                 m->errorOut(e, "HCluster", "updateMap");
 353                 exit(1);
 354         }
 355 }
 356 //**********************************************************************************************************************
 357 vector<seqDist> HCluster::getSeqs(){
 358         try {
 359                 vector<seqDist> sameSeqs;
 360
 361                 if ((method == "furthest") || (method == "nearest")) {
 362                         sameSeqs = getSeqsFNNN();
 363                 }else{
 364                         sameSeqs = getSeqsAN();
 365                 }
 366
 367                 return sameSeqs;
 368         }
 369         catch(exception& e) {
 370                 m->errorOut(e, "HCluster", "getSeqs");
 371                 exit(1);
 372         }
 373 }
 374 //**********************************************************************************************************************
 375 vector<seqDist> HCluster::getSeqsFNNN(){
 376         try {
 377                 string firstName, secondName;
 378                 float distance, prevDistance;
 379                 vector<seqDist> sameSeqs;
 380                 prevDistance = -1;
 381
 382                 //if you are not at the beginning of the file
 383                 if (exitedBreak) {
 384                         sameSeqs.push_back(next);
 385                         prevDistance = next.dist;
 386                         exitedBreak = false;
 387                 }
 388
 389                 //get entry
 390                 while (!filehandle.eof()) {
 391
 392                         filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle);
 393
 394                         //save first one
 395                         if (prevDistance == -1) { prevDistance = distance; }
 396
 397                         map<string,int>::iterator itA = nameMap->find(firstName);
 398                         map<string,int>::iterator itB = nameMap->find(secondName);
 399                         if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
 400                         if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
 401
 402                         //using cutoff
 403                         if (distance > cutoff) { break; }
 404
 405                         if (distance != -1) { //-1 means skip me
 406
 407                                 //are the distances the same
 408                                 if (distance == prevDistance) { //save in vector
 409                                         seqDist temp(itA->second, itB->second, distance);
 410                                         sameSeqs.push_back(temp);
 411                                         exitedBreak = false;
 412                                 }else{
 413                                         next.seq1 = itA->second;
 414                                         next.seq2 = itB->second;
 415                                         next.dist = distance;
 416                                         exitedBreak = true;
 417                                         break;
 418                                 }
 419                         }
 420                 }
 421
 422                 //rndomize matching dists
 423                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
 424
 425                 return sameSeqs;
 426         }
 427         catch(exception& e) {
 428                 m->errorOut(e, "HCluster", "getSeqsFNNN");
 429                 exit(1);
 430         }
 431 }
 432 //**********************************************************************************************************************
 433 vector<seqDist> HCluster::getSeqsAN(){
 434         try {
 435                 int firstName, secondName;
 436                 float prevDistance;
 437                 vector<seqDist> sameSeqs;
 438                 prevDistance = -1;
 439
 440                 m->openInputFile(distfile, filehandle, "no error");
 441
 442                 //is the smallest value in mergedMin or the distfile?
 443                 float mergedMinDist = 10000;
 444                 float distance = 10000;
 445                 if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist;  }
 446
 447                 if (!filehandle.eof()) {
 448                         filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle);
 449                         //save first one
 450                         if (prevDistance == -1) { prevDistance = distance; }
 451                         if (distance != -1) { //-1 means skip me
 452                                 seqDist temp(firstName, secondName, distance);
 453                                 sameSeqs.push_back(temp);
 454                         }else{ distance = 10000; }
 455                 }
 456
 457                 if (mergedMinDist < distance) { //get minimum distance from mergedMin
 458                         //remove distance we saved from file
 459                         sameSeqs.clear();
 460                         prevDistance = mergedMinDist;
 461
 462                         for (int i = 0; i < mergedMin.size(); i++) {
 463                                 if (mergedMin[i].dist == prevDistance) {
 464                                         sameSeqs.push_back(mergedMin[i]);
 465                                 }else { break; }
 466                         }
 467                 }else{ //get minimum from file
 468                         //get entry
 469                         while (!filehandle.eof()) {
 470
 471                                 filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle);
 472
 473                                 if (prevDistance == -1) { prevDistance = distance; }
 474
 475                                 if (distance != -1) { //-1 means skip me
 476                                         //are the distances the same
 477                                         if (distance == prevDistance) { //save in vector
 478                                                 seqDist temp(firstName, secondName, distance);
 479                                                 sameSeqs.push_back(temp);
 480                                         }else{
 481                                                 break;
 482                                         }
 483                                 }
 484                         }
 485                 }
 486                 filehandle.close();
 487
 488                 //randomize matching dists
 489                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
 490
 491                 //can only return one value since once these are merged the other distances in sameSeqs may have changed
 492                 vector<seqDist> temp;
 493                 if (sameSeqs.size() > 0) {  temp.push_back(sameSeqs[0]);  }
 494
 495                 return temp;
 496         }
 497         catch(exception& e) {
 498                 m->errorOut(e, "HCluster", "getSeqsAN");
 499                 exit(1);
 500         }
 501 }
 502
 503 /***********************************************************************/
 504 int HCluster::combineFile() {
 505         try {
 506                 //int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
 507                 //char* inputBuffer;
 508                 //inputBuffer = new char[bufferSize];
 509                 //size_t numRead;
 510
 511                 string tempDistFile = distfile + ".temp";
 512                 ofstream out;
 513                 m->openOutputFile(tempDistFile, out);
 514
 515                 //FILE* in;
 516                 //in = fopen(distfile.c_str(), "rb");
 517
 518                 ifstream in;
 519                 m->openInputFile(distfile, in, "no error");
 520
 521                 int first, second;
 522                 float dist;
 523
 524                 vector< map<int, float> > smallRowColValues;
 525                 smallRowColValues.resize(2);  //0 = row, 1 = col
 526                 int count = 0;
 527
 528                 //go through file pulling out distances related to rows merging
 529                 //if mergedMin contains distances add those back into file
 530                 //bool done = false;
 531                 //partialDist = "";
 532                 //while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
 533 //cout << "number of char read = " << numRead << endl;
 534 //cout << inputBuffer << endl;
 535                         //if (numRead < bufferSize) { done = true; }
 536
 537                         //parse input into individual distances
 538                         //int spot = 0;
 539                         //string outputString = "";
 540                         //while(spot < numRead) {
 541         //cout << "spot = " << spot << endl;
 542                          //  seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
 543
 544                            //you read a partial distance
 545                           // if (nextDist.seq1 == -1) { break;  }
 546                         while (!in.eof()) {
 547                                 //first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
 548         //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;
 549                            //since file is sorted and mergedMin is sorted
 550                            //you can put the smallest distance from each through the code below and keep the file sorted
 551
 552                            in >> first >> second >> dist; m->gobble(in);
 553
 554                            if (m->control_pressed) { in.close(); out.close(); remove(tempDistFile.c_str()); return 0; }
 555
 556                            //while there are still values in mergedMin that are smaller than the distance read from file
 557                            while (count < mergedMin.size())  {
 558
 559                                         //is the distance in mergedMin smaller than from the file
 560                                         if (mergedMin[count].dist < dist) {
 561                                         //is this a distance related to the columns merging?
 562                                         //if yes, save in memory
 563                                                 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
 564                                                 }else if (mergedMin[count].seq1 == smallCol) {
 565                                                         smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
 566                                                 }else if (mergedMin[count].seq2 == smallCol) {
 567                                                         smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
 568                                                 }else if (mergedMin[count].seq1 == smallRow) {
 569                                                         smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
 570                                                 }else if (mergedMin[count].seq2 == smallRow) {
 571                                                         smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
 572                                                 }else { //if no, write to temp file
 573                                                         //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
 574                                                         //if (mergedMin[count].dist < cutoff) {
 575                                                                 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
 576                                                         //}
 577                                                 }
 578                                                 count++;
 579                                         }else{   break; }
 580                            }
 581
 582                            //is this a distance related to the columns merging?
 583                            //if yes, save in memory
 584                            if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
 585                            }else if (first == smallCol) {
 586                                         smallRowColValues[1][second] = dist;
 587                            }else if (second == smallCol) {
 588                                         smallRowColValues[1][first] = dist;
 589                            }else if (first == smallRow) {
 590                                         smallRowColValues[0][second] = dist;
 591                            }else if (second == smallRow) {
 592                                         smallRowColValues[0][first] = dist;
 593
 594                            }else { //if no, write to temp file
 595                                         //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
 596                                    //if (dist < cutoff) {
 597                                            out << first << '\t' << second << '\t' << dist << endl;
 598                                    //}
 599                            }
 600                         }
 601
 602                         //out << outputString;
 603                         //if(done) { break; }
 604                 //}
 605                 //fclose(in);
 606                 in.close();
 607
 608                 //if values in mergedMin are larger than the the largest in file then
 609                 while (count < mergedMin.size())  {
 610                         //is this a distance related to the columns merging?
 611                         //if yes, save in memory
 612                         if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
 613                         }else if (mergedMin[count].seq1 == smallCol) {
 614                                 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
 615                         }else if (mergedMin[count].seq2 == smallCol) {
 616                                 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
 617                         }else if (mergedMin[count].seq1 == smallRow) {
 618                                 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
 619                         }else if (mergedMin[count].seq2 == smallRow) {
 620                                 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
 621
 622                         }else { //if no, write to temp file
 623                                 //if (mergedMin[count].dist < cutoff) {
 624                                         out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
 625                                 //}
 626                         }
 627                         count++;
 628                 }
 629                 out.close();
 630                 mergedMin.clear();
 631
 632                 //rename tempfile to distfile
 633                 remove(distfile.c_str());
 634                 rename(tempDistFile.c_str(), distfile.c_str());
 635 //cout << "remove = "<< renameOK << " rename = " << ok << endl;
 636
 637                 //merge clustered rows averaging the distances
 638                 map<int, float>::iterator itMerge;
 639                 map<int, float>::iterator it2Merge;
 640                 for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {
 641                         //does smallRowColValues[1] have a distance to this seq too?
 642                         it2Merge = smallRowColValues[1].find(itMerge->first);
 643
 644                         float average;
 645                         if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
 646                                 //average
 647                                 if (method == "average") {
 648                                         int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
 649                                         average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
 650                                 }else { //weighted
 651                                         average = ((itMerge->second * 1.0) + (it2Merge->second * 1.0)) / (float) 2.0;
 652                                 }
 653
 654                                 smallRowColValues[1].erase(it2Merge);
 655
 656                                 seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
 657                                 mergedMin.push_back(temp);
 658                         }else {
 659                                 //can't find value so update cutoff
 660                                 if (cutoff > itMerge->second) { cutoff = itMerge->second; }
 661                         }
 662                 }
 663
 664                 //update cutoff
 665                 for(itMerge = smallRowColValues[1].begin(); itMerge != smallRowColValues[1].end(); itMerge++) {
 666                         if (cutoff > itMerge->second) { cutoff = itMerge->second; }
 667                 }
 668
 669                 //sort merged values
 670                 sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);
 671
 672                 return 0;
 673         }
 674         catch(exception& e) {
 675                 m->errorOut(e, "HCluster", "combineFile");
 676                 exit(1);
 677         }
 678 }
 679 /***********************************************************************
 680 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
 681         try {
 682                 seqDist next;
 683                 int indexBefore = index;
 684                 string first, second, distance;
 685                 first = ""; second = ""; distance = "";
 686                 int tabCount = 0;
 687 //cout << "partial = " << partialDist << endl;
 688                 if (partialDist != "") { //read what you can, you know it is less than a whole distance.
 689                         for (int i = 0; i < partialDist.size(); i++) {
 690                                 if (tabCount == 0) {
 691                                         if (partialDist[i] == '\t') { tabCount++; }
 692                                         else {  first += partialDist[i];        }
 693                                 }else if (tabCount == 1) {
 694                                         if (partialDist[i] == '\t') { tabCount++; }
 695                                         else {  second += partialDist[i];       }
 696                                 }else if (tabCount == 2) {
 697                                         distance +=  partialDist[i];
 698                                 }
 699                         }
 700                         partialDist = "";
 701                 }
 702
 703                 //try to get another distance
 704                 bool gotDist = false;
 705                 while (index < size) {
 706                         if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
 707                                 gotDist = true;
 708
 709                                 //m->gobble space
 710                                 while (index < size) {
 711                                         if (isspace(buffer[index])) { index++; }
 712                                         else { break; }
 713                                 }
 714                                 break;
 715                         }else{
 716                                 if (tabCount == 0) {
 717                                         if (buffer[index] == '\t') { tabCount++; }
 718                                         else {  first += buffer[index]; }
 719                                 }else if (tabCount == 1) {
 720                                         if (buffer[index] == '\t') { tabCount++; }
 721                                         else {  second += buffer[index];        }
 722                                 }else if (tabCount == 2) {
 723                                         distance +=  buffer[index];
 724                                 }
 725                                 index++;
 726                         }
 727                 }
 728
 729                 //there was not a whole distance in the buffer, ie. buffer = "1 2       0.01    2       3       0."
 730                 //then you want to save the partial distance.
 731                 if (!gotDist) {
 732                         for (int i = indexBefore; i < size; i++) {
 733                                 partialDist += buffer[i];
 734                         }
 735                         index = size + 1;
 736                         next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
 737                 }else{
 738                         int firstname, secondname;
 739                         float dist;
 740
 741                         convert(first, firstname);
 742                         convert(second, secondname);
 743                         convert(distance, dist);
 744
 745                         next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
 746                 }
 747
 748                 return next;
 749         }
 750         catch(exception& e) {
 751                 m->errorOut(e, "HCluster", "getNextDist");
 752                 exit(1);
 753         }
 754 }
 755 /***********************************************************************/
 756 int HCluster::processFile() {
 757         try {
 758                 string firstName, secondName;
 759                 float distance;
 760
 761                 ifstream in;
 762                 m->openInputFile(distfile, in, "no error");
 763
 764                 ofstream out;
 765                 string outTemp = distfile + ".temp";
 766                 m->openOutputFile(outTemp, out);
 767
 768                 //get entry
 769                 while (!in.eof()) {
 770                         if (m->control_pressed) { in.close(); out.close(); remove(outTemp.c_str()); return 0; }
 771
 772                         in >> firstName >> secondName >> distance;    m->gobble(in);
 773
 774                         map<string,int>::iterator itA = nameMap->find(firstName);
 775                         map<string,int>::iterator itB = nameMap->find(secondName);
 776                         if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
 777                         if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
 778
 779                         //using cutoff
 780                         if (distance > cutoff) { break; }
 781
 782                         if (distance != -1) { //-1 means skip me
 783                                 out << itA->second << '\t' << itB->second << '\t' << distance << endl;
 784                         }
 785                 }
 786
 787                 in.close();
 788                 out.close();
 789
 790                 remove(distfile.c_str());
 791                 rename(outTemp.c_str(), distfile.c_str());
 792
 793                 return 0;
 794         }
 795         catch(exception& e) {
 796                 m->errorOut(e, "HCluster", "processFile");
 797                 exit(1);
 798         }
 799 }
 800 /***********************************************************************/
 801
 802
 803
 804
 805
 806
 807
 808