]> git.donarmstrong.com Git - mothur.git/blob - hcluster.cpp
moved utilities out of mothur.h and into mothurOut class.
[mothur.git] / hcluster.cpp
1 /*
2  *  hcluster.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/13/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "hcluster.h"
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13 #include "sparsematrix.hpp"
14
15 /***********************************************************************/
16 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) {
17         try {
18                 m = MothurOut::getInstance();
19                 mapWanted = false;
20                 exitedBreak = false; 
21                 numSeqs = list->getNumSeqs();
22                 
23                 //initialize cluster array
24                 for (int i = 0; i < numSeqs; i++) {
25                         clusterNode temp(1, -1, i);
26                         clusterArray.push_back(temp);
27                 }
28                 
29                 if (method != "average") {
30                         m->openInputFile(distfile, filehandle);
31                 }else{  
32                         processFile();  
33                 }
34         }
35         catch(exception& e) {
36                 m->errorOut(e, "HCluster", "HCluster");
37                 exit(1);
38         }
39 }
40 /***********************************************************************/
41
42 void HCluster::clusterBins(){
43         try {
44                 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
45
46                 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));  
47                 rabund->set(clusterArray[smallRow].smallChild, 0);      
48                 rabund->setLabel(toString(smallDist));
49
50                 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
51         }
52         catch(exception& e) {
53                 m->errorOut(e, "HCluster", "clusterBins");
54                 exit(1);
55         }
56
57
58 }
59
60 /***********************************************************************/
61
62 void HCluster::clusterNames(){
63         try {
64                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
65                 if (mapWanted) {  updateMap();  }
66                 
67                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
68                 list->set(clusterArray[smallRow].smallChild, "");       
69                 list->setLabel(toString(smallDist));
70         
71                 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
72
73     }
74         catch(exception& e) {
75                 m->errorOut(e, "HCluster", "clusterNames");
76                 exit(1);
77         }
78
79 }
80 /***********************************************************************/
81 int HCluster::getUpmostParent(int node){
82         try {
83                 while (clusterArray[node].parent != -1) {
84                         node = clusterArray[node].parent;
85                 }
86                 
87                 return node;
88         }
89         catch(exception& e) {
90                 m->errorOut(e, "HCluster", "getUpmostParent");
91                 exit(1);
92         }
93 }
94 /***********************************************************************/
95 void HCluster::printInfo(){
96         try {
97                 
98                 cout << "link table" << endl;
99                 for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
100                         cout << itActive->first << " = " << itActive->second << endl;
101                 }
102                 cout << endl;
103                 for (int i = 0; i < linkTable.size(); i++) {
104                         cout << i << '\t';
105                         for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
106                                 cout << it->first << '-' << it->second << '\t' ;
107                         }
108                         cout << endl;
109                 }
110                 cout << endl << "clusterArray" << endl;
111                 
112                 for (int i = 0; i < clusterArray.size(); i++) {
113                         cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
114                 }
115                 cout << endl;
116                 
117                 
118         }
119         catch(exception& e) {
120                 m->errorOut(e, "HCluster", "getUpmostParent");
121                 exit(1);
122         }
123 }
124 /***********************************************************************/
125 int HCluster::makeActive() {
126         try {
127                 int linkValue = 1; 
128
129                 itActive = activeLinks.find(smallRow);
130                 it2Active = activeLinks.find(smallCol);
131                 
132                 if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
133                         int size = linkTable.size();
134                         map<int, int> temp; map<int, int> temp2;
135                         
136                         //add link to eachother
137                         temp[smallRow] = 1;                                                     //         1    2                                                               
138                         temp2[smallCol] = 1;                                            // 1   0        1 
139                                                                                                                 // 2   1        0                               
140                         linkTable.push_back(temp);
141                         linkTable.push_back(temp2);
142                         
143                         //add to activeLinks
144                         activeLinks[smallRow] = size;
145                         activeLinks[smallCol] = size+1;
146
147                 }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) {  //smallRow is active, smallCol is not
148                          int size = linkTable.size();
149                          int alreadyActiveRow = itActive->second;
150                          map<int, int> temp; 
151                         
152                         //add link to eachother
153                         temp[smallRow] = 1;                                                                     //         6    2       3       5
154                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
155                         linkTable[alreadyActiveRow][smallCol] = 1;                      // 2   1        0       1       1
156                                                                                                                                 // 3   2        1       0       0
157                                                                                                                                 // 5   0    1   0   0   
158                         //add to activeLinks
159                         activeLinks[smallCol] = size;
160         
161                 }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) {  //smallCol is active, smallRow is not
162                          int size = linkTable.size();
163                          int alreadyActiveCol = it2Active->second;
164                          map<int, int> temp; 
165                         
166                         //add link to eachother
167                         temp[smallCol] = 1;                                                                     //         6    2       3       5
168                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
169                         linkTable[alreadyActiveCol][smallRow] = 1;                      // 2   1        0       1       1
170                                                                                                                                 // 3   2        1       0       0
171                                                                                                                                 // 5   0    1   0   0   
172                         //add to activeLinks
173                         activeLinks[smallRow] = size;
174
175                 }else { //both are active so add one
176                         int row = itActive->second;
177                         int col = it2Active->second;
178                         
179                         
180                         linkTable[row][smallCol]++;
181                         linkTable[col][smallRow]++;
182                         linkValue = linkTable[row][smallCol];
183                 }
184                 
185                 return linkValue;
186         }
187         catch(exception& e) {
188                 m->errorOut(e, "HCluster", "makeActive");
189                 exit(1);
190         }
191 }
192 /***********************************************************************/
193 void HCluster::updateArrayandLinkTable() {
194         try {
195                 //if cluster was made update clusterArray and linkTable
196                 int size = clusterArray.size();
197                 
198                 //add new node
199                 clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
200                 clusterArray.push_back(temp);
201                 
202                 //update child nodes
203                 clusterArray[smallRow].parent = size;
204                 clusterArray[smallCol].parent = size;
205                 
206                 if (method == "furthest") {
207                         
208                         //update linkTable by merging clustered rows and columns
209                         int rowSpot = activeLinks[smallRow];
210                         int colSpot = activeLinks[smallCol];
211                         
212                         //fix old rows
213                         for (int i = 0; i < linkTable.size(); i++) {
214                                 //check if they are in map
215                                 it = linkTable[i].find(smallRow);
216                                 it2 = linkTable[i].find(smallCol);
217                                 
218                                 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
219                                         linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
220                                         linkTable[i].erase(smallCol); //delete col row
221                                         linkTable[i].erase(smallRow); //delete col row
222                                 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
223                                         linkTable[i][size] = linkTable[i][smallCol];
224                                         linkTable[i].erase(smallCol); //delete col 
225                                 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
226                                         linkTable[i][size] = linkTable[i][smallRow];
227                                         linkTable[i].erase(smallRow); //delete col 
228                                 }
229                         }
230                         
231                         //merge their values
232                         for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
233                                 it2 = linkTable[colSpot].find(it->first);  //does the col also have this
234                                 
235                                 if (it2 == linkTable[colSpot].end()) { //not there so add it
236                                         linkTable[colSpot][it->first] = it->second;
237                                 }else { //merge them
238                                         linkTable[colSpot][it->first] = it->second + it2->second;
239                                 }
240                         }
241                         
242                         linkTable[colSpot].erase(size);
243                         linkTable.erase(linkTable.begin()+rowSpot);  //delete row
244                         
245                         //update activerows
246                         activeLinks.erase(smallRow);
247                         activeLinks.erase(smallCol);
248                         activeLinks[size] = colSpot;
249                         
250                         //adjust everybody elses spot since you deleted - time vs. space
251                         for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
252                                 if (itActive->second > rowSpot) {  activeLinks[itActive->first]--;      }
253                         }
254                 }
255         }
256         catch(exception& e) {
257                 m->errorOut(e, "HCluster", "updateArrayandLinkTable");
258                 exit(1);
259         }
260 }
261 /***********************************************************************/
262 bool HCluster::update(int row, int col, float distance){
263         try {
264                 bool cluster = false;
265                 smallRow = row;
266                 smallCol = col;
267                 smallDist = distance;
268                 
269                 //find upmost parent of row and col
270                 smallRow = getUpmostParent(smallRow);
271                 smallCol = getUpmostParent(smallCol);
272
273                 //you don't want to cluster with yourself
274                 if (smallRow != smallCol) {
275                         
276                         if (method != "average") {
277                                 //can we cluster???
278                                 if (method == "nearest") { cluster = true;  }
279                                 else{ //assume furthest
280                                         //are they active in the link table
281                                         int linkValue = makeActive(); //after this point this nodes info is active in linkTable
282                                         if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {             cluster = true;         }
283                                 }
284                                 
285                                 if (cluster) { 
286                                         updateArrayandLinkTable();
287                                         clusterBins();
288                                         clusterNames();
289                                 }
290                         }else {
291                                 cluster = true;
292                                 updateArrayandLinkTable();
293                                 clusterBins();
294                                 clusterNames();
295                                 combineFile();
296                         }
297                 }
298                 
299                 return cluster;
300                 //printInfo();
301         }
302         catch(exception& e) {
303                 m->errorOut(e, "HCluster", "update");
304                 exit(1);
305         }
306 }
307 /***********************************************************************/
308 void HCluster::setMapWanted(bool ms)  {  
309         try {
310                 mapWanted = ms;
311                 
312                 //initialize map
313                 for (int i = 0; i < list->getNumBins(); i++) {
314                         
315                         //parse bin 
316                         string names = list->get(i);
317                         while (names.find_first_of(',') != -1) { 
318                                 //get name from bin
319                                 string name = names.substr(0,names.find_first_of(','));
320                                 //save name and bin number
321                                 seq2Bin[name] = i;
322                                 names = names.substr(names.find_first_of(',')+1, names.length());
323                         }
324                         
325                         //get last name
326                         seq2Bin[names] = i;
327                 }
328                 
329         }
330         catch(exception& e) {
331                 m->errorOut(e, "HCluster", "setMapWanted");
332                 exit(1);
333         }
334 }
335 /***********************************************************************/
336 void HCluster::updateMap() {
337 try {
338                 //update location of seqs in smallRow since they move to smallCol now
339                 string names = list->get(clusterArray[smallRow].smallChild);
340                 while (names.find_first_of(',') != -1) { 
341                         //get name from bin
342                         string name = names.substr(0,names.find_first_of(','));
343                         //save name and bin number
344                         seq2Bin[name] = clusterArray[smallCol].smallChild;
345                         names = names.substr(names.find_first_of(',')+1, names.length());
346                 }
347                         
348                 //get last name
349                 seq2Bin[names] = clusterArray[smallCol].smallChild;
350         }
351         catch(exception& e) {
352                 m->errorOut(e, "HCluster", "updateMap");
353                 exit(1);
354         }
355 }
356 //**********************************************************************************************************************
357 vector<seqDist> HCluster::getSeqs(){
358         try {
359                 vector<seqDist> sameSeqs;
360                 
361                 if(method != "average") {
362                         sameSeqs = getSeqsFNNN();
363                 }else{
364                         sameSeqs = getSeqsAN(); 
365                 }
366                                 
367                 return sameSeqs;
368         }
369         catch(exception& e) {
370                 m->errorOut(e, "HCluster", "getSeqs");
371                 exit(1);
372         }
373 }
374 //**********************************************************************************************************************
375 vector<seqDist> HCluster::getSeqsFNNN(){
376         try {
377                 string firstName, secondName;
378                 float distance, prevDistance;
379                 vector<seqDist> sameSeqs;
380                 prevDistance = -1;
381         
382                 //if you are not at the beginning of the file
383                 if (exitedBreak) { 
384                         sameSeqs.push_back(next);
385                         prevDistance = next.dist;
386                         exitedBreak = false;
387                 }
388         
389                 //get entry
390                 while (!filehandle.eof()) {
391                         
392                         filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle); 
393         
394                         //save first one
395                         if (prevDistance == -1) { prevDistance = distance; }
396                         
397                         map<string,int>::iterator itA = nameMap->find(firstName);
398                         map<string,int>::iterator itB = nameMap->find(secondName);
399                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
400                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
401                 
402                         //using cutoff
403                         if (distance > cutoff) { break; }
404                 
405                         if (distance != -1) { //-1 means skip me
406                         
407                                 //are the distances the same
408                                 if (distance == prevDistance) { //save in vector
409                                         seqDist temp(itA->second, itB->second, distance);
410                                         sameSeqs.push_back(temp);
411                                         exitedBreak = false;
412                                 }else{ 
413                                         next.seq1 = itA->second;
414                                         next.seq2 = itB->second;
415                                         next.dist = distance;
416                                         exitedBreak = true;
417                                         break;
418                                 }
419                         }
420                 }
421                 
422                 //rndomize matching dists
423                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
424                 
425                 return sameSeqs;
426         }
427         catch(exception& e) {
428                 m->errorOut(e, "HCluster", "getSeqsFNNN");
429                 exit(1);
430         }
431 }
432 //**********************************************************************************************************************
433 //don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
434 vector<seqDist> HCluster::getSeqsAN(){
435         try {
436                 int firstName, secondName;
437                 float prevDistance;
438                 vector<seqDist> sameSeqs;
439                 prevDistance = -1;
440                 
441                 m->openInputFile(distfile, filehandle, "no error"); 
442                 
443                 //is the smallest value in mergedMin or the distfile?
444                 float mergedMinDist = 10000;
445                 float distance = 10000;
446                 if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist;  }
447                         
448                 if (!filehandle.eof()) {  
449                         filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle);
450                         //save first one
451                         if (prevDistance == -1) { prevDistance = distance; } 
452                         if (distance != -1) { //-1 means skip me
453                                 seqDist temp(firstName, secondName, distance);
454                                 sameSeqs.push_back(temp);
455                         }else{ distance = 10000; }
456                 }
457                 
458                 if (mergedMinDist < distance) { //get minimum distance from mergedMin
459                         //remove distance we saved from file
460                         sameSeqs.clear();
461                         prevDistance = mergedMinDist;
462                         
463                         for (int i = 0; i < mergedMin.size(); i++) {
464                                 if (mergedMin[i].dist == prevDistance) {
465                                         sameSeqs.push_back(mergedMin[i]);
466                                 }else { break; }
467                         }
468                 }else{ //get minimum from file
469                         //get entry
470                         while (!filehandle.eof()) {
471                                 
472                                 filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle); 
473                                 
474                                 if (prevDistance == -1) { prevDistance = distance; }
475                                 
476                                 if (distance != -1) { //-1 means skip me
477                                         //are the distances the same
478                                         if (distance == prevDistance) { //save in vector
479                                                 seqDist temp(firstName, secondName, distance);
480                                                 sameSeqs.push_back(temp);
481                                         }else{  
482                                                 break;
483                                         }
484                                 }
485                         }
486                 }
487                 filehandle.close();
488                 
489                 //randomize matching dists
490                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
491                 
492                 //can only return one value since once these are merged the other distances in sameSeqs may have changed
493                 vector<seqDist> temp;
494                 if (sameSeqs.size() > 0) {  temp.push_back(sameSeqs[0]);  }
495                 
496                 return temp;
497         }
498         catch(exception& e) {
499                 m->errorOut(e, "HCluster", "getSeqsAN");
500                 exit(1);
501         }
502 }
503
504 /***********************************************************************/
505 int HCluster::combineFile() {
506         try {
507                 //int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
508                 //char* inputBuffer;
509                 //inputBuffer = new char[bufferSize];
510                 //size_t numRead;
511                 
512                 string tempDistFile = distfile + ".temp";
513                 ofstream out;
514                 m->openOutputFile(tempDistFile, out);
515                 
516                 //FILE* in;
517                 //in = fopen(distfile.c_str(), "rb");
518         
519                 ifstream in;
520                 m->openInputFile(distfile, in);
521                 
522                 int first, second;
523                 float dist;
524                 
525                 vector< map<int, float> > smallRowColValues;
526                 smallRowColValues.resize(2);  //0 = row, 1 = col
527                 int count = 0;
528                                 
529                 //go through file pulling out distances related to rows merging
530                 //if mergedMin contains distances add those back into file
531                 //bool done = false;
532                 //partialDist = "";
533                 //while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
534 //cout << "number of char read = " << numRead << endl;
535 //cout << inputBuffer << endl;
536                         //if (numRead < bufferSize) { done = true; }
537                         
538                         //parse input into individual distances
539                         //int spot = 0;
540                         //string outputString = "";
541                         //while(spot < numRead) {
542         //cout << "spot = " << spot << endl;
543                          //  seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
544                            
545                            //you read a partial distance
546                           // if (nextDist.seq1 == -1) { break;  }
547                         while (!in.eof()) {
548                                 //first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
549         //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;                   
550                            //since file is sorted and mergedMin is sorted 
551                            //you can put the smallest distance from each through the code below and keep the file sorted
552                            
553                            in >> first >> second >> dist; m->gobble(in);
554                            
555                            if (m->control_pressed) { in.close(); out.close(); remove(tempDistFile.c_str()); return 0; }
556                            
557                            //while there are still values in mergedMin that are smaller than the distance read from file
558                            while (count < mergedMin.size())  {
559                            
560                                         //is the distance in mergedMin smaller than from the file
561                                         if (mergedMin[count].dist < dist) {
562                                         //is this a distance related to the columns merging?
563                                         //if yes, save in memory
564                                                 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
565                                                 }else if (mergedMin[count].seq1 == smallCol) {
566                                                         smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
567                                                 }else if (mergedMin[count].seq2 == smallCol) {
568                                                         smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
569                                                 }else if (mergedMin[count].seq1 == smallRow) {
570                                                         smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
571                                                 }else if (mergedMin[count].seq2 == smallRow) {
572                                                         smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
573                                                 }else { //if no, write to temp file
574                                                         //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
575                                                         out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
576                                                 }
577                                                 count++;
578                                         }else{   break; }
579                            }
580                            
581                            //is this a distance related to the columns merging?
582                            //if yes, save in memory
583                            if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
584                            }else if (first == smallCol) {
585                                         smallRowColValues[1][second] = dist;
586                            }else if (second == smallCol) {
587                                         smallRowColValues[1][first] = dist;
588                            }else if (first == smallRow) {
589                                         smallRowColValues[0][second] = dist;
590                            }else if (second == smallRow) {
591                                         smallRowColValues[0][first] = dist;
592                            
593                            }else { //if no, write to temp file
594                                         //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
595                                         out << first << '\t' << second << '\t' << dist << endl;
596                            }
597                         }
598                         
599                         //out << outputString;
600                         //if(done) { break; }
601                 //}
602                 //fclose(in);
603                 in.close();
604                 
605                 //if values in mergedMin are larger than the the largest in file then
606                 while (count < mergedMin.size())  {  
607                         //is this a distance related to the columns merging?
608                         //if yes, save in memory
609                         if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
610                         }else if (mergedMin[count].seq1 == smallCol) {
611                                 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
612                         }else if (mergedMin[count].seq2 == smallCol) {
613                                 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
614                         }else if (mergedMin[count].seq1 == smallRow) {
615                                 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
616                         }else if (mergedMin[count].seq2 == smallRow) {
617                                 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
618                                 
619                         }else { //if no, write to temp file
620                                 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
621                         }
622                         count++;
623                 }
624                 out.close();
625                 mergedMin.clear();
626                         
627                 //rename tempfile to distfile
628                 remove(distfile.c_str());
629                 rename(tempDistFile.c_str(), distfile.c_str());
630 //cout << "remove = "<< renameOK << " rename = " << ok << endl; 
631
632                 //merge clustered rows averaging the distances
633                 map<int, float>::iterator itMerge;
634                 map<int, float>::iterator it2Merge;
635                 for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {                 
636                         //does smallRowColValues[1] have a distance to this seq too?
637                         it2Merge = smallRowColValues[1].find(itMerge->first);
638                         
639                         float average;
640                         if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
641                                 //weighted average
642                                 int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
643                                 average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
644                                 smallRowColValues[1].erase(it2Merge);
645                                 
646                                 seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
647                                 mergedMin.push_back(temp);
648                         }
649                 }
650
651                 //sort merged values
652                 sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);      
653                 
654                 return 0;
655         }
656         catch(exception& e) {
657                 m->errorOut(e, "HCluster", "combineFile");
658                 exit(1);
659         }
660 }
661 /***********************************************************************
662 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
663         try {
664                 seqDist next;
665                 int indexBefore = index;
666                 string first, second, distance;
667                 first = ""; second = ""; distance = "";
668                 int tabCount = 0;
669 //cout << "partial = " << partialDist << endl;          
670                 if (partialDist != "") { //read what you can, you know it is less than a whole distance.
671                         for (int i = 0; i < partialDist.size(); i++) {
672                                 if (tabCount == 0) {
673                                         if (partialDist[i] == '\t') { tabCount++; }
674                                         else {  first += partialDist[i];        }
675                                 }else if (tabCount == 1) {
676                                         if (partialDist[i] == '\t') { tabCount++; }
677                                         else {  second += partialDist[i];       }
678                                 }else if (tabCount == 2) {
679                                         distance +=  partialDist[i];
680                                 }
681                         }
682                         partialDist = "";
683                 }
684         
685                 //try to get another distance
686                 bool gotDist = false;
687                 while (index < size) {
688                         if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
689                                 gotDist = true;
690                                 
691                                 //m->gobble space
692                                 while (index < size) {          
693                                         if (isspace(buffer[index])) { index++; }
694                                         else { break; }         
695                                 }
696                                 break;
697                         }else{
698                                 if (tabCount == 0) {
699                                         if (buffer[index] == '\t') { tabCount++; }
700                                         else {  first += buffer[index]; }
701                                 }else if (tabCount == 1) {
702                                         if (buffer[index] == '\t') { tabCount++; }
703                                         else {  second += buffer[index];        }
704                                 }else if (tabCount == 2) {
705                                         distance +=  buffer[index];
706                                 }
707                                 index++;
708                         }
709                 }
710                 
711                 //there was not a whole distance in the buffer, ie. buffer = "1 2       0.01    2       3       0."
712                 //then you want to save the partial distance.
713                 if (!gotDist) {
714                         for (int i = indexBefore; i < size; i++) {
715                                 partialDist += buffer[i];
716                         }
717                         index = size + 1;
718                         next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
719                 }else{
720                         int firstname, secondname;
721                         float dist;
722                         
723                         convert(first, firstname);
724                         convert(second, secondname);
725                         convert(distance, dist);
726                         
727                         next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
728                 }
729                                                 
730                 return next;
731         }
732         catch(exception& e) {
733                 m->errorOut(e, "HCluster", "getNextDist");
734                 exit(1);
735         }
736 }
737 /***********************************************************************/
738 int HCluster::processFile() {
739         try {
740                 string firstName, secondName;
741                 float distance;
742                 
743                 ifstream in;
744                 m->openInputFile(distfile, in);
745                 
746                 ofstream out;
747                 string outTemp = distfile + ".temp";
748                 m->openOutputFile(outTemp, out);
749         
750                 //get entry
751                 while (!in.eof()) {
752                         if (m->control_pressed) { in.close(); out.close(); remove(outTemp.c_str()); return 0; }
753                         
754                         in >> firstName >> secondName >> distance;    m->gobble(in);            
755                         
756                         map<string,int>::iterator itA = nameMap->find(firstName);
757                         map<string,int>::iterator itB = nameMap->find(secondName);
758                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
759                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
760                 
761                         //using cutoff
762                         if (distance > cutoff) { break; }
763                 
764                         if (distance != -1) { //-1 means skip me
765                                 out << itA->second << '\t' << itB->second << '\t' << distance << endl;
766                         }
767                 }
768                 
769                 in.close();
770                 out.close();
771                 
772                 remove(distfile.c_str());
773                 rename(outTemp.c_str(), distfile.c_str());
774                 
775                 return 0;
776         }
777         catch(exception& e) {
778                 m->errorOut(e, "HCluster", "processFile");
779                 exit(1);
780         }
781 }
782 /***********************************************************************/
783
784
785
786
787
788
789
790