]> git.donarmstrong.com Git - mothur.git/blob - hcluster.cpp
added out.hierarchy command
[mothur.git] / hcluster.cpp
1 /*
2  *  hcluster.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/13/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "hcluster.h"
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13 #include "sparsematrix.hpp"
14
15 /***********************************************************************/
16 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(m), distfile(d), nameMap(n), cutoff(c) {
17         try {
18                 mapWanted = false;
19                 exitedBreak = false; 
20                 numSeqs = list->getNumSeqs();
21                 
22                 //initialize cluster array
23                 for (int i = 0; i < numSeqs; i++) {
24                         clusterNode temp(1, -1, i);
25                         clusterArray.push_back(temp);
26                 }
27                 
28                 if (method != "average") {
29                         openInputFile(distfile, filehandle);
30                 }else{  
31                         processFile();  
32                 }
33         }
34         catch(exception& e) {
35                 errorOut(e, "HCluster", "HCluster");
36                 exit(1);
37         }
38 }
39 /***********************************************************************/
40
41 void HCluster::clusterBins(){
42         try {
43                 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
44
45                 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));  
46                 rabund->set(clusterArray[smallRow].smallChild, 0);      
47                 rabund->setLabel(toString(smallDist));
48
49                 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
50         }
51         catch(exception& e) {
52                 errorOut(e, "HCluster", "clusterBins");
53                 exit(1);
54         }
55
56
57 }
58
59 /***********************************************************************/
60
61 void HCluster::clusterNames(){
62         try {
63                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
64                 if (mapWanted) {  updateMap();  }
65                 
66                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
67                 list->set(clusterArray[smallRow].smallChild, "");       
68                 list->setLabel(toString(smallDist));
69         
70                 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
71
72     }
73         catch(exception& e) {
74                 errorOut(e, "HCluster", "clusterNames");
75                 exit(1);
76         }
77
78 }
79 /***********************************************************************/
80 int HCluster::getUpmostParent(int node){
81         try {
82                 while (clusterArray[node].parent != -1) {
83                         node = clusterArray[node].parent;
84                 }
85                 
86                 return node;
87         }
88         catch(exception& e) {
89                 errorOut(e, "HCluster", "getUpmostParent");
90                 exit(1);
91         }
92 }
93 /***********************************************************************/
94 void HCluster::printInfo(){
95         try {
96                 
97                 cout << "link table" << endl;
98                 for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
99                         cout << itActive->first << " = " << itActive->second << endl;
100                 }
101                 cout << endl;
102                 for (int i = 0; i < linkTable.size(); i++) {
103                         cout << i << '\t';
104                         for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
105                                 cout << it->first << '-' << it->second << '\t' ;
106                         }
107                         cout << endl;
108                 }
109                 cout << endl << "clusterArray" << endl;
110                 
111                 for (int i = 0; i < clusterArray.size(); i++) {
112                         cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
113                 }
114                 cout << endl;
115                 
116                 
117         }
118         catch(exception& e) {
119                 errorOut(e, "HCluster", "getUpmostParent");
120                 exit(1);
121         }
122 }
123 /***********************************************************************/
124 int HCluster::makeActive() {
125         try {
126                 int linkValue = 1; 
127
128                 itActive = activeLinks.find(smallRow);
129                 it2Active = activeLinks.find(smallCol);
130                 
131                 if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
132                         int size = linkTable.size();
133                         map<int, int> temp; map<int, int> temp2;
134                         
135                         //add link to eachother
136                         temp[smallRow] = 1;                                                     //         1    2                                                               
137                         temp2[smallCol] = 1;                                            // 1   0        1 
138                                                                                                                 // 2   1        0                               
139                         linkTable.push_back(temp);
140                         linkTable.push_back(temp2);
141                         
142                         //add to activeLinks
143                         activeLinks[smallRow] = size;
144                         activeLinks[smallCol] = size+1;
145
146                 }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) {  //smallRow is active, smallCol is not
147                          int size = linkTable.size();
148                          int alreadyActiveRow = itActive->second;
149                          map<int, int> temp; 
150                         
151                         //add link to eachother
152                         temp[smallRow] = 1;                                                                     //         6    2       3       5
153                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
154                         linkTable[alreadyActiveRow][smallCol] = 1;                      // 2   1        0       1       1
155                                                                                                                                 // 3   2        1       0       0
156                                                                                                                                 // 5   0    1   0   0   
157                         //add to activeLinks
158                         activeLinks[smallCol] = size;
159         
160                 }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) {  //smallCol is active, smallRow is not
161                          int size = linkTable.size();
162                          int alreadyActiveCol = it2Active->second;
163                          map<int, int> temp; 
164                         
165                         //add link to eachother
166                         temp[smallCol] = 1;                                                                     //         6    2       3       5
167                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
168                         linkTable[alreadyActiveCol][smallRow] = 1;                      // 2   1        0       1       1
169                                                                                                                                 // 3   2        1       0       0
170                                                                                                                                 // 5   0    1   0   0   
171                         //add to activeLinks
172                         activeLinks[smallRow] = size;
173
174                 }else { //both are active so add one
175                         int row = itActive->second;
176                         int col = it2Active->second;
177                         
178                         
179                         linkTable[row][smallCol]++;
180                         linkTable[col][smallRow]++;
181                         linkValue = linkTable[row][smallCol];
182                 }
183                 
184                 return linkValue;
185         }
186         catch(exception& e) {
187                 errorOut(e, "HCluster", "makeActive");
188                 exit(1);
189         }
190 }
191 /***********************************************************************/
192 void HCluster::updateArrayandLinkTable() {
193         try {
194                 //if cluster was made update clusterArray and linkTable
195                 int size = clusterArray.size();
196                 
197                 //add new node
198                 clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
199                 clusterArray.push_back(temp);
200                 
201                 //update child nodes
202                 clusterArray[smallRow].parent = size;
203                 clusterArray[smallCol].parent = size;
204                 
205                 if (method == "furthest") {
206                         
207                         //update linkTable by merging clustered rows and columns
208                         int rowSpot = activeLinks[smallRow];
209                         int colSpot = activeLinks[smallCol];
210                         
211                         //fix old rows
212                         for (int i = 0; i < linkTable.size(); i++) {
213                                 //check if they are in map
214                                 it = linkTable[i].find(smallRow);
215                                 it2 = linkTable[i].find(smallCol);
216                                 
217                                 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
218                                         linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
219                                         linkTable[i].erase(smallCol); //delete col row
220                                         linkTable[i].erase(smallRow); //delete col row
221                                 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
222                                         linkTable[i][size] = linkTable[i][smallCol];
223                                         linkTable[i].erase(smallCol); //delete col 
224                                 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
225                                         linkTable[i][size] = linkTable[i][smallRow];
226                                         linkTable[i].erase(smallRow); //delete col 
227                                 }
228                         }
229                         
230                         //merge their values
231                         for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
232                                 it2 = linkTable[colSpot].find(it->first);  //does the col also have this
233                                 
234                                 if (it2 == linkTable[colSpot].end()) { //not there so add it
235                                         linkTable[colSpot][it->first] = it->second;
236                                 }else { //merge them
237                                         linkTable[colSpot][it->first] = it->second + it2->second;
238                                 }
239                         }
240                         
241                         linkTable[colSpot].erase(size);
242                         linkTable.erase(linkTable.begin()+rowSpot);  //delete row
243                         
244                         //update activerows
245                         activeLinks.erase(smallRow);
246                         activeLinks.erase(smallCol);
247                         activeLinks[size] = colSpot;
248                         
249                         //adjust everybody elses spot since you deleted - time vs. space
250                         for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
251                                 if (itActive->second > rowSpot) {  activeLinks[itActive->first]--;      }
252                         }
253                 }
254         }
255         catch(exception& e) {
256                 errorOut(e, "HCluster", "updateArrayandLinkTable");
257                 exit(1);
258         }
259 }
260 /***********************************************************************/
261 bool HCluster::update(int row, int col, float distance){
262         try {
263                 bool cluster = false;
264                 smallRow = row;
265                 smallCol = col;
266                 smallDist = distance;
267                 
268                 //find upmost parent of row and col
269                 smallRow = getUpmostParent(smallRow);
270                 smallCol = getUpmostParent(smallCol);
271
272                 //you don't want to cluster with yourself
273                 if (smallRow != smallCol) {
274                         
275                         if (method != "average") {
276                                 //can we cluster???
277                                 if (method == "nearest") { cluster = true;  }
278                                 else{ //assume furthest
279                                         //are they active in the link table
280                                         int linkValue = makeActive(); //after this point this nodes info is active in linkTable
281                                         if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {             cluster = true;         }
282                                 }
283                                 
284                                 if (cluster) { 
285                                         updateArrayandLinkTable();
286                                         clusterBins();
287                                         clusterNames();
288                                 }
289                         }else {
290                                 cluster = true;
291                                 updateArrayandLinkTable();
292                                 clusterBins();
293                                 clusterNames();
294                                 combineFile();
295                         }
296                 }
297                 
298                 return cluster;
299                 //printInfo();
300         }
301         catch(exception& e) {
302                 errorOut(e, "HCluster", "update");
303                 exit(1);
304         }
305 }
306 /***********************************************************************/
307 void HCluster::setMapWanted(bool m)  {  
308         try {
309                 mapWanted = m;
310                 
311                 //initialize map
312                 for (int i = 0; i < list->getNumBins(); i++) {
313                         
314                         //parse bin 
315                         string names = list->get(i);
316                         while (names.find_first_of(',') != -1) { 
317                                 //get name from bin
318                                 string name = names.substr(0,names.find_first_of(','));
319                                 //save name and bin number
320                                 seq2Bin[name] = i;
321                                 names = names.substr(names.find_first_of(',')+1, names.length());
322                         }
323                         
324                         //get last name
325                         seq2Bin[names] = i;
326                 }
327                 
328         }
329         catch(exception& e) {
330                 errorOut(e, "HCluster", "setMapWanted");
331                 exit(1);
332         }
333 }
334 /***********************************************************************/
335 void HCluster::updateMap() {
336 try {
337                 //update location of seqs in smallRow since they move to smallCol now
338                 string names = list->get(clusterArray[smallRow].smallChild);
339                 while (names.find_first_of(',') != -1) { 
340                         //get name from bin
341                         string name = names.substr(0,names.find_first_of(','));
342                         //save name and bin number
343                         seq2Bin[name] = clusterArray[smallCol].smallChild;
344                         names = names.substr(names.find_first_of(',')+1, names.length());
345                 }
346                         
347                 //get last name
348                 seq2Bin[names] = clusterArray[smallCol].smallChild;
349         }
350         catch(exception& e) {
351                 errorOut(e, "HCluster", "updateMap");
352                 exit(1);
353         }
354 }
355 //**********************************************************************************************************************
356 vector<seqDist> HCluster::getSeqs(){
357         try {
358                 vector<seqDist> sameSeqs;
359                 
360                 if(method != "average") {
361                         sameSeqs = getSeqsFNNN();
362                 }else{
363                         sameSeqs = getSeqsAN(); 
364                 }
365                                 
366                 return sameSeqs;
367         }
368         catch(exception& e) {
369                 errorOut(e, "HCluster", "getSeqs");
370                 exit(1);
371         }
372 }
373 //**********************************************************************************************************************
374 vector<seqDist> HCluster::getSeqsFNNN(){
375         try {
376                 string firstName, secondName;
377                 float distance, prevDistance;
378                 vector<seqDist> sameSeqs;
379                 prevDistance = -1;
380         
381                 //if you are not at the beginning of the file
382                 if (exitedBreak) { 
383                         sameSeqs.push_back(next);
384                         prevDistance = next.dist;
385                         exitedBreak = false;
386                 }
387         
388                 //get entry
389                 while (!filehandle.eof()) {
390                         
391                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
392         
393                         //save first one
394                         if (prevDistance == -1) { prevDistance = distance; }
395                         
396                         map<string,int>::iterator itA = nameMap->find(firstName);
397                         map<string,int>::iterator itB = nameMap->find(secondName);
398                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
399                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
400                 
401                         //using cutoff
402                         if (distance > cutoff) { break; }
403                 
404                         if (distance != -1) { //-1 means skip me
405                         
406                                 //are the distances the same
407                                 if (distance == prevDistance) { //save in vector
408                                         seqDist temp(itA->second, itB->second, distance);
409                                         sameSeqs.push_back(temp);
410                                         exitedBreak = false;
411                                 }else{ 
412                                         next.seq1 = itA->second;
413                                         next.seq2 = itB->second;
414                                         next.dist = distance;
415                                         exitedBreak = true;
416                                         break;
417                                 }
418                         }
419                 }
420                 
421                 //rndomize matching dists
422                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
423                 
424                 return sameSeqs;
425         }
426         catch(exception& e) {
427                 errorOut(e, "HCluster", "getSeqsFNNN");
428                 exit(1);
429         }
430 }
431 //**********************************************************************************************************************
432 //don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
433 vector<seqDist> HCluster::getSeqsAN(){
434         try {
435                 int firstName, secondName;
436                 float prevDistance;
437                 vector<seqDist> sameSeqs;
438                 prevDistance = -1;
439                 
440                 openInputFile(distfile, filehandle, "no error"); 
441                 
442                 //is the smallest value in mergedMin or the distfile?
443                 float mergedMinDist = 10000;
444                 float distance = 10000;
445                 if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist;  }
446                         
447                 if (!filehandle.eof()) {  
448                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
449                         //save first one
450                         if (prevDistance == -1) { prevDistance = distance; } 
451                         if (distance != -1) { //-1 means skip me
452                                 seqDist temp(firstName, secondName, distance);
453                                 sameSeqs.push_back(temp);
454                         }
455                 }
456                 
457                 if (mergedMinDist < distance) { //get minimum distance from mergedMin
458                         //remove distance we saved from file
459                         sameSeqs.clear();
460                         prevDistance = mergedMinDist;
461                         
462                         for (int i = 0; i < mergedMin.size(); i++) {
463                                 if (mergedMin[i].dist == prevDistance) {
464                                         sameSeqs.push_back(mergedMin[i]);
465                                 }else { break; }
466                         }
467                 }else{ //get minimum from file
468                         //get entry
469                         while (!filehandle.eof()) {
470                                 
471                                 filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
472                                 
473                                 if (prevDistance == -1) { prevDistance = distance; }
474                                 
475                                 if (distance != -1) { //-1 means skip me
476                                         //are the distances the same
477                                         if (distance == prevDistance) { //save in vector
478                                                 seqDist temp(firstName, secondName, distance);
479                                                 sameSeqs.push_back(temp);
480                                         }else{  
481                                                 break;
482                                         }
483                                 }
484                         }
485                 }
486                 filehandle.close();
487                 
488                 //randomize matching dists
489                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
490                 
491                 //can only return one value since once these are merged the other distances in sameSeqs may have changed
492                 vector<seqDist> temp;
493                 if (sameSeqs.size() > 0) {  temp.push_back(sameSeqs[0]);  }
494                 
495                 return temp;
496         }
497         catch(exception& e) {
498                 errorOut(e, "HCluster", "getSeqsAN");
499                 exit(1);
500         }
501 }
502
503 /***********************************************************************/
504 void HCluster::combineFile() {
505         try {
506                 //int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
507                 //char* inputBuffer;
508                 //inputBuffer = new char[bufferSize];
509                 //size_t numRead;
510                 
511                 string tempDistFile = distfile + ".temp";
512                 ofstream out;
513                 openOutputFile(tempDistFile, out);
514                 
515                 //FILE* in;
516                 //in = fopen(distfile.c_str(), "rb");
517         
518                 ifstream in;
519                 openInputFile(distfile, in);
520                 
521                 int first, second;
522                 float dist;
523                 
524                 vector< map<int, float> > smallRowColValues;
525                 smallRowColValues.resize(2);  //0 = row, 1 = col
526                 int count = 0;
527                                 
528                 //go through file pulling out distances related to rows merging
529                 //if mergedMin contains distances add those back into file
530                 //bool done = false;
531                 //partialDist = "";
532                 //while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
533 //cout << "number of char read = " << numRead << endl;
534 //cout << inputBuffer << endl;
535                         //if (numRead < bufferSize) { done = true; }
536                         
537                         //parse input into individual distances
538                         //int spot = 0;
539                         //string outputString = "";
540                         //while(spot < numRead) {
541         //cout << "spot = " << spot << endl;
542                          //  seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
543                            
544                            //you read a partial distance
545                           // if (nextDist.seq1 == -1) { break;  }
546                         while (!in.eof()) {
547                                 //first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
548         //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;                   
549                            //since file is sorted and mergedMin is sorted 
550                            //you can put the smallest distance from each through the code below and keep the file sorted
551                            
552                            in >> first >> second >> dist; gobble(in);
553                            
554                            //while there are still values in mergedMin that are smaller than the distance read from file
555                            while (count < mergedMin.size())  {
556                            
557                                         //is the distance in mergedMin smaller than from the file
558                                         if (mergedMin[count].dist < dist) {
559                                         //is this a distance related to the columns merging?
560                                         //if yes, save in memory
561                                                 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
562                                                 }else if (mergedMin[count].seq1 == smallCol) {
563                                                         smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
564                                                 }else if (mergedMin[count].seq2 == smallCol) {
565                                                         smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
566                                                 }else if (mergedMin[count].seq1 == smallRow) {
567                                                         smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
568                                                 }else if (mergedMin[count].seq2 == smallRow) {
569                                                         smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
570                                                 }else { //if no, write to temp file
571                                                         //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
572                                                         out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
573                                                 }
574                                                 count++;
575                                         }else{   break; }
576                            }
577                            
578                            //is this a distance related to the columns merging?
579                            //if yes, save in memory
580                            if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
581                            }else if (first == smallCol) {
582                                         smallRowColValues[1][second] = dist;
583                            }else if (second == smallCol) {
584                                         smallRowColValues[1][first] = dist;
585                            }else if (first == smallRow) {
586                                         smallRowColValues[0][second] = dist;
587                            }else if (second == smallRow) {
588                                         smallRowColValues[0][first] = dist;
589                            
590                            }else { //if no, write to temp file
591                                         //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
592                                         out << first << '\t' << second << '\t' << dist << endl;
593                            }
594                         }
595                         
596                         //out << outputString;
597                         //if(done) { break; }
598                 //}
599                 //fclose(in);
600                 in.close();
601                 
602                 //if values in mergedMin are larger than the the largest in file then
603                 while (count < mergedMin.size())  {  
604                         //is this a distance related to the columns merging?
605                         //if yes, save in memory
606                         if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
607                         }else if (mergedMin[count].seq1 == smallCol) {
608                                 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
609                         }else if (mergedMin[count].seq2 == smallCol) {
610                                 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
611                         }else if (mergedMin[count].seq1 == smallRow) {
612                                 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
613                         }else if (mergedMin[count].seq2 == smallRow) {
614                                 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
615                                 
616                         }else { //if no, write to temp file
617                                 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
618                         }
619                         count++;
620                 }
621                 out.close();
622                 mergedMin.clear();
623                         
624                 //rename tempfile to distfile
625                 remove(distfile.c_str());
626                 rename(tempDistFile.c_str(), distfile.c_str());
627                 
628                 //merge clustered rows averaging the distances
629                 map<int, float>::iterator itMerge;
630                 map<int, float>::iterator it2Merge;
631                 for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {                 
632                         //does smallRowColValues[1] have a distance to this seq too?
633                         it2Merge = smallRowColValues[1].find(itMerge->first);
634                         
635                         float average;
636                         if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
637                                 //weighted average
638                                 int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
639                                 average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
640                                 smallRowColValues[1].erase(it2Merge);
641                                 
642                                 seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
643                                 mergedMin.push_back(temp);
644                         }
645                 }
646
647                 //sort merged values
648                 sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);      
649         }
650         catch(exception& e) {
651                 errorOut(e, "HCluster", "combineFile");
652                 exit(1);
653         }
654 }
655 /***********************************************************************
656 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
657         try {
658                 seqDist next;
659                 int indexBefore = index;
660                 string first, second, distance;
661                 first = ""; second = ""; distance = "";
662                 int tabCount = 0;
663 //cout << "partial = " << partialDist << endl;          
664                 if (partialDist != "") { //read what you can, you know it is less than a whole distance.
665                         for (int i = 0; i < partialDist.size(); i++) {
666                                 if (tabCount == 0) {
667                                         if (partialDist[i] == '\t') { tabCount++; }
668                                         else {  first += partialDist[i];        }
669                                 }else if (tabCount == 1) {
670                                         if (partialDist[i] == '\t') { tabCount++; }
671                                         else {  second += partialDist[i];       }
672                                 }else if (tabCount == 2) {
673                                         distance +=  partialDist[i];
674                                 }
675                         }
676                         partialDist = "";
677                 }
678         
679                 //try to get another distance
680                 bool gotDist = false;
681                 while (index < size) {
682                         if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
683                                 gotDist = true;
684                                 
685                                 //gobble space
686                                 while (index < size) {          
687                                         if (isspace(buffer[index])) { index++; }
688                                         else { break; }         
689                                 }
690                                 break;
691                         }else{
692                                 if (tabCount == 0) {
693                                         if (buffer[index] == '\t') { tabCount++; }
694                                         else {  first += buffer[index]; }
695                                 }else if (tabCount == 1) {
696                                         if (buffer[index] == '\t') { tabCount++; }
697                                         else {  second += buffer[index];        }
698                                 }else if (tabCount == 2) {
699                                         distance +=  buffer[index];
700                                 }
701                                 index++;
702                         }
703                 }
704                 
705                 //there was not a whole distance in the buffer, ie. buffer = "1 2       0.01    2       3       0."
706                 //then you want to save the partial distance.
707                 if (!gotDist) {
708                         for (int i = indexBefore; i < size; i++) {
709                                 partialDist += buffer[i];
710                         }
711                         index = size + 1;
712                         next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
713                 }else{
714                         int firstname, secondname;
715                         float dist;
716                         
717                         convert(first, firstname);
718                         convert(second, secondname);
719                         convert(distance, dist);
720                         
721                         next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
722                 }
723                                                 
724                 return next;
725         }
726         catch(exception& e) {
727                 errorOut(e, "HCluster", "getNextDist");
728                 exit(1);
729         }
730 }
731 /***********************************************************************/
732 void HCluster::processFile() {
733         try {
734                 string firstName, secondName;
735                 float distance;
736                 
737                 ifstream in;
738                 openInputFile(distfile, in);
739                 
740                 ofstream out;
741                 string outTemp = distfile + ".temp";
742                 openOutputFile(outTemp, out);
743         
744                 //get entry
745                 while (!in.eof()) {
746                         
747                         in >> firstName >> secondName >> distance;    gobble(in);               
748                         
749                         map<string,int>::iterator itA = nameMap->find(firstName);
750                         map<string,int>::iterator itB = nameMap->find(secondName);
751                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
752                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
753                 
754                         //using cutoff
755                         if (distance > cutoff) { break; }
756                 
757                         if (distance != -1) { //-1 means skip me
758                                 out << itA->second << '\t' << itB->second << '\t' << distance << endl;
759                         }
760                 }
761                 
762                 in.close();
763                 out.close();
764                 
765                 remove(distfile.c_str());
766                 rename(outTemp.c_str(), distfile.c_str());
767         }
768         catch(exception& e) {
769                 errorOut(e, "HCluster", "processFile");
770                 exit(1);
771         }
772 }
773 /***********************************************************************/
774
775
776
777
778
779
780
781