]> git.donarmstrong.com Git - mothur.git/blob - hcluster.cpp
worked on hcluster. made .single command run using a sharedfile. and various other...
[mothur.git] / hcluster.cpp
1 /*
2  *  hcluster.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/13/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "hcluster.h"
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13 #include "sparsematrix.hpp"
14
15 /***********************************************************************/
16 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(m), distfile(d), nameMap(n), cutoff(c) {
17         try {
18                 mapWanted = false;
19                 exitedBreak = false; 
20                 numSeqs = list->getNumSeqs();
21                 
22                 //initialize cluster array
23                 for (int i = 0; i < numSeqs; i++) {
24                         clusterNode temp(1, -1, i);
25                         clusterArray.push_back(temp);
26                 }
27                 
28                 if (method != "average") {
29                         openInputFile(distfile, filehandle);
30                 }else{ firstRead = true; }
31         }
32         catch(exception& e) {
33                 errorOut(e, "HCluster", "HCluster");
34                 exit(1);
35         }
36 }
37 /***********************************************************************/
38
39 void HCluster::clusterBins(){
40         try {
41                 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
42
43                 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));  
44                 rabund->set(clusterArray[smallRow].smallChild, 0);      
45                 rabund->setLabel(toString(smallDist));
46
47                 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
48         }
49         catch(exception& e) {
50                 errorOut(e, "HCluster", "clusterBins");
51                 exit(1);
52         }
53
54
55 }
56
57 /***********************************************************************/
58
59 void HCluster::clusterNames(){
60         try {
61                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
62                 if (mapWanted) {  updateMap();  }
63                 
64                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
65                 list->set(clusterArray[smallRow].smallChild, "");       
66                 list->setLabel(toString(smallDist));
67         
68                 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
69
70     }
71         catch(exception& e) {
72                 errorOut(e, "HCluster", "clusterNames");
73                 exit(1);
74         }
75
76 }
77 /***********************************************************************/
78 int HCluster::getUpmostParent(int node){
79         try {
80                 while (clusterArray[node].parent != -1) {
81                         node = clusterArray[node].parent;
82                 }
83                 
84                 return node;
85         }
86         catch(exception& e) {
87                 errorOut(e, "HCluster", "getUpmostParent");
88                 exit(1);
89         }
90 }
91 /***********************************************************************/
92 void HCluster::printInfo(){
93         try {
94                 
95                 cout << "link table" << endl;
96                 for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
97                         cout << itActive->first << " = " << itActive->second << endl;
98                 }
99                 cout << endl;
100                 for (int i = 0; i < linkTable.size(); i++) {
101                         cout << i << '\t';
102                         for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
103                                 cout << it->first << '-' << it->second << '\t' ;
104                         }
105                         cout << endl;
106                 }
107                 cout << endl << "clusterArray" << endl;
108                 
109                 for (int i = 0; i < clusterArray.size(); i++) {
110                         cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
111                 }
112                 cout << endl;
113                 
114                 
115         }
116         catch(exception& e) {
117                 errorOut(e, "HCluster", "getUpmostParent");
118                 exit(1);
119         }
120 }
121 /***********************************************************************/
122 int HCluster::makeActive() {
123         try {
124                 int linkValue = 1; 
125
126                 itActive = activeLinks.find(smallRow);
127                 it2Active = activeLinks.find(smallCol);
128                 
129                 if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
130                         int size = linkTable.size();
131                         map<int, int> temp; map<int, int> temp2;
132                         
133                         //add link to eachother
134                         temp[smallRow] = 1;                                                     //         1    2                                                               
135                         temp2[smallCol] = 1;                                            // 1   0        1 
136                                                                                                                 // 2   1        0                               
137                         linkTable.push_back(temp);
138                         linkTable.push_back(temp2);
139                         
140                         //add to activeLinks
141                         activeLinks[smallRow] = size;
142                         activeLinks[smallCol] = size+1;
143
144                 }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) {  //smallRow is active, smallCol is not
145                          int size = linkTable.size();
146                          int alreadyActiveRow = itActive->second;
147                          map<int, int> temp; 
148                         
149                         //add link to eachother
150                         temp[smallRow] = 1;                                                                     //         6    2       3       5
151                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
152                         linkTable[alreadyActiveRow][smallCol] = 1;                      // 2   1        0       1       1
153                                                                                                                                 // 3   2        1       0       0
154                                                                                                                                 // 5   0    1   0   0   
155                         //add to activeLinks
156                         activeLinks[smallCol] = size;
157         
158                 }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) {  //smallCol is active, smallRow is not
159                          int size = linkTable.size();
160                          int alreadyActiveCol = it2Active->second;
161                          map<int, int> temp; 
162                         
163                         //add link to eachother
164                         temp[smallCol] = 1;                                                                     //         6    2       3       5
165                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
166                         linkTable[alreadyActiveCol][smallRow] = 1;                      // 2   1        0       1       1
167                                                                                                                                 // 3   2        1       0       0
168                                                                                                                                 // 5   0    1   0   0   
169                         //add to activeLinks
170                         activeLinks[smallRow] = size;
171
172                 }else { //both are active so add one
173                         int row = itActive->second;
174                         int col = it2Active->second;
175                         
176                         
177                         linkTable[row][smallCol]++;
178                         linkTable[col][smallRow]++;
179                         linkValue = linkTable[row][smallCol];
180                 }
181                 
182                 return linkValue;
183         }
184         catch(exception& e) {
185                 errorOut(e, "HCluster", "makeActive");
186                 exit(1);
187         }
188 }
189 /***********************************************************************/
190 void HCluster::updateArrayandLinkTable() {
191         try {
192                 //if cluster was made update clusterArray and linkTable
193                 int size = clusterArray.size();
194                 
195                 //add new node
196                 clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
197                 clusterArray.push_back(temp);
198                 
199                 //update child nodes
200                 clusterArray[smallRow].parent = size;
201                 clusterArray[smallCol].parent = size;
202                 
203                 if (method == "furthest") {
204                         
205                         //update linkTable by merging clustered rows and columns
206                         int rowSpot = activeLinks[smallRow];
207                         int colSpot = activeLinks[smallCol];
208                         
209                         //fix old rows
210                         for (int i = 0; i < linkTable.size(); i++) {
211                                 //check if they are in map
212                                 it = linkTable[i].find(smallRow);
213                                 it2 = linkTable[i].find(smallCol);
214                                 
215                                 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
216                                         linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
217                                         linkTable[i].erase(smallCol); //delete col row
218                                         linkTable[i].erase(smallRow); //delete col row
219                                 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
220                                         linkTable[i][size] = linkTable[i][smallCol];
221                                         linkTable[i].erase(smallCol); //delete col 
222                                 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
223                                         linkTable[i][size] = linkTable[i][smallRow];
224                                         linkTable[i].erase(smallRow); //delete col 
225                                 }
226                         }
227                         
228                         //merge their values
229                         for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
230                                 it2 = linkTable[colSpot].find(it->first);  //does the col also have this
231                                 
232                                 if (it2 == linkTable[colSpot].end()) { //not there so add it
233                                         linkTable[colSpot][it->first] = it->second;
234                                 }else { //merge them
235                                         linkTable[colSpot][it->first] = it->second + it2->second;
236                                 }
237                         }
238                         
239                         linkTable[colSpot].erase(size);
240                         linkTable.erase(linkTable.begin()+rowSpot);  //delete row
241                         
242                         //update activerows
243                         activeLinks.erase(smallRow);
244                         activeLinks.erase(smallCol);
245                         activeLinks[size] = colSpot;
246                         
247                         //adjust everybody elses spot since you deleted - time vs. space
248                         for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
249                                 if (itActive->second > rowSpot) {  activeLinks[itActive->first]--;      }
250                         }
251                 }
252         }
253         catch(exception& e) {
254                 errorOut(e, "HCluster", "updateArrayandLinkTable");
255                 exit(1);
256         }
257 }
258 /***********************************************************************/
259 bool HCluster::update(int row, int col, float distance){
260         try {
261                 bool cluster = false;
262                 smallRow = row;
263                 smallCol = col;
264                 smallDist = distance;
265                 
266                 //find upmost parent of row and col
267                 smallRow = getUpmostParent(smallRow);
268                 smallCol = getUpmostParent(smallCol);
269
270                 //you don't want to cluster with yourself
271                 if (smallRow != smallCol) {
272                         
273                         if (method != "average") {
274                                 //can we cluster???
275                                 if (method == "nearest") { cluster = true;  }
276                                 else{ //assume furthest
277                                         //are they active in the link table
278                                         int linkValue = makeActive(); //after this point this nodes info is active in linkTable
279                                         if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {             cluster = true;         }
280                                 }
281                                 
282                                 if (cluster) { 
283                                         updateArrayandLinkTable();
284                                         clusterBins();
285                                         clusterNames();
286                                 }
287                         }else {
288                                 cluster = true;
289                                 updateArrayandLinkTable();
290                                 clusterBins();
291                                 clusterNames();
292                                 combineFile();
293                         }
294                 }
295                 
296                 return cluster;
297                 //printInfo();
298         }
299         catch(exception& e) {
300                 errorOut(e, "HCluster", "update");
301                 exit(1);
302         }
303 }
304 /***********************************************************************/
305 void HCluster::setMapWanted(bool m)  {  
306         try {
307                 mapWanted = m;
308                 
309                 //initialize map
310                 for (int i = 0; i < list->getNumBins(); i++) {
311                         
312                         //parse bin 
313                         string names = list->get(i);
314                         while (names.find_first_of(',') != -1) { 
315                                 //get name from bin
316                                 string name = names.substr(0,names.find_first_of(','));
317                                 //save name and bin number
318                                 seq2Bin[name] = i;
319                                 names = names.substr(names.find_first_of(',')+1, names.length());
320                         }
321                         
322                         //get last name
323                         seq2Bin[names] = i;
324                 }
325                 
326         }
327         catch(exception& e) {
328                 errorOut(e, "HCluster", "setMapWanted");
329                 exit(1);
330         }
331 }
332 /***********************************************************************/
333 void HCluster::updateMap() {
334 try {
335                 //update location of seqs in smallRow since they move to smallCol now
336                 string names = list->get(clusterArray[smallRow].smallChild);
337                 while (names.find_first_of(',') != -1) { 
338                         //get name from bin
339                         string name = names.substr(0,names.find_first_of(','));
340                         //save name and bin number
341                         seq2Bin[name] = clusterArray[smallCol].smallChild;
342                         names = names.substr(names.find_first_of(',')+1, names.length());
343                 }
344                         
345                 //get last name
346                 seq2Bin[names] = clusterArray[smallCol].smallChild;
347         }
348         catch(exception& e) {
349                 errorOut(e, "HCluster", "updateMap");
350                 exit(1);
351         }
352 }
353 //**********************************************************************************************************************
354 vector<seqDist> HCluster::getSeqs(){
355         try {
356                 vector<seqDist> sameSeqs;
357                 
358                 if(method != "average") {
359                         sameSeqs = getSeqsFNNN();
360                 }else{
361                         if (firstRead) {        processFile();   }
362                         sameSeqs = getSeqsAN(); 
363                 }
364                                 
365                 return sameSeqs;
366         }
367         catch(exception& e) {
368                 errorOut(e, "HCluster", "getSeqs");
369                 exit(1);
370         }
371 }
372 //**********************************************************************************************************************
373 vector<seqDist> HCluster::getSeqsFNNN(){
374         try {
375                 string firstName, secondName;
376                 float distance, prevDistance;
377                 vector<seqDist> sameSeqs;
378                 prevDistance = -1;
379         
380                 //if you are not at the beginning of the file
381                 if (exitedBreak) { 
382                         sameSeqs.push_back(next);
383                         prevDistance = next.dist;
384                         exitedBreak = false;
385                 }
386         
387                 //get entry
388                 while (!filehandle.eof()) {
389                         
390                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
391         
392                         //save first one
393                         if (prevDistance == -1) { prevDistance = distance; }
394                         
395                         map<string,int>::iterator itA = nameMap->find(firstName);
396                         map<string,int>::iterator itB = nameMap->find(secondName);
397                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
398                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
399                 
400                         //using cutoff
401                         if (distance > cutoff) { break; }
402                 
403                         if (distance != -1) { //-1 means skip me
404                         
405                                 //are the distances the same
406                                 if (distance == prevDistance) { //save in vector
407                                         seqDist temp(itA->second, itB->second, distance);
408                                         sameSeqs.push_back(temp);
409                                         exitedBreak = false;
410                                 }else{ 
411                                         next.seq1 = itA->second;
412                                         next.seq2 = itB->second;
413                                         next.dist = distance;
414                                         exitedBreak = true;
415                                         break;
416                                 }
417                         }
418                 }
419                 
420                 //rndomize matching dists
421                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
422                 
423                 return sameSeqs;
424         }
425         catch(exception& e) {
426                 errorOut(e, "HCluster", "getSeqsFNNN");
427                 exit(1);
428         }
429 }
430 //**********************************************************************************************************************
431 //don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
432 vector<seqDist> HCluster::getSeqsAN(){
433         try {
434                 int firstName, secondName;
435                 float prevDistance;
436                 vector<seqDist> sameSeqs;
437                 prevDistance = -1;
438                 
439                 openInputFile(distfile, filehandle, "no error"); 
440                 
441                 //is the smallest value in mergedMin or the distfile?
442                 float mergedMinDist = 10000;
443                 float distance = 10000;
444                 if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist;  }
445                         
446                 if (!filehandle.eof()) {  
447                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
448                         //save first one
449                         if (prevDistance == -1) { prevDistance = distance; } 
450                         if (distance != -1) { //-1 means skip me
451                                 seqDist temp(firstName, secondName, distance);
452                                 sameSeqs.push_back(temp);
453                         }
454                 }
455                 
456                 if (mergedMinDist < distance) { //get minimum distance from mergedMin
457                         //remove distance we saved from file
458                         sameSeqs.clear();
459                         prevDistance = mergedMinDist;
460                         
461                         for (int i = 0; i < mergedMin.size(); i++) {
462                                 if (mergedMin[i].dist == prevDistance) {
463                                         sameSeqs.push_back(mergedMin[i]);
464                                 }else { break; }
465                         }
466                 }else{ //get minimum from file
467                         //get entry
468                         while (!filehandle.eof()) {
469                                 
470                                 filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
471                                 
472                                 if (prevDistance == -1) { prevDistance = distance; }
473                                 
474                                 if (distance != -1) { //-1 means skip me
475                                         //are the distances the same
476                                         if (distance == prevDistance) { //save in vector
477                                                 seqDist temp(firstName, secondName, distance);
478                                                 sameSeqs.push_back(temp);
479                                         }else{  
480                                                 break;
481                                         }
482                                 }
483                         }
484                 }
485                 filehandle.close();
486                 
487                 //randomize matching dists
488                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
489                 
490                 //can only return one value since once these are merged the other distances in sameSeqs may have changed
491                 vector<seqDist> temp;
492                 if (sameSeqs.size() > 0) {  temp.push_back(sameSeqs[0]);  }
493                 
494                 return temp;
495         }
496         catch(exception& e) {
497                 errorOut(e, "HCluster", "getSeqsAN");
498                 exit(1);
499         }
500 }
501
502 /***********************************************************************/
503 void HCluster::combineFile() {
504         try {
505                 int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
506                 char* inputBuffer;
507                 inputBuffer = new char[bufferSize];
508                 size_t numRead;
509                 
510                 string tempDistFile = distfile + ".temp";
511                 ofstream out;
512                 openOutputFile(tempDistFile, out);
513                 
514                 FILE* in;
515                 in = fopen(distfile.c_str(), "rb");
516         
517                 int first, second;
518                 float dist;
519                 
520                 vector< map<int, float> > smallRowColValues;
521                 smallRowColValues.resize(2);  //0 = row, 1 = col
522                 int count = 0;
523                                 
524                 //go through file pulling out distances related to rows merging
525                 //if mergedMin contains distances add those back into file
526                 bool done = false;
527                 partialDist = "";
528                 while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
529 //cout << "number of char read = " << numRead << endl;
530 //cout << inputBuffer << endl;
531                         if (numRead < bufferSize) { done = true; }
532                         
533                         //parse input into individual distances
534                         int spot = 0;
535                         string outputString = "";
536                         while(spot < numRead) {
537         //cout << "spot = " << spot << endl;
538                            seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
539                            
540                            //you read a partial distance
541                            if (nextDist.seq1 == -1) { break;  }
542                            
543                            first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
544         //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;                   
545                            //since file is sorted and mergedMin is sorted 
546                            //you can put the smallest distance from each through the code below and keep the file sorted
547                            
548                            //while there are still values in mergedMin that are smaller than the distance read from file
549                            while (count < mergedMin.size())  {
550                            
551                                         //is the distance in mergedMin smaller than from the file
552                                         if (mergedMin[count].dist < dist) {
553                                         //is this a distance related to the columns merging?
554                                         //if yes, save in memory
555                                                 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
556                                                 }else if (mergedMin[count].seq1 == smallCol) {
557                                                         smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
558                                                 }else if (mergedMin[count].seq2 == smallCol) {
559                                                         smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
560                                                 }else if (mergedMin[count].seq1 == smallRow) {
561                                                         smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
562                                                 }else if (mergedMin[count].seq2 == smallRow) {
563                                                         smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
564                                                 }else { //if no, write to temp file
565                                                         outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
566                                                 }
567                                                 count++;
568                                         }else{   break; }
569                            }
570                            
571                            //is this a distance related to the columns merging?
572                            //if yes, save in memory
573                            if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
574                            }else if (first == smallCol) {
575                                         smallRowColValues[1][second] = dist;
576                            }else if (second == smallCol) {
577                                         smallRowColValues[1][first] = dist;
578                            }else if (first == smallRow) {
579                                         smallRowColValues[0][second] = dist;
580                            }else if (second == smallRow) {
581                                         smallRowColValues[0][first] = dist;
582                            
583                            }else { //if no, write to temp file
584                                         outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
585                            }
586                         }
587                         
588                         out << outputString;
589                         if(done) { break; }
590                 }
591                 fclose(in);
592                 
593                 //if values in mergedMin are larger than the the largest in file then
594                 while (count < mergedMin.size())  {  
595                         //is this a distance related to the columns merging?
596                         //if yes, save in memory
597                         if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
598                         }else if (mergedMin[count].seq1 == smallCol) {
599                                 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
600                         }else if (mergedMin[count].seq2 == smallCol) {
601                                 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
602                         }else if (mergedMin[count].seq1 == smallRow) {
603                                 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
604                         }else if (mergedMin[count].seq2 == smallRow) {
605                                 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
606                                 
607                         }else { //if no, write to temp file
608                                 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
609                         }
610                         count++;
611                 }
612                 out.close();
613                 mergedMin.clear();
614                         
615                 //rename tempfile to distfile
616                 remove(distfile.c_str());
617                 rename(tempDistFile.c_str(), distfile.c_str());
618                 
619                 //merge clustered rows averaging the distances
620                 map<int, float>::iterator itMerge;
621                 map<int, float>::iterator it2Merge;
622                 for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {                 
623                         //does smallRowColValues[1] have a distance to this seq too?
624                         it2Merge = smallRowColValues[1].find(itMerge->first);
625                         
626                         float average;
627                         if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
628                                 //weighted average
629                                 int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
630                                 average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
631                                 smallRowColValues[1].erase(it2Merge);
632                                 
633                                 seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
634                                 mergedMin.push_back(temp);
635                         }
636                 }
637
638                 //sort merged values
639                 sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);      
640         }
641         catch(exception& e) {
642                 errorOut(e, "HCluster", "combineFile");
643                 exit(1);
644         }
645 }
646 /***********************************************************************/
647 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
648         try {
649                 seqDist next;
650                 int indexBefore = index;
651                 string first, second, distance;
652                 first = ""; second = ""; distance = "";
653                 int tabCount = 0;
654 //cout << "partial = " << partialDist << endl;          
655                 if (partialDist != "") { //read what you can, you know it is less than a whole distance.
656                         for (int i = 0; i < partialDist.size(); i++) {
657                                 if (tabCount == 0) {
658                                         if (partialDist[i] == '\t') { tabCount++; }
659                                         else {  first += partialDist[i];        }
660                                 }else if (tabCount == 1) {
661                                         if (partialDist[i] == '\t') { tabCount++; }
662                                         else {  second += partialDist[i];       }
663                                 }else if (tabCount == 2) {
664                                         distance +=  partialDist[i];
665                                 }
666                         }
667                         partialDist = "";
668                 }
669         
670                 //try to get another distance
671                 bool gotDist = false;
672                 while (index < size) {
673                         if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
674                                 gotDist = true;
675                                 
676                                 //gobble space
677                                 while (index < size) {          
678                                         if (isspace(buffer[index])) { index++; }
679                                         else { break; }         
680                                 }
681                                 break;
682                         }else{
683                                 if (tabCount == 0) {
684                                         if (buffer[index] == '\t') { tabCount++; }
685                                         else {  first += buffer[index]; }
686                                 }else if (tabCount == 1) {
687                                         if (buffer[index] == '\t') { tabCount++; }
688                                         else {  second += buffer[index];        }
689                                 }else if (tabCount == 2) {
690                                         distance +=  buffer[index];
691                                 }
692                                 index++;
693                         }
694                 }
695                 
696                 //there was not a whole distance in the buffer, ie. buffer = "1 2       0.01    2       3       0."
697                 //then you want to save the partial distance.
698                 if (!gotDist) {
699                         for (int i = indexBefore; i < size; i++) {
700                                 partialDist += buffer[i];
701                         }
702                         index = size + 1;
703                         next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
704                 }else{
705                         int firstname, secondname;
706                         float dist;
707                         
708                         convert(first, firstname);
709                         convert(second, secondname);
710                         convert(distance, dist);
711                         
712                         next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
713                 }
714                                                 
715                 return next;
716         }
717         catch(exception& e) {
718                 errorOut(e, "HCluster", "getNextDist");
719                 exit(1);
720         }
721 }
722 /***********************************************************************/
723 void HCluster::processFile() {
724         try {
725                 string firstName, secondName;
726                 float distance;
727                 
728                 ifstream in;
729                 openInputFile(distfile, in);
730                 
731                 ofstream out;
732                 string outTemp = distfile + ".temp";
733                 openOutputFile(outTemp, out);
734         
735                 //get entry
736                 while (!in.eof()) {
737                         
738                         in >> firstName >> secondName >> distance;    gobble(in);               
739                         
740                         map<string,int>::iterator itA = nameMap->find(firstName);
741                         map<string,int>::iterator itB = nameMap->find(secondName);
742                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
743                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
744                 
745                         //using cutoff
746                         if (distance > cutoff) { break; }
747                 
748                         if (distance != -1) { //-1 means skip me
749                                 out << itA->second << '\t' << itB->second << '\t' << distance << endl;
750                         }
751                 }
752                 
753                 in.close();
754                 out.close();
755                 
756                 remove(distfile.c_str());
757                 rename(outTemp.c_str(), distfile.c_str());
758                 
759                 firstRead = false;
760         }
761         catch(exception& e) {
762                 errorOut(e, "HCluster", "processFile");
763                 exit(1);
764         }
765 }
766 /***********************************************************************/
767
768
769
770
771
772
773
774