]> git.donarmstrong.com Git - mothur.git/blob - hcluster.cpp
added warning about average neighbor merges around cutoff. fixed little bugs.
[mothur.git] / hcluster.cpp
1 /*
2  *  hcluster.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/13/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "hcluster.h"
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13 #include "sparsematrix.hpp"
14
15 /***********************************************************************/
16 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(m), distfile(d), nameMap(n), cutoff(c) {
17         try {
18                 mapWanted = false;
19                 exitedBreak = false; 
20                 numSeqs = list->getNumSeqs();
21                 
22                 //initialize cluster array
23                 for (int i = 0; i < numSeqs; i++) {
24                         clusterNode temp(1, -1, i);
25                         clusterArray.push_back(temp);
26                 }
27                 
28                 if (method != "average") {
29                         openInputFile(distfile, filehandle);
30                 }else{  
31                         processFile();  
32                 }
33         }
34         catch(exception& e) {
35                 errorOut(e, "HCluster", "HCluster");
36                 exit(1);
37         }
38 }
39 /***********************************************************************/
40
41 void HCluster::clusterBins(){
42         try {
43                 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
44
45                 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));  
46                 rabund->set(clusterArray[smallRow].smallChild, 0);      
47                 rabund->setLabel(toString(smallDist));
48
49                 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
50         }
51         catch(exception& e) {
52                 errorOut(e, "HCluster", "clusterBins");
53                 exit(1);
54         }
55
56
57 }
58
59 /***********************************************************************/
60
61 void HCluster::clusterNames(){
62         try {
63                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
64                 if (mapWanted) {  updateMap();  }
65                 
66                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
67                 list->set(clusterArray[smallRow].smallChild, "");       
68                 list->setLabel(toString(smallDist));
69         
70                 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
71
72     }
73         catch(exception& e) {
74                 errorOut(e, "HCluster", "clusterNames");
75                 exit(1);
76         }
77
78 }
79 /***********************************************************************/
80 int HCluster::getUpmostParent(int node){
81         try {
82                 while (clusterArray[node].parent != -1) {
83                         node = clusterArray[node].parent;
84                 }
85                 
86                 return node;
87         }
88         catch(exception& e) {
89                 errorOut(e, "HCluster", "getUpmostParent");
90                 exit(1);
91         }
92 }
93 /***********************************************************************/
94 void HCluster::printInfo(){
95         try {
96                 
97                 cout << "link table" << endl;
98                 for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
99                         cout << itActive->first << " = " << itActive->second << endl;
100                 }
101                 cout << endl;
102                 for (int i = 0; i < linkTable.size(); i++) {
103                         cout << i << '\t';
104                         for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
105                                 cout << it->first << '-' << it->second << '\t' ;
106                         }
107                         cout << endl;
108                 }
109                 cout << endl << "clusterArray" << endl;
110                 
111                 for (int i = 0; i < clusterArray.size(); i++) {
112                         cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
113                 }
114                 cout << endl;
115                 
116                 
117         }
118         catch(exception& e) {
119                 errorOut(e, "HCluster", "getUpmostParent");
120                 exit(1);
121         }
122 }
123 /***********************************************************************/
124 int HCluster::makeActive() {
125         try {
126                 int linkValue = 1; 
127
128                 itActive = activeLinks.find(smallRow);
129                 it2Active = activeLinks.find(smallCol);
130                 
131                 if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
132                         int size = linkTable.size();
133                         map<int, int> temp; map<int, int> temp2;
134                         
135                         //add link to eachother
136                         temp[smallRow] = 1;                                                     //         1    2                                                               
137                         temp2[smallCol] = 1;                                            // 1   0        1 
138                                                                                                                 // 2   1        0                               
139                         linkTable.push_back(temp);
140                         linkTable.push_back(temp2);
141                         
142                         //add to activeLinks
143                         activeLinks[smallRow] = size;
144                         activeLinks[smallCol] = size+1;
145
146                 }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) {  //smallRow is active, smallCol is not
147                          int size = linkTable.size();
148                          int alreadyActiveRow = itActive->second;
149                          map<int, int> temp; 
150                         
151                         //add link to eachother
152                         temp[smallRow] = 1;                                                                     //         6    2       3       5
153                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
154                         linkTable[alreadyActiveRow][smallCol] = 1;                      // 2   1        0       1       1
155                                                                                                                                 // 3   2        1       0       0
156                                                                                                                                 // 5   0    1   0   0   
157                         //add to activeLinks
158                         activeLinks[smallCol] = size;
159         
160                 }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) {  //smallCol is active, smallRow is not
161                          int size = linkTable.size();
162                          int alreadyActiveCol = it2Active->second;
163                          map<int, int> temp; 
164                         
165                         //add link to eachother
166                         temp[smallCol] = 1;                                                                     //         6    2       3       5
167                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
168                         linkTable[alreadyActiveCol][smallRow] = 1;                      // 2   1        0       1       1
169                                                                                                                                 // 3   2        1       0       0
170                                                                                                                                 // 5   0    1   0   0   
171                         //add to activeLinks
172                         activeLinks[smallRow] = size;
173
174                 }else { //both are active so add one
175                         int row = itActive->second;
176                         int col = it2Active->second;
177                         
178                         
179                         linkTable[row][smallCol]++;
180                         linkTable[col][smallRow]++;
181                         linkValue = linkTable[row][smallCol];
182                 }
183                 
184                 return linkValue;
185         }
186         catch(exception& e) {
187                 errorOut(e, "HCluster", "makeActive");
188                 exit(1);
189         }
190 }
191 /***********************************************************************/
192 void HCluster::updateArrayandLinkTable() {
193         try {
194                 //if cluster was made update clusterArray and linkTable
195                 int size = clusterArray.size();
196                 
197                 //add new node
198                 clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
199                 clusterArray.push_back(temp);
200                 
201                 //update child nodes
202                 clusterArray[smallRow].parent = size;
203                 clusterArray[smallCol].parent = size;
204                 
205                 if (method == "furthest") {
206                         
207                         //update linkTable by merging clustered rows and columns
208                         int rowSpot = activeLinks[smallRow];
209                         int colSpot = activeLinks[smallCol];
210                         
211                         //fix old rows
212                         for (int i = 0; i < linkTable.size(); i++) {
213                                 //check if they are in map
214                                 it = linkTable[i].find(smallRow);
215                                 it2 = linkTable[i].find(smallCol);
216                                 
217                                 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
218                                         linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
219                                         linkTable[i].erase(smallCol); //delete col row
220                                         linkTable[i].erase(smallRow); //delete col row
221                                 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
222                                         linkTable[i][size] = linkTable[i][smallCol];
223                                         linkTable[i].erase(smallCol); //delete col 
224                                 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
225                                         linkTable[i][size] = linkTable[i][smallRow];
226                                         linkTable[i].erase(smallRow); //delete col 
227                                 }
228                         }
229                         
230                         //merge their values
231                         for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
232                                 it2 = linkTable[colSpot].find(it->first);  //does the col also have this
233                                 
234                                 if (it2 == linkTable[colSpot].end()) { //not there so add it
235                                         linkTable[colSpot][it->first] = it->second;
236                                 }else { //merge them
237                                         linkTable[colSpot][it->first] = it->second + it2->second;
238                                 }
239                         }
240                         
241                         linkTable[colSpot].erase(size);
242                         linkTable.erase(linkTable.begin()+rowSpot);  //delete row
243                         
244                         //update activerows
245                         activeLinks.erase(smallRow);
246                         activeLinks.erase(smallCol);
247                         activeLinks[size] = colSpot;
248                         
249                         //adjust everybody elses spot since you deleted - time vs. space
250                         for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
251                                 if (itActive->second > rowSpot) {  activeLinks[itActive->first]--;      }
252                         }
253                 }
254         }
255         catch(exception& e) {
256                 errorOut(e, "HCluster", "updateArrayandLinkTable");
257                 exit(1);
258         }
259 }
260 /***********************************************************************/
261 bool HCluster::update(int row, int col, float distance){
262         try {
263                 bool cluster = false;
264                 smallRow = row;
265                 smallCol = col;
266                 smallDist = distance;
267                 
268                 //find upmost parent of row and col
269                 smallRow = getUpmostParent(smallRow);
270                 smallCol = getUpmostParent(smallCol);
271
272                 //you don't want to cluster with yourself
273                 if (smallRow != smallCol) {
274                         
275                         if (method != "average") {
276                                 //can we cluster???
277                                 if (method == "nearest") { cluster = true;  }
278                                 else{ //assume furthest
279                                         //are they active in the link table
280                                         int linkValue = makeActive(); //after this point this nodes info is active in linkTable
281                                         if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {             cluster = true;         }
282                                 }
283                                 
284                                 if (cluster) { 
285                                         updateArrayandLinkTable();
286                                         clusterBins();
287                                         clusterNames();
288                                 }
289                         }else {
290                                 cluster = true;
291                                 updateArrayandLinkTable();
292                                 clusterBins();
293                                 clusterNames();
294                                 combineFile();
295                         }
296                 }
297                 
298                 return cluster;
299                 //printInfo();
300         }
301         catch(exception& e) {
302                 errorOut(e, "HCluster", "update");
303                 exit(1);
304         }
305 }
306 /***********************************************************************/
307 void HCluster::setMapWanted(bool m)  {  
308         try {
309                 mapWanted = m;
310                 
311                 //initialize map
312                 for (int i = 0; i < list->getNumBins(); i++) {
313                         
314                         //parse bin 
315                         string names = list->get(i);
316                         while (names.find_first_of(',') != -1) { 
317                                 //get name from bin
318                                 string name = names.substr(0,names.find_first_of(','));
319                                 //save name and bin number
320                                 seq2Bin[name] = i;
321                                 names = names.substr(names.find_first_of(',')+1, names.length());
322                         }
323                         
324                         //get last name
325                         seq2Bin[names] = i;
326                 }
327                 
328         }
329         catch(exception& e) {
330                 errorOut(e, "HCluster", "setMapWanted");
331                 exit(1);
332         }
333 }
334 /***********************************************************************/
335 void HCluster::updateMap() {
336 try {
337                 //update location of seqs in smallRow since they move to smallCol now
338                 string names = list->get(clusterArray[smallRow].smallChild);
339                 while (names.find_first_of(',') != -1) { 
340                         //get name from bin
341                         string name = names.substr(0,names.find_first_of(','));
342                         //save name and bin number
343                         seq2Bin[name] = clusterArray[smallCol].smallChild;
344                         names = names.substr(names.find_first_of(',')+1, names.length());
345                 }
346                         
347                 //get last name
348                 seq2Bin[names] = clusterArray[smallCol].smallChild;
349         }
350         catch(exception& e) {
351                 errorOut(e, "HCluster", "updateMap");
352                 exit(1);
353         }
354 }
355 //**********************************************************************************************************************
356 vector<seqDist> HCluster::getSeqs(){
357         try {
358                 vector<seqDist> sameSeqs;
359                 
360                 if(method != "average") {
361                         sameSeqs = getSeqsFNNN();
362                 }else{
363                         sameSeqs = getSeqsAN(); 
364                 }
365                                 
366                 return sameSeqs;
367         }
368         catch(exception& e) {
369                 errorOut(e, "HCluster", "getSeqs");
370                 exit(1);
371         }
372 }
373 //**********************************************************************************************************************
374 vector<seqDist> HCluster::getSeqsFNNN(){
375         try {
376                 string firstName, secondName;
377                 float distance, prevDistance;
378                 vector<seqDist> sameSeqs;
379                 prevDistance = -1;
380         
381                 //if you are not at the beginning of the file
382                 if (exitedBreak) { 
383                         sameSeqs.push_back(next);
384                         prevDistance = next.dist;
385                         exitedBreak = false;
386                 }
387         
388                 //get entry
389                 while (!filehandle.eof()) {
390                         
391                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
392         
393                         //save first one
394                         if (prevDistance == -1) { prevDistance = distance; }
395                         
396                         map<string,int>::iterator itA = nameMap->find(firstName);
397                         map<string,int>::iterator itB = nameMap->find(secondName);
398                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
399                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
400                 
401                         //using cutoff
402                         if (distance > cutoff) { break; }
403                 
404                         if (distance != -1) { //-1 means skip me
405                         
406                                 //are the distances the same
407                                 if (distance == prevDistance) { //save in vector
408                                         seqDist temp(itA->second, itB->second, distance);
409                                         sameSeqs.push_back(temp);
410                                         exitedBreak = false;
411                                 }else{ 
412                                         next.seq1 = itA->second;
413                                         next.seq2 = itB->second;
414                                         next.dist = distance;
415                                         exitedBreak = true;
416                                         break;
417                                 }
418                         }
419                 }
420                 
421                 //rndomize matching dists
422                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
423                 
424                 return sameSeqs;
425         }
426         catch(exception& e) {
427                 errorOut(e, "HCluster", "getSeqsFNNN");
428                 exit(1);
429         }
430 }
431 //**********************************************************************************************************************
432 //don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
433 vector<seqDist> HCluster::getSeqsAN(){
434         try {
435                 int firstName, secondName;
436                 float prevDistance;
437                 vector<seqDist> sameSeqs;
438                 prevDistance = -1;
439                 
440                 openInputFile(distfile, filehandle, "no error"); 
441                 
442                 //is the smallest value in mergedMin or the distfile?
443                 float mergedMinDist = 10000;
444                 float distance = 10000;
445                 if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist;  }
446                         
447                 if (!filehandle.eof()) {  
448                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
449                         //save first one
450                         if (prevDistance == -1) { prevDistance = distance; } 
451                         if (distance != -1) { //-1 means skip me
452                                 seqDist temp(firstName, secondName, distance);
453                                 sameSeqs.push_back(temp);
454                         }
455                 }
456                 
457                 if (mergedMinDist < distance) { //get minimum distance from mergedMin
458                         //remove distance we saved from file
459                         sameSeqs.clear();
460                         prevDistance = mergedMinDist;
461                         
462                         for (int i = 0; i < mergedMin.size(); i++) {
463                                 if (mergedMin[i].dist == prevDistance) {
464                                         sameSeqs.push_back(mergedMin[i]);
465                                 }else { break; }
466                         }
467                 }else{ //get minimum from file
468                         //get entry
469                         while (!filehandle.eof()) {
470                                 
471                                 filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
472                                 
473                                 if (prevDistance == -1) { prevDistance = distance; }
474                                 
475                                 if (distance != -1) { //-1 means skip me
476                                         //are the distances the same
477                                         if (distance == prevDistance) { //save in vector
478                                                 seqDist temp(firstName, secondName, distance);
479                                                 sameSeqs.push_back(temp);
480                                         }else{  
481                                                 break;
482                                         }
483                                 }
484                         }
485                 }
486                 filehandle.close();
487                 
488                 //randomize matching dists
489                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
490                 
491                 //can only return one value since once these are merged the other distances in sameSeqs may have changed
492                 vector<seqDist> temp;
493                 if (sameSeqs.size() > 0) {  temp.push_back(sameSeqs[0]);  }
494                 
495                 return temp;
496         }
497         catch(exception& e) {
498                 errorOut(e, "HCluster", "getSeqsAN");
499                 exit(1);
500         }
501 }
502
503 /***********************************************************************/
504 void HCluster::combineFile() {
505         try {
506                 int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
507                 char* inputBuffer;
508                 inputBuffer = new char[bufferSize];
509                 size_t numRead;
510                 
511                 string tempDistFile = distfile + ".temp";
512                 ofstream out;
513                 openOutputFile(tempDistFile, out);
514                 
515                 FILE* in;
516                 in = fopen(distfile.c_str(), "rb");
517         
518                 int first, second;
519                 float dist;
520                 
521                 vector< map<int, float> > smallRowColValues;
522                 smallRowColValues.resize(2);  //0 = row, 1 = col
523                 int count = 0;
524                                 
525                 //go through file pulling out distances related to rows merging
526                 //if mergedMin contains distances add those back into file
527                 bool done = false;
528                 partialDist = "";
529                 while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
530 //cout << "number of char read = " << numRead << endl;
531 //cout << inputBuffer << endl;
532                         if (numRead < bufferSize) { done = true; }
533                         
534                         //parse input into individual distances
535                         int spot = 0;
536                         string outputString = "";
537                         while(spot < numRead) {
538         //cout << "spot = " << spot << endl;
539                            seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
540                            
541                            //you read a partial distance
542                            if (nextDist.seq1 == -1) { break;  }
543                            
544                            first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
545         //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;                   
546                            //since file is sorted and mergedMin is sorted 
547                            //you can put the smallest distance from each through the code below and keep the file sorted
548                            
549                            //while there are still values in mergedMin that are smaller than the distance read from file
550                            while (count < mergedMin.size())  {
551                            
552                                         //is the distance in mergedMin smaller than from the file
553                                         if (mergedMin[count].dist < dist) {
554                                         //is this a distance related to the columns merging?
555                                         //if yes, save in memory
556                                                 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
557                                                 }else if (mergedMin[count].seq1 == smallCol) {
558                                                         smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
559                                                 }else if (mergedMin[count].seq2 == smallCol) {
560                                                         smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
561                                                 }else if (mergedMin[count].seq1 == smallRow) {
562                                                         smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
563                                                 }else if (mergedMin[count].seq2 == smallRow) {
564                                                         smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
565                                                 }else { //if no, write to temp file
566                                                         outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
567                                                 }
568                                                 count++;
569                                         }else{   break; }
570                            }
571                            
572                            //is this a distance related to the columns merging?
573                            //if yes, save in memory
574                            if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
575                            }else if (first == smallCol) {
576                                         smallRowColValues[1][second] = dist;
577                            }else if (second == smallCol) {
578                                         smallRowColValues[1][first] = dist;
579                            }else if (first == smallRow) {
580                                         smallRowColValues[0][second] = dist;
581                            }else if (second == smallRow) {
582                                         smallRowColValues[0][first] = dist;
583                            
584                            }else { //if no, write to temp file
585                                         outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
586                            }
587                         }
588                         
589                         out << outputString;
590                         if(done) { break; }
591                 }
592                 fclose(in);
593                 
594                 //if values in mergedMin are larger than the the largest in file then
595                 while (count < mergedMin.size())  {  
596                         //is this a distance related to the columns merging?
597                         //if yes, save in memory
598                         if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
599                         }else if (mergedMin[count].seq1 == smallCol) {
600                                 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
601                         }else if (mergedMin[count].seq2 == smallCol) {
602                                 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
603                         }else if (mergedMin[count].seq1 == smallRow) {
604                                 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
605                         }else if (mergedMin[count].seq2 == smallRow) {
606                                 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
607                                 
608                         }else { //if no, write to temp file
609                                 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
610                         }
611                         count++;
612                 }
613                 out.close();
614                 mergedMin.clear();
615                         
616                 //rename tempfile to distfile
617                 remove(distfile.c_str());
618                 rename(tempDistFile.c_str(), distfile.c_str());
619                 
620                 //merge clustered rows averaging the distances
621                 map<int, float>::iterator itMerge;
622                 map<int, float>::iterator it2Merge;
623                 for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {                 
624                         //does smallRowColValues[1] have a distance to this seq too?
625                         it2Merge = smallRowColValues[1].find(itMerge->first);
626                         
627                         float average;
628                         if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
629                                 //weighted average
630                                 int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
631                                 average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
632                                 smallRowColValues[1].erase(it2Merge);
633                                 
634                                 seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
635                                 mergedMin.push_back(temp);
636                         }
637                 }
638
639                 //sort merged values
640                 sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);      
641         }
642         catch(exception& e) {
643                 errorOut(e, "HCluster", "combineFile");
644                 exit(1);
645         }
646 }
647 /***********************************************************************/
648 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
649         try {
650                 seqDist next;
651                 int indexBefore = index;
652                 string first, second, distance;
653                 first = ""; second = ""; distance = "";
654                 int tabCount = 0;
655 //cout << "partial = " << partialDist << endl;          
656                 if (partialDist != "") { //read what you can, you know it is less than a whole distance.
657                         for (int i = 0; i < partialDist.size(); i++) {
658                                 if (tabCount == 0) {
659                                         if (partialDist[i] == '\t') { tabCount++; }
660                                         else {  first += partialDist[i];        }
661                                 }else if (tabCount == 1) {
662                                         if (partialDist[i] == '\t') { tabCount++; }
663                                         else {  second += partialDist[i];       }
664                                 }else if (tabCount == 2) {
665                                         distance +=  partialDist[i];
666                                 }
667                         }
668                         partialDist = "";
669                 }
670         
671                 //try to get another distance
672                 bool gotDist = false;
673                 while (index < size) {
674                         if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
675                                 gotDist = true;
676                                 
677                                 //gobble space
678                                 while (index < size) {          
679                                         if (isspace(buffer[index])) { index++; }
680                                         else { break; }         
681                                 }
682                                 break;
683                         }else{
684                                 if (tabCount == 0) {
685                                         if (buffer[index] == '\t') { tabCount++; }
686                                         else {  first += buffer[index]; }
687                                 }else if (tabCount == 1) {
688                                         if (buffer[index] == '\t') { tabCount++; }
689                                         else {  second += buffer[index];        }
690                                 }else if (tabCount == 2) {
691                                         distance +=  buffer[index];
692                                 }
693                                 index++;
694                         }
695                 }
696                 
697                 //there was not a whole distance in the buffer, ie. buffer = "1 2       0.01    2       3       0."
698                 //then you want to save the partial distance.
699                 if (!gotDist) {
700                         for (int i = indexBefore; i < size; i++) {
701                                 partialDist += buffer[i];
702                         }
703                         index = size + 1;
704                         next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
705                 }else{
706                         int firstname, secondname;
707                         float dist;
708                         
709                         convert(first, firstname);
710                         convert(second, secondname);
711                         convert(distance, dist);
712                         
713                         next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
714                 }
715                                                 
716                 return next;
717         }
718         catch(exception& e) {
719                 errorOut(e, "HCluster", "getNextDist");
720                 exit(1);
721         }
722 }
723 /***********************************************************************/
724 void HCluster::processFile() {
725         try {
726                 string firstName, secondName;
727                 float distance;
728                 
729                 ifstream in;
730                 openInputFile(distfile, in);
731                 
732                 ofstream out;
733                 string outTemp = distfile + ".temp";
734                 openOutputFile(outTemp, out);
735         
736                 //get entry
737                 while (!in.eof()) {
738                         
739                         in >> firstName >> secondName >> distance;    gobble(in);               
740                         
741                         map<string,int>::iterator itA = nameMap->find(firstName);
742                         map<string,int>::iterator itB = nameMap->find(secondName);
743                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
744                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
745                 
746                         //using cutoff
747                         if (distance > cutoff) { break; }
748                 
749                         if (distance != -1) { //-1 means skip me
750                                 out << itA->second << '\t' << itB->second << '\t' << distance << endl;
751                         }
752                 }
753                 
754                 in.close();
755                 out.close();
756                 
757                 remove(distfile.c_str());
758                 rename(outTemp.c_str(), distfile.c_str());
759         }
760         catch(exception& e) {
761                 errorOut(e, "HCluster", "processFile");
762                 exit(1);
763         }
764 }
765 /***********************************************************************/
766
767
768
769
770
771
772
773