]> git.donarmstrong.com Git - mothur.git/blob - hcluster.cpp
working on pam
[mothur.git] / hcluster.cpp
1 /*
2  *  hcluster.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/13/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "hcluster.h"
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13
14 /***********************************************************************/
15 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) {
16         try {
17                 m = MothurOut::getInstance();
18                 mapWanted = false;
19                 exitedBreak = false; 
20                 numSeqs = list->getNumSeqs();
21                 
22                 //initialize cluster array
23                 for (int i = 0; i < numSeqs; i++) {
24                         clusterNode temp(1, -1, i);
25                         clusterArray.push_back(temp);
26                 }
27                 
28                 if ((method == "furthest") || (method == "nearest")) {
29                         m->openInputFile(distfile, filehandle);
30                 }else{  
31                         processFile();  
32                 }
33         }
34         catch(exception& e) {
35                 m->errorOut(e, "HCluster", "HCluster");
36                 exit(1);
37         }
38 }
39 /***********************************************************************/
40
41 void HCluster::clusterBins(){
42         try {
43                 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
44
45                 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));  
46                 rabund->set(clusterArray[smallRow].smallChild, 0);      
47                 rabund->setLabel(toString(smallDist));
48
49                 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
50         }
51         catch(exception& e) {
52                 m->errorOut(e, "HCluster", "clusterBins");
53                 exit(1);
54         }
55
56
57 }
58
59 /***********************************************************************/
60
61 void HCluster::clusterNames(){
62         try {
63                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
64                 if (mapWanted) {  updateMap();  }
65                 
66                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
67                 list->set(clusterArray[smallRow].smallChild, "");       
68                 list->setLabel(toString(smallDist));
69         
70                 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
71
72     }
73         catch(exception& e) {
74                 m->errorOut(e, "HCluster", "clusterNames");
75                 exit(1);
76         }
77
78 }
79 /***********************************************************************/
80 int HCluster::getUpmostParent(int node){
81         try {
82                 while (clusterArray[node].parent != -1) {
83                         node = clusterArray[node].parent;
84                 }
85                 
86                 return node;
87         }
88         catch(exception& e) {
89                 m->errorOut(e, "HCluster", "getUpmostParent");
90                 exit(1);
91         }
92 }
93 /***********************************************************************/
94 void HCluster::printInfo(){
95         try {
96                 
97                 cout << "link table" << endl;
98                 for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
99                         cout << itActive->first << " = " << itActive->second << endl;
100                 }
101                 cout << endl;
102                 for (int i = 0; i < linkTable.size(); i++) {
103                         cout << i << '\t';
104                         for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
105                                 cout << it->first << '-' << it->second << '\t' ;
106                         }
107                         cout << endl;
108                 }
109                 cout << endl << "clusterArray" << endl;
110                 
111                 for (int i = 0; i < clusterArray.size(); i++) {
112                         cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
113                 }
114                 cout << endl;
115                 
116                 
117         }
118         catch(exception& e) {
119                 m->errorOut(e, "HCluster", "getUpmostParent");
120                 exit(1);
121         }
122 }
123 /***********************************************************************/
124 int HCluster::makeActive() {
125         try {
126                 int linkValue = 1; 
127
128                 itActive = activeLinks.find(smallRow);
129                 it2Active = activeLinks.find(smallCol);
130                 
131                 if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
132                         int size = linkTable.size();
133                         map<int, int> temp; map<int, int> temp2;
134                         
135                         //add link to eachother
136                         temp[smallRow] = 1;                                                     //         1    2                                                               
137                         temp2[smallCol] = 1;                                            // 1   0        1 
138                                                                                                                 // 2   1        0                               
139                         linkTable.push_back(temp);
140                         linkTable.push_back(temp2);
141                         
142                         //add to activeLinks
143                         activeLinks[smallRow] = size;
144                         activeLinks[smallCol] = size+1;
145
146                 }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) {  //smallRow is active, smallCol is not
147                          int size = linkTable.size();
148                          int alreadyActiveRow = itActive->second;
149                          map<int, int> temp; 
150                         
151                         //add link to eachother
152                         temp[smallRow] = 1;                                                                     //         6    2       3       5
153                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
154                         linkTable[alreadyActiveRow][smallCol] = 1;                      // 2   1        0       1       1
155                                                                                                                                 // 3   2        1       0       0
156                                                                                                                                 // 5   0    1   0   0   
157                         //add to activeLinks
158                         activeLinks[smallCol] = size;
159         
160                 }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) {  //smallCol is active, smallRow is not
161                          int size = linkTable.size();
162                          int alreadyActiveCol = it2Active->second;
163                          map<int, int> temp; 
164                         
165                         //add link to eachother
166                         temp[smallCol] = 1;                                                                     //         6    2       3       5
167                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
168                         linkTable[alreadyActiveCol][smallRow] = 1;                      // 2   1        0       1       1
169                                                                                                                                 // 3   2        1       0       0
170                                                                                                                                 // 5   0    1   0   0   
171                         //add to activeLinks
172                         activeLinks[smallRow] = size;
173
174                 }else { //both are active so add one
175                         int row = itActive->second;
176                         int col = it2Active->second;
177                         
178                         
179                         linkTable[row][smallCol]++;
180                         linkTable[col][smallRow]++;
181                         linkValue = linkTable[row][smallCol];
182                 }
183                 
184                 return linkValue;
185         }
186         catch(exception& e) {
187                 m->errorOut(e, "HCluster", "makeActive");
188                 exit(1);
189         }
190 }
191 /***********************************************************************/
192 void HCluster::updateArrayandLinkTable() {
193         try {
194                 //if cluster was made update clusterArray and linkTable
195                 int size = clusterArray.size();
196                 
197                 //add new node
198                 clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
199                 clusterArray.push_back(temp);
200                 
201                 //update child nodes
202                 clusterArray[smallRow].parent = size;
203                 clusterArray[smallCol].parent = size;
204                 
205                 if (method == "furthest") {
206                         
207                         //update linkTable by merging clustered rows and columns
208                         int rowSpot = activeLinks[smallRow];
209                         int colSpot = activeLinks[smallCol];
210                         
211                         //fix old rows
212                         for (int i = 0; i < linkTable.size(); i++) {
213                                 //check if they are in map
214                                 it = linkTable[i].find(smallRow);
215                                 it2 = linkTable[i].find(smallCol);
216                                 
217                                 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
218                                         linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
219                                         linkTable[i].erase(smallCol); //delete col row
220                                         linkTable[i].erase(smallRow); //delete col row
221                                 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
222                                         linkTable[i][size] = linkTable[i][smallCol];
223                                         linkTable[i].erase(smallCol); //delete col 
224                                 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
225                                         linkTable[i][size] = linkTable[i][smallRow];
226                                         linkTable[i].erase(smallRow); //delete col 
227                                 }
228                         }
229                         
230                         //merge their values
231                         for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
232                                 it2 = linkTable[colSpot].find(it->first);  //does the col also have this
233                                 
234                                 if (it2 == linkTable[colSpot].end()) { //not there so add it
235                                         linkTable[colSpot][it->first] = it->second;
236                                 }else { //merge them
237                                         linkTable[colSpot][it->first] = it->second + it2->second;
238                                 }
239                         }
240                         
241                         linkTable[colSpot].erase(size);
242                         linkTable.erase(linkTable.begin()+rowSpot);  //delete row
243                         
244                         //update activerows
245                         activeLinks.erase(smallRow);
246                         activeLinks.erase(smallCol);
247                         activeLinks[size] = colSpot;
248                         
249                         //adjust everybody elses spot since you deleted - time vs. space
250                         for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
251                                 if (itActive->second > rowSpot) {  activeLinks[itActive->first]--;      }
252                         }
253                 }
254         }
255         catch(exception& e) {
256                 m->errorOut(e, "HCluster", "updateArrayandLinkTable");
257                 exit(1);
258         }
259 }
260 /***********************************************************************/
261 double HCluster::update(int row, int col, float distance){
262         try {
263                 bool cluster = false;
264                 smallRow = row;
265                 smallCol = col;
266                 smallDist = distance;
267                 
268                 //find upmost parent of row and col
269                 smallRow = getUpmostParent(smallRow);
270                 smallCol = getUpmostParent(smallCol);
271
272                 //you don't want to cluster with yourself
273                 if (smallRow != smallCol) {
274                         
275                         if ((method == "furthest") || (method == "nearest")) {
276                                 //can we cluster???
277                                 if (method == "nearest") { cluster = true;  }
278                                 else{ //assume furthest
279                                         //are they active in the link table
280                                         int linkValue = makeActive(); //after this point this nodes info is active in linkTable
281                                         if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {             cluster = true;         }
282                                 }
283                                 
284                                 if (cluster) { 
285                                         updateArrayandLinkTable();
286                                         clusterBins();
287                                         clusterNames();
288                                 }
289                         }else {
290                                 cluster = true;
291                                 updateArrayandLinkTable();
292                                 clusterBins();
293                                 clusterNames();
294                                 combineFile();
295                         }
296                 }
297                 
298                 return cutoff;
299                 //printInfo();
300         }
301         catch(exception& e) {
302                 m->errorOut(e, "HCluster", "update");
303                 exit(1);
304         }
305 }
306 /***********************************************************************/
307 void HCluster::setMapWanted(bool ms)  {  
308         try {
309                 mapWanted = ms;
310                 
311                 //initialize map
312                 for (int i = 0; i < list->getNumBins(); i++) {
313                         
314                         //parse bin 
315                         string names = list->get(i);
316                         while (names.find_first_of(',') != -1) { 
317                                 //get name from bin
318                                 string name = names.substr(0,names.find_first_of(','));
319                                 //save name and bin number
320                                 seq2Bin[name] = i;
321                                 names = names.substr(names.find_first_of(',')+1, names.length());
322                         }
323                         
324                         //get last name
325                         seq2Bin[names] = i;
326                 }
327                 
328         }
329         catch(exception& e) {
330                 m->errorOut(e, "HCluster", "setMapWanted");
331                 exit(1);
332         }
333 }
334 /***********************************************************************/
335 void HCluster::updateMap() {
336 try {
337                 //update location of seqs in smallRow since they move to smallCol now
338                 string names = list->get(clusterArray[smallRow].smallChild);
339                 while (names.find_first_of(',') != -1) { 
340                         //get name from bin
341                         string name = names.substr(0,names.find_first_of(','));
342                         //save name and bin number
343                         seq2Bin[name] = clusterArray[smallCol].smallChild;
344                         names = names.substr(names.find_first_of(',')+1, names.length());
345                 }
346                         
347                 //get last name
348                 seq2Bin[names] = clusterArray[smallCol].smallChild;
349         }
350         catch(exception& e) {
351                 m->errorOut(e, "HCluster", "updateMap");
352                 exit(1);
353         }
354 }
355 //**********************************************************************************************************************
356 vector<seqDist> HCluster::getSeqs(){
357         try {
358                 vector<seqDist> sameSeqs;
359                 
360                 if ((method == "furthest") || (method == "nearest")) {
361                         sameSeqs = getSeqsFNNN();
362                 }else{
363                         sameSeqs = getSeqsAN(); 
364                 }
365                                 
366                 return sameSeqs;
367         }
368         catch(exception& e) {
369                 m->errorOut(e, "HCluster", "getSeqs");
370                 exit(1);
371         }
372 }
373 //**********************************************************************************************************************
374 vector<seqDist> HCluster::getSeqsFNNN(){
375         try {
376                 string firstName, secondName;
377                 float distance, prevDistance;
378                 vector<seqDist> sameSeqs;
379                 prevDistance = -1;
380         
381                 //if you are not at the beginning of the file
382                 if (exitedBreak) { 
383                         sameSeqs.push_back(next);
384                         prevDistance = next.dist;
385                         exitedBreak = false;
386                 }
387         
388                 //get entry
389                 while (!filehandle.eof()) {
390                         
391                         filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle); 
392         
393                         //save first one
394                         if (prevDistance == -1) { prevDistance = distance; }
395                         
396                         map<string,int>::iterator itA = nameMap->find(firstName);
397                         map<string,int>::iterator itB = nameMap->find(secondName);
398                         if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
399                         if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
400                 
401                         //using cutoff
402                         if (distance > cutoff) { break; }
403                 
404                         if (distance != -1) { //-1 means skip me
405                         
406                                 //are the distances the same
407                                 if (distance == prevDistance) { //save in vector
408                                         seqDist temp(itA->second, itB->second, distance);
409                                         sameSeqs.push_back(temp);
410                                         exitedBreak = false;
411                                 }else{ 
412                                         next.seq1 = itA->second;
413                                         next.seq2 = itB->second;
414                                         next.dist = distance;
415                                         exitedBreak = true;
416                                         break;
417                                 }
418                         }
419                 }
420                 
421                 //rndomize matching dists
422                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
423                 
424                 return sameSeqs;
425         }
426         catch(exception& e) {
427                 m->errorOut(e, "HCluster", "getSeqsFNNN");
428                 exit(1);
429         }
430 }
431 //**********************************************************************************************************************
432 vector<seqDist> HCluster::getSeqsAN(){
433         try {
434                 int firstName, secondName;
435                 float prevDistance;
436                 vector<seqDist> sameSeqs;
437                 prevDistance = -1;
438                 
439                 m->openInputFile(distfile, filehandle, "no error"); 
440                 
441                 //is the smallest value in mergedMin or the distfile?
442                 float mergedMinDist = 10000;
443                 float distance = 10000;
444                 if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist;  }
445                         
446                 if (!filehandle.eof()) {  
447                         filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle);
448                         //save first one
449                         if (prevDistance == -1) { prevDistance = distance; } 
450                         if (distance != -1) { //-1 means skip me
451                                 seqDist temp(firstName, secondName, distance);
452                                 sameSeqs.push_back(temp);
453                         }else{ distance = 10000; }
454                 }
455                 
456                 if (mergedMinDist < distance) { //get minimum distance from mergedMin
457                         //remove distance we saved from file
458                         sameSeqs.clear();
459                         prevDistance = mergedMinDist;
460                         
461                         for (int i = 0; i < mergedMin.size(); i++) {
462                                 if (mergedMin[i].dist == prevDistance) {
463                                         sameSeqs.push_back(mergedMin[i]);
464                                 }else { break; }
465                         }
466                 }else{ //get minimum from file
467                         //get entry
468                         while (!filehandle.eof()) {
469                                 
470                                 filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle); 
471                                 
472                                 if (prevDistance == -1) { prevDistance = distance; }
473                                 
474                                 if (distance != -1) { //-1 means skip me
475                                         //are the distances the same
476                                         if (distance == prevDistance) { //save in vector
477                                                 seqDist temp(firstName, secondName, distance);
478                                                 sameSeqs.push_back(temp);
479                                         }else{  
480                                                 break;
481                                         }
482                                 }
483                         }
484                 }
485                 filehandle.close();
486                 
487                 //randomize matching dists
488                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
489                 
490                 //can only return one value since once these are merged the other distances in sameSeqs may have changed
491                 vector<seqDist> temp;
492                 if (sameSeqs.size() > 0) {  temp.push_back(sameSeqs[0]);  }
493                 
494                 return temp;
495         }
496         catch(exception& e) {
497                 m->errorOut(e, "HCluster", "getSeqsAN");
498                 exit(1);
499         }
500 }
501
502 /***********************************************************************/
503 int HCluster::combineFile() {
504         try {
505                 //int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
506                 //char* inputBuffer;
507                 //inputBuffer = new char[bufferSize];
508                 //size_t numRead;
509                 
510                 string tempDistFile = distfile + ".temp";
511                 ofstream out;
512                 m->openOutputFile(tempDistFile, out);
513                 
514                 //FILE* in;
515                 //in = fopen(distfile.c_str(), "rb");
516         
517                 ifstream in;
518                 m->openInputFile(distfile, in, "no error");
519                 
520                 int first, second;
521                 float dist;
522                 
523                 vector< map<int, float> > smallRowColValues;
524                 smallRowColValues.resize(2);  //0 = row, 1 = col
525                 int count = 0;
526                                 
527                 //go through file pulling out distances related to rows merging
528                 //if mergedMin contains distances add those back into file
529                 //bool done = false;
530                 //partialDist = "";
531                 //while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
532 //cout << "number of char read = " << numRead << endl;
533 //cout << inputBuffer << endl;
534                         //if (numRead < bufferSize) { done = true; }
535                         
536                         //parse input into individual distances
537                         //int spot = 0;
538                         //string outputString = "";
539                         //while(spot < numRead) {
540         //cout << "spot = " << spot << endl;
541                          //  seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
542                            
543                            //you read a partial distance
544                           // if (nextDist.seq1 == -1) { break;  }
545                         while (!in.eof()) {
546                                 //first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
547         //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;                   
548                            //since file is sorted and mergedMin is sorted 
549                            //you can put the smallest distance from each through the code below and keep the file sorted
550                            
551                            in >> first >> second >> dist; m->gobble(in);
552                            
553                            if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempDistFile); return 0; }
554                            
555                            //while there are still values in mergedMin that are smaller than the distance read from file
556                            while (count < mergedMin.size())  {
557                            
558                                         //is the distance in mergedMin smaller than from the file
559                                         if (mergedMin[count].dist < dist) {
560                                         //is this a distance related to the columns merging?
561                                         //if yes, save in memory
562                                                 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
563                                                 }else if (mergedMin[count].seq1 == smallCol) {
564                                                         smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
565                                                 }else if (mergedMin[count].seq2 == smallCol) {
566                                                         smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
567                                                 }else if (mergedMin[count].seq1 == smallRow) {
568                                                         smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
569                                                 }else if (mergedMin[count].seq2 == smallRow) {
570                                                         smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
571                                                 }else { //if no, write to temp file
572                                                         //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
573                                                         //if (mergedMin[count].dist < cutoff) { 
574                                                                 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
575                                                         //}
576                                                 }
577                                                 count++;
578                                         }else{   break; }
579                            }
580                            
581                            //is this a distance related to the columns merging?
582                            //if yes, save in memory
583                            if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
584                            }else if (first == smallCol) {
585                                         smallRowColValues[1][second] = dist;
586                            }else if (second == smallCol) {
587                                         smallRowColValues[1][first] = dist;
588                            }else if (first == smallRow) {
589                                         smallRowColValues[0][second] = dist;
590                            }else if (second == smallRow) {
591                                         smallRowColValues[0][first] = dist;
592                            
593                            }else { //if no, write to temp file
594                                         //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
595                                    //if (dist < cutoff) {
596                                            out << first << '\t' << second << '\t' << dist << endl;
597                                    //}
598                            }
599                         }
600                         
601                         //out << outputString;
602                         //if(done) { break; }
603                 //}
604                 //fclose(in);
605                 in.close();
606                 
607                 //if values in mergedMin are larger than the the largest in file then
608                 while (count < mergedMin.size())  {  
609                         //is this a distance related to the columns merging?
610                         //if yes, save in memory
611                         if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
612                         }else if (mergedMin[count].seq1 == smallCol) {
613                                 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
614                         }else if (mergedMin[count].seq2 == smallCol) {
615                                 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
616                         }else if (mergedMin[count].seq1 == smallRow) {
617                                 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
618                         }else if (mergedMin[count].seq2 == smallRow) {
619                                 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
620                                 
621                         }else { //if no, write to temp file
622                                 //if (mergedMin[count].dist < cutoff) {
623                                         out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
624                                 //}
625                         }
626                         count++;
627                 }
628                 out.close();
629                 mergedMin.clear();
630                         
631                 //rename tempfile to distfile
632                 m->mothurRemove(distfile);
633                 rename(tempDistFile.c_str(), distfile.c_str());
634 //cout << "remove = "<< renameOK << " rename = " << ok << endl; 
635
636                 //merge clustered rows averaging the distances
637                 map<int, float>::iterator itMerge;
638                 map<int, float>::iterator it2Merge;
639                 for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {                 
640                         //does smallRowColValues[1] have a distance to this seq too?
641                         it2Merge = smallRowColValues[1].find(itMerge->first);
642                         
643                         float average;
644                         if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
645                                 //average
646                                 if (method == "average") {
647                                         int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
648                                         average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
649                                 }else { //weighted
650                                         average = ((itMerge->second * 1.0) + (it2Merge->second * 1.0)) / (float) 2.0;                           
651                                 }
652                                 
653                                 smallRowColValues[1].erase(it2Merge);
654                                 
655                                 seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
656                                 mergedMin.push_back(temp);
657                         }else {  
658                                 //can't find value so update cutoff
659                                 if (cutoff > itMerge->second) { cutoff = itMerge->second; }
660                         }
661                 }
662                 
663                 //update cutoff
664                 for(itMerge = smallRowColValues[1].begin(); itMerge != smallRowColValues[1].end(); itMerge++) { 
665                         if (cutoff > itMerge->second) { cutoff = itMerge->second; }
666                 }
667                 
668                 //sort merged values
669                 sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);      
670                 
671                 return 0;
672         }
673         catch(exception& e) {
674                 m->errorOut(e, "HCluster", "combineFile");
675                 exit(1);
676         }
677 }
678 /***********************************************************************
679 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
680         try {
681                 seqDist next;
682                 int indexBefore = index;
683                 string first, second, distance;
684                 first = ""; second = ""; distance = "";
685                 int tabCount = 0;
686 //cout << "partial = " << partialDist << endl;          
687                 if (partialDist != "") { //read what you can, you know it is less than a whole distance.
688                         for (int i = 0; i < partialDist.size(); i++) {
689                                 if (tabCount == 0) {
690                                         if (partialDist[i] == '\t') { tabCount++; }
691                                         else {  first += partialDist[i];        }
692                                 }else if (tabCount == 1) {
693                                         if (partialDist[i] == '\t') { tabCount++; }
694                                         else {  second += partialDist[i];       }
695                                 }else if (tabCount == 2) {
696                                         distance +=  partialDist[i];
697                                 }
698                         }
699                         partialDist = "";
700                 }
701         
702                 //try to get another distance
703                 bool gotDist = false;
704                 while (index < size) {
705                         if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
706                                 gotDist = true;
707                                 
708                                 //m->gobble space
709                                 while (index < size) {          
710                                         if (isspace(buffer[index])) { index++; }
711                                         else { break; }         
712                                 }
713                                 break;
714                         }else{
715                                 if (tabCount == 0) {
716                                         if (buffer[index] == '\t') { tabCount++; }
717                                         else {  first += buffer[index]; }
718                                 }else if (tabCount == 1) {
719                                         if (buffer[index] == '\t') { tabCount++; }
720                                         else {  second += buffer[index];        }
721                                 }else if (tabCount == 2) {
722                                         distance +=  buffer[index];
723                                 }
724                                 index++;
725                         }
726                 }
727                 
728                 //there was not a whole distance in the buffer, ie. buffer = "1 2       0.01    2       3       0."
729                 //then you want to save the partial distance.
730                 if (!gotDist) {
731                         for (int i = indexBefore; i < size; i++) {
732                                 partialDist += buffer[i];
733                         }
734                         index = size + 1;
735                         next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
736                 }else{
737                         int firstname, secondname;
738                         float dist;
739                         
740                         convert(first, firstname);
741                         convert(second, secondname);
742                         convert(distance, dist);
743                         
744                         next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
745                 }
746                                                 
747                 return next;
748         }
749         catch(exception& e) {
750                 m->errorOut(e, "HCluster", "getNextDist");
751                 exit(1);
752         }
753 }
754 ***********************************************************************/
755 int HCluster::processFile() {
756         try {
757                 string firstName, secondName;
758                 float distance;
759                 
760                 ifstream in;
761                 m->openInputFile(distfile, in, "no error");
762                 
763                 ofstream out;
764                 string outTemp = distfile + ".temp";
765                 m->openOutputFile(outTemp, out);
766         
767                 //get entry
768                 while (!in.eof()) {
769                         if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outTemp); return 0; }
770                         
771                         in >> firstName >> secondName >> distance;    m->gobble(in);            
772                         
773                         map<string,int>::iterator itA = nameMap->find(firstName);
774                         map<string,int>::iterator itB = nameMap->find(secondName);
775                         if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
776                         if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
777                 
778                         //using cutoff
779                         if (distance > cutoff) { break; }
780                 
781                         if (distance != -1) { //-1 means skip me
782                                 out << itA->second << '\t' << itB->second << '\t' << distance << endl;
783                         }
784                 }
785                 
786                 in.close();
787                 out.close();
788                 
789                 m->mothurRemove(distfile);
790                 rename(outTemp.c_str(), distfile.c_str());
791                 
792                 return 0;
793         }
794         catch(exception& e) {
795                 m->errorOut(e, "HCluster", "processFile");
796                 exit(1);
797         }
798 }
799 /***********************************************************************/
800
801
802
803
804
805
806
807