]> git.donarmstrong.com Git - mothur.git/blob - hcluster.cpp
removed unused copy constructors and comments within comments that where causing...
[mothur.git] / hcluster.cpp
1 /*
2  *  hcluster.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/13/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "hcluster.h"
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13 #include "sparsematrix.hpp"
14
15 /***********************************************************************/
16 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) {
17         try {
18                 m = MothurOut::getInstance();
19                 mapWanted = false;
20                 exitedBreak = false; 
21                 numSeqs = list->getNumSeqs();
22                 
23                 //initialize cluster array
24                 for (int i = 0; i < numSeqs; i++) {
25                         clusterNode temp(1, -1, i);
26                         clusterArray.push_back(temp);
27                 }
28                 
29                 if ((method == "furthest") || (method == "nearest")) {
30                         m->openInputFile(distfile, filehandle);
31                 }else{  
32                         processFile();  
33                 }
34         }
35         catch(exception& e) {
36                 m->errorOut(e, "HCluster", "HCluster");
37                 exit(1);
38         }
39 }
40 /***********************************************************************/
41
42 void HCluster::clusterBins(){
43         try {
44                 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
45
46                 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));  
47                 rabund->set(clusterArray[smallRow].smallChild, 0);      
48                 rabund->setLabel(toString(smallDist));
49
50                 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
51         }
52         catch(exception& e) {
53                 m->errorOut(e, "HCluster", "clusterBins");
54                 exit(1);
55         }
56
57
58 }
59
60 /***********************************************************************/
61
62 void HCluster::clusterNames(){
63         try {
64                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
65                 if (mapWanted) {  updateMap();  }
66                 
67                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
68                 list->set(clusterArray[smallRow].smallChild, "");       
69                 list->setLabel(toString(smallDist));
70         
71                 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
72
73     }
74         catch(exception& e) {
75                 m->errorOut(e, "HCluster", "clusterNames");
76                 exit(1);
77         }
78
79 }
80 /***********************************************************************/
81 int HCluster::getUpmostParent(int node){
82         try {
83                 while (clusterArray[node].parent != -1) {
84                         node = clusterArray[node].parent;
85                 }
86                 
87                 return node;
88         }
89         catch(exception& e) {
90                 m->errorOut(e, "HCluster", "getUpmostParent");
91                 exit(1);
92         }
93 }
94 /***********************************************************************/
95 void HCluster::printInfo(){
96         try {
97                 
98                 cout << "link table" << endl;
99                 for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
100                         cout << itActive->first << " = " << itActive->second << endl;
101                 }
102                 cout << endl;
103                 for (int i = 0; i < linkTable.size(); i++) {
104                         cout << i << '\t';
105                         for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
106                                 cout << it->first << '-' << it->second << '\t' ;
107                         }
108                         cout << endl;
109                 }
110                 cout << endl << "clusterArray" << endl;
111                 
112                 for (int i = 0; i < clusterArray.size(); i++) {
113                         cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
114                 }
115                 cout << endl;
116                 
117                 
118         }
119         catch(exception& e) {
120                 m->errorOut(e, "HCluster", "getUpmostParent");
121                 exit(1);
122         }
123 }
124 /***********************************************************************/
125 int HCluster::makeActive() {
126         try {
127                 int linkValue = 1; 
128
129                 itActive = activeLinks.find(smallRow);
130                 it2Active = activeLinks.find(smallCol);
131                 
132                 if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
133                         int size = linkTable.size();
134                         map<int, int> temp; map<int, int> temp2;
135                         
136                         //add link to eachother
137                         temp[smallRow] = 1;                                                     //         1    2                                                               
138                         temp2[smallCol] = 1;                                            // 1   0        1 
139                                                                                                                 // 2   1        0                               
140                         linkTable.push_back(temp);
141                         linkTable.push_back(temp2);
142                         
143                         //add to activeLinks
144                         activeLinks[smallRow] = size;
145                         activeLinks[smallCol] = size+1;
146
147                 }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) {  //smallRow is active, smallCol is not
148                          int size = linkTable.size();
149                          int alreadyActiveRow = itActive->second;
150                          map<int, int> temp; 
151                         
152                         //add link to eachother
153                         temp[smallRow] = 1;                                                                     //         6    2       3       5
154                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
155                         linkTable[alreadyActiveRow][smallCol] = 1;                      // 2   1        0       1       1
156                                                                                                                                 // 3   2        1       0       0
157                                                                                                                                 // 5   0    1   0   0   
158                         //add to activeLinks
159                         activeLinks[smallCol] = size;
160         
161                 }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) {  //smallCol is active, smallRow is not
162                          int size = linkTable.size();
163                          int alreadyActiveCol = it2Active->second;
164                          map<int, int> temp; 
165                         
166                         //add link to eachother
167                         temp[smallCol] = 1;                                                                     //         6    2       3       5
168                         linkTable.push_back(temp);                                                      // 6   0        1       2       0
169                         linkTable[alreadyActiveCol][smallRow] = 1;                      // 2   1        0       1       1
170                                                                                                                                 // 3   2        1       0       0
171                                                                                                                                 // 5   0    1   0   0   
172                         //add to activeLinks
173                         activeLinks[smallRow] = size;
174
175                 }else { //both are active so add one
176                         int row = itActive->second;
177                         int col = it2Active->second;
178                         
179                         
180                         linkTable[row][smallCol]++;
181                         linkTable[col][smallRow]++;
182                         linkValue = linkTable[row][smallCol];
183                 }
184                 
185                 return linkValue;
186         }
187         catch(exception& e) {
188                 m->errorOut(e, "HCluster", "makeActive");
189                 exit(1);
190         }
191 }
192 /***********************************************************************/
193 void HCluster::updateArrayandLinkTable() {
194         try {
195                 //if cluster was made update clusterArray and linkTable
196                 int size = clusterArray.size();
197                 
198                 //add new node
199                 clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
200                 clusterArray.push_back(temp);
201                 
202                 //update child nodes
203                 clusterArray[smallRow].parent = size;
204                 clusterArray[smallCol].parent = size;
205                 
206                 if (method == "furthest") {
207                         
208                         //update linkTable by merging clustered rows and columns
209                         int rowSpot = activeLinks[smallRow];
210                         int colSpot = activeLinks[smallCol];
211                         
212                         //fix old rows
213                         for (int i = 0; i < linkTable.size(); i++) {
214                                 //check if they are in map
215                                 it = linkTable[i].find(smallRow);
216                                 it2 = linkTable[i].find(smallCol);
217                                 
218                                 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
219                                         linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
220                                         linkTable[i].erase(smallCol); //delete col row
221                                         linkTable[i].erase(smallRow); //delete col row
222                                 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
223                                         linkTable[i][size] = linkTable[i][smallCol];
224                                         linkTable[i].erase(smallCol); //delete col 
225                                 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
226                                         linkTable[i][size] = linkTable[i][smallRow];
227                                         linkTable[i].erase(smallRow); //delete col 
228                                 }
229                         }
230                         
231                         //merge their values
232                         for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
233                                 it2 = linkTable[colSpot].find(it->first);  //does the col also have this
234                                 
235                                 if (it2 == linkTable[colSpot].end()) { //not there so add it
236                                         linkTable[colSpot][it->first] = it->second;
237                                 }else { //merge them
238                                         linkTable[colSpot][it->first] = it->second + it2->second;
239                                 }
240                         }
241                         
242                         linkTable[colSpot].erase(size);
243                         linkTable.erase(linkTable.begin()+rowSpot);  //delete row
244                         
245                         //update activerows
246                         activeLinks.erase(smallRow);
247                         activeLinks.erase(smallCol);
248                         activeLinks[size] = colSpot;
249                         
250                         //adjust everybody elses spot since you deleted - time vs. space
251                         for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
252                                 if (itActive->second > rowSpot) {  activeLinks[itActive->first]--;      }
253                         }
254                 }
255         }
256         catch(exception& e) {
257                 m->errorOut(e, "HCluster", "updateArrayandLinkTable");
258                 exit(1);
259         }
260 }
261 /***********************************************************************/
262 double HCluster::update(int row, int col, float distance){
263         try {
264                 bool cluster = false;
265                 smallRow = row;
266                 smallCol = col;
267                 smallDist = distance;
268                 
269                 //find upmost parent of row and col
270                 smallRow = getUpmostParent(smallRow);
271                 smallCol = getUpmostParent(smallCol);
272
273                 //you don't want to cluster with yourself
274                 if (smallRow != smallCol) {
275                         
276                         if ((method == "furthest") || (method == "nearest")) {
277                                 //can we cluster???
278                                 if (method == "nearest") { cluster = true;  }
279                                 else{ //assume furthest
280                                         //are they active in the link table
281                                         int linkValue = makeActive(); //after this point this nodes info is active in linkTable
282                                         if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {             cluster = true;         }
283                                 }
284                                 
285                                 if (cluster) { 
286                                         updateArrayandLinkTable();
287                                         clusterBins();
288                                         clusterNames();
289                                 }
290                         }else {
291                                 cluster = true;
292                                 updateArrayandLinkTable();
293                                 clusterBins();
294                                 clusterNames();
295                                 combineFile();
296                         }
297                 }
298                 
299                 return cutoff;
300                 //printInfo();
301         }
302         catch(exception& e) {
303                 m->errorOut(e, "HCluster", "update");
304                 exit(1);
305         }
306 }
307 /***********************************************************************/
308 void HCluster::setMapWanted(bool ms)  {  
309         try {
310                 mapWanted = ms;
311                 
312                 //initialize map
313                 for (int i = 0; i < list->getNumBins(); i++) {
314                         
315                         //parse bin 
316                         string names = list->get(i);
317                         while (names.find_first_of(',') != -1) { 
318                                 //get name from bin
319                                 string name = names.substr(0,names.find_first_of(','));
320                                 //save name and bin number
321                                 seq2Bin[name] = i;
322                                 names = names.substr(names.find_first_of(',')+1, names.length());
323                         }
324                         
325                         //get last name
326                         seq2Bin[names] = i;
327                 }
328                 
329         }
330         catch(exception& e) {
331                 m->errorOut(e, "HCluster", "setMapWanted");
332                 exit(1);
333         }
334 }
335 /***********************************************************************/
336 void HCluster::updateMap() {
337 try {
338                 //update location of seqs in smallRow since they move to smallCol now
339                 string names = list->get(clusterArray[smallRow].smallChild);
340                 while (names.find_first_of(',') != -1) { 
341                         //get name from bin
342                         string name = names.substr(0,names.find_first_of(','));
343                         //save name and bin number
344                         seq2Bin[name] = clusterArray[smallCol].smallChild;
345                         names = names.substr(names.find_first_of(',')+1, names.length());
346                 }
347                         
348                 //get last name
349                 seq2Bin[names] = clusterArray[smallCol].smallChild;
350         }
351         catch(exception& e) {
352                 m->errorOut(e, "HCluster", "updateMap");
353                 exit(1);
354         }
355 }
356 //**********************************************************************************************************************
357 vector<seqDist> HCluster::getSeqs(){
358         try {
359                 vector<seqDist> sameSeqs;
360                 
361                 if ((method == "furthest") || (method == "nearest")) {
362                         sameSeqs = getSeqsFNNN();
363                 }else{
364                         sameSeqs = getSeqsAN(); 
365                 }
366                                 
367                 return sameSeqs;
368         }
369         catch(exception& e) {
370                 m->errorOut(e, "HCluster", "getSeqs");
371                 exit(1);
372         }
373 }
374 //**********************************************************************************************************************
375 vector<seqDist> HCluster::getSeqsFNNN(){
376         try {
377                 string firstName, secondName;
378                 float distance, prevDistance;
379                 vector<seqDist> sameSeqs;
380                 prevDistance = -1;
381         
382                 //if you are not at the beginning of the file
383                 if (exitedBreak) { 
384                         sameSeqs.push_back(next);
385                         prevDistance = next.dist;
386                         exitedBreak = false;
387                 }
388         
389                 //get entry
390                 while (!filehandle.eof()) {
391                         
392                         filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle); 
393         
394                         //save first one
395                         if (prevDistance == -1) { prevDistance = distance; }
396                         
397                         map<string,int>::iterator itA = nameMap->find(firstName);
398                         map<string,int>::iterator itB = nameMap->find(secondName);
399                         if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
400                         if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
401                 
402                         //using cutoff
403                         if (distance > cutoff) { break; }
404                 
405                         if (distance != -1) { //-1 means skip me
406                         
407                                 //are the distances the same
408                                 if (distance == prevDistance) { //save in vector
409                                         seqDist temp(itA->second, itB->second, distance);
410                                         sameSeqs.push_back(temp);
411                                         exitedBreak = false;
412                                 }else{ 
413                                         next.seq1 = itA->second;
414                                         next.seq2 = itB->second;
415                                         next.dist = distance;
416                                         exitedBreak = true;
417                                         break;
418                                 }
419                         }
420                 }
421                 
422                 //rndomize matching dists
423                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
424                 
425                 return sameSeqs;
426         }
427         catch(exception& e) {
428                 m->errorOut(e, "HCluster", "getSeqsFNNN");
429                 exit(1);
430         }
431 }
432 //**********************************************************************************************************************
433 vector<seqDist> HCluster::getSeqsAN(){
434         try {
435                 int firstName, secondName;
436                 float prevDistance;
437                 vector<seqDist> sameSeqs;
438                 prevDistance = -1;
439                 
440                 m->openInputFile(distfile, filehandle, "no error"); 
441                 
442                 //is the smallest value in mergedMin or the distfile?
443                 float mergedMinDist = 10000;
444                 float distance = 10000;
445                 if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist;  }
446                         
447                 if (!filehandle.eof()) {  
448                         filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle);
449                         //save first one
450                         if (prevDistance == -1) { prevDistance = distance; } 
451                         if (distance != -1) { //-1 means skip me
452                                 seqDist temp(firstName, secondName, distance);
453                                 sameSeqs.push_back(temp);
454                         }else{ distance = 10000; }
455                 }
456                 
457                 if (mergedMinDist < distance) { //get minimum distance from mergedMin
458                         //remove distance we saved from file
459                         sameSeqs.clear();
460                         prevDistance = mergedMinDist;
461                         
462                         for (int i = 0; i < mergedMin.size(); i++) {
463                                 if (mergedMin[i].dist == prevDistance) {
464                                         sameSeqs.push_back(mergedMin[i]);
465                                 }else { break; }
466                         }
467                 }else{ //get minimum from file
468                         //get entry
469                         while (!filehandle.eof()) {
470                                 
471                                 filehandle >> firstName >> secondName >> distance;    m->gobble(filehandle); 
472                                 
473                                 if (prevDistance == -1) { prevDistance = distance; }
474                                 
475                                 if (distance != -1) { //-1 means skip me
476                                         //are the distances the same
477                                         if (distance == prevDistance) { //save in vector
478                                                 seqDist temp(firstName, secondName, distance);
479                                                 sameSeqs.push_back(temp);
480                                         }else{  
481                                                 break;
482                                         }
483                                 }
484                         }
485                 }
486                 filehandle.close();
487                 
488                 //randomize matching dists
489                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
490                 
491                 //can only return one value since once these are merged the other distances in sameSeqs may have changed
492                 vector<seqDist> temp;
493                 if (sameSeqs.size() > 0) {  temp.push_back(sameSeqs[0]);  }
494                 
495                 return temp;
496         }
497         catch(exception& e) {
498                 m->errorOut(e, "HCluster", "getSeqsAN");
499                 exit(1);
500         }
501 }
502
503 /***********************************************************************/
504 int HCluster::combineFile() {
505         try {
506                 //int bufferSize = 64000;  //512k - this should be a variable that the user can set to optimize code to their hardware
507                 //char* inputBuffer;
508                 //inputBuffer = new char[bufferSize];
509                 //size_t numRead;
510                 
511                 string tempDistFile = distfile + ".temp";
512                 ofstream out;
513                 m->openOutputFile(tempDistFile, out);
514                 
515                 //FILE* in;
516                 //in = fopen(distfile.c_str(), "rb");
517         
518                 ifstream in;
519                 m->openInputFile(distfile, in, "no error");
520                 
521                 int first, second;
522                 float dist;
523                 
524                 vector< map<int, float> > smallRowColValues;
525                 smallRowColValues.resize(2);  //0 = row, 1 = col
526                 int count = 0;
527                                 
528                 //go through file pulling out distances related to rows merging
529                 //if mergedMin contains distances add those back into file
530                 //bool done = false;
531                 //partialDist = "";
532                 //while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
533 //cout << "number of char read = " << numRead << endl;
534 //cout << inputBuffer << endl;
535                         //if (numRead < bufferSize) { done = true; }
536                         
537                         //parse input into individual distances
538                         //int spot = 0;
539                         //string outputString = "";
540                         //while(spot < numRead) {
541         //cout << "spot = " << spot << endl;
542                          //  seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
543                            
544                            //you read a partial distance
545                           // if (nextDist.seq1 == -1) { break;  }
546                         while (!in.eof()) {
547                                 //first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
548         //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;                   
549                            //since file is sorted and mergedMin is sorted 
550                            //you can put the smallest distance from each through the code below and keep the file sorted
551                            
552                            in >> first >> second >> dist; m->gobble(in);
553                            
554                            if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempDistFile); return 0; }
555                            
556                            //while there are still values in mergedMin that are smaller than the distance read from file
557                            while (count < mergedMin.size())  {
558                            
559                                         //is the distance in mergedMin smaller than from the file
560                                         if (mergedMin[count].dist < dist) {
561                                         //is this a distance related to the columns merging?
562                                         //if yes, save in memory
563                                                 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
564                                                 }else if (mergedMin[count].seq1 == smallCol) {
565                                                         smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
566                                                 }else if (mergedMin[count].seq2 == smallCol) {
567                                                         smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
568                                                 }else if (mergedMin[count].seq1 == smallRow) {
569                                                         smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
570                                                 }else if (mergedMin[count].seq2 == smallRow) {
571                                                         smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
572                                                 }else { //if no, write to temp file
573                                                         //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
574                                                         //if (mergedMin[count].dist < cutoff) { 
575                                                                 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
576                                                         //}
577                                                 }
578                                                 count++;
579                                         }else{   break; }
580                            }
581                            
582                            //is this a distance related to the columns merging?
583                            //if yes, save in memory
584                            if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
585                            }else if (first == smallCol) {
586                                         smallRowColValues[1][second] = dist;
587                            }else if (second == smallCol) {
588                                         smallRowColValues[1][first] = dist;
589                            }else if (first == smallRow) {
590                                         smallRowColValues[0][second] = dist;
591                            }else if (second == smallRow) {
592                                         smallRowColValues[0][first] = dist;
593                            
594                            }else { //if no, write to temp file
595                                         //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
596                                    //if (dist < cutoff) {
597                                            out << first << '\t' << second << '\t' << dist << endl;
598                                    //}
599                            }
600                         }
601                         
602                         //out << outputString;
603                         //if(done) { break; }
604                 //}
605                 //fclose(in);
606                 in.close();
607                 
608                 //if values in mergedMin are larger than the the largest in file then
609                 while (count < mergedMin.size())  {  
610                         //is this a distance related to the columns merging?
611                         //if yes, save in memory
612                         if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
613                         }else if (mergedMin[count].seq1 == smallCol) {
614                                 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
615                         }else if (mergedMin[count].seq2 == smallCol) {
616                                 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
617                         }else if (mergedMin[count].seq1 == smallRow) {
618                                 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
619                         }else if (mergedMin[count].seq2 == smallRow) {
620                                 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
621                                 
622                         }else { //if no, write to temp file
623                                 //if (mergedMin[count].dist < cutoff) {
624                                         out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
625                                 //}
626                         }
627                         count++;
628                 }
629                 out.close();
630                 mergedMin.clear();
631                         
632                 //rename tempfile to distfile
633                 m->mothurRemove(distfile);
634                 rename(tempDistFile.c_str(), distfile.c_str());
635 //cout << "remove = "<< renameOK << " rename = " << ok << endl; 
636
637                 //merge clustered rows averaging the distances
638                 map<int, float>::iterator itMerge;
639                 map<int, float>::iterator it2Merge;
640                 for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {                 
641                         //does smallRowColValues[1] have a distance to this seq too?
642                         it2Merge = smallRowColValues[1].find(itMerge->first);
643                         
644                         float average;
645                         if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
646                                 //average
647                                 if (method == "average") {
648                                         int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
649                                         average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
650                                 }else { //weighted
651                                         average = ((itMerge->second * 1.0) + (it2Merge->second * 1.0)) / (float) 2.0;                           
652                                 }
653                                 
654                                 smallRowColValues[1].erase(it2Merge);
655                                 
656                                 seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
657                                 mergedMin.push_back(temp);
658                         }else {  
659                                 //can't find value so update cutoff
660                                 if (cutoff > itMerge->second) { cutoff = itMerge->second; }
661                         }
662                 }
663                 
664                 //update cutoff
665                 for(itMerge = smallRowColValues[1].begin(); itMerge != smallRowColValues[1].end(); itMerge++) { 
666                         if (cutoff > itMerge->second) { cutoff = itMerge->second; }
667                 }
668                 
669                 //sort merged values
670                 sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);      
671                 
672                 return 0;
673         }
674         catch(exception& e) {
675                 m->errorOut(e, "HCluster", "combineFile");
676                 exit(1);
677         }
678 }
679 /***********************************************************************
680 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
681         try {
682                 seqDist next;
683                 int indexBefore = index;
684                 string first, second, distance;
685                 first = ""; second = ""; distance = "";
686                 int tabCount = 0;
687 //cout << "partial = " << partialDist << endl;          
688                 if (partialDist != "") { //read what you can, you know it is less than a whole distance.
689                         for (int i = 0; i < partialDist.size(); i++) {
690                                 if (tabCount == 0) {
691                                         if (partialDist[i] == '\t') { tabCount++; }
692                                         else {  first += partialDist[i];        }
693                                 }else if (tabCount == 1) {
694                                         if (partialDist[i] == '\t') { tabCount++; }
695                                         else {  second += partialDist[i];       }
696                                 }else if (tabCount == 2) {
697                                         distance +=  partialDist[i];
698                                 }
699                         }
700                         partialDist = "";
701                 }
702         
703                 //try to get another distance
704                 bool gotDist = false;
705                 while (index < size) {
706                         if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
707                                 gotDist = true;
708                                 
709                                 //m->gobble space
710                                 while (index < size) {          
711                                         if (isspace(buffer[index])) { index++; }
712                                         else { break; }         
713                                 }
714                                 break;
715                         }else{
716                                 if (tabCount == 0) {
717                                         if (buffer[index] == '\t') { tabCount++; }
718                                         else {  first += buffer[index]; }
719                                 }else if (tabCount == 1) {
720                                         if (buffer[index] == '\t') { tabCount++; }
721                                         else {  second += buffer[index];        }
722                                 }else if (tabCount == 2) {
723                                         distance +=  buffer[index];
724                                 }
725                                 index++;
726                         }
727                 }
728                 
729                 //there was not a whole distance in the buffer, ie. buffer = "1 2       0.01    2       3       0."
730                 //then you want to save the partial distance.
731                 if (!gotDist) {
732                         for (int i = indexBefore; i < size; i++) {
733                                 partialDist += buffer[i];
734                         }
735                         index = size + 1;
736                         next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
737                 }else{
738                         int firstname, secondname;
739                         float dist;
740                         
741                         convert(first, firstname);
742                         convert(second, secondname);
743                         convert(distance, dist);
744                         
745                         next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
746                 }
747                                                 
748                 return next;
749         }
750         catch(exception& e) {
751                 m->errorOut(e, "HCluster", "getNextDist");
752                 exit(1);
753         }
754 }
755 ***********************************************************************/
756 int HCluster::processFile() {
757         try {
758                 string firstName, secondName;
759                 float distance;
760                 
761                 ifstream in;
762                 m->openInputFile(distfile, in, "no error");
763                 
764                 ofstream out;
765                 string outTemp = distfile + ".temp";
766                 m->openOutputFile(outTemp, out);
767         
768                 //get entry
769                 while (!in.eof()) {
770                         if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outTemp); return 0; }
771                         
772                         in >> firstName >> secondName >> distance;    m->gobble(in);            
773                         
774                         map<string,int>::iterator itA = nameMap->find(firstName);
775                         map<string,int>::iterator itB = nameMap->find(secondName);
776                         if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
777                         if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
778                 
779                         //using cutoff
780                         if (distance > cutoff) { break; }
781                 
782                         if (distance != -1) { //-1 means skip me
783                                 out << itA->second << '\t' << itB->second << '\t' << distance << endl;
784                         }
785                 }
786                 
787                 in.close();
788                 out.close();
789                 
790                 m->mothurRemove(distfile);
791                 rename(outTemp.c_str(), distfile.c_str());
792                 
793                 return 0;
794         }
795         catch(exception& e) {
796                 m->errorOut(e, "HCluster", "processFile");
797                 exit(1);
798         }
799 }
800 /***********************************************************************/
801
802
803
804
805
806
807
808