5 * Created by westcott on 10/13/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13 #include "sparsematrix.hpp"
15 /***********************************************************************/
16 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) : rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) {
18 m = MothurOut::getInstance();
21 numSeqs = list->getNumSeqs();
23 //initialize cluster array
24 for (int i = 0; i < numSeqs; i++) {
25 clusterNode temp(1, -1, i);
26 clusterArray.push_back(temp);
29 if (method != "average") {
30 openInputFile(distfile, filehandle);
36 m->errorOut(e, "HCluster", "HCluster");
40 /***********************************************************************/
42 void HCluster::clusterBins(){
44 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
46 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));
47 rabund->set(clusterArray[smallRow].smallChild, 0);
48 rabund->setLabel(toString(smallDist));
50 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
53 m->errorOut(e, "HCluster", "clusterBins");
60 /***********************************************************************/
62 void HCluster::clusterNames(){
64 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
65 if (mapWanted) { updateMap(); }
67 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
68 list->set(clusterArray[smallRow].smallChild, "");
69 list->setLabel(toString(smallDist));
71 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
75 m->errorOut(e, "HCluster", "clusterNames");
80 /***********************************************************************/
81 int HCluster::getUpmostParent(int node){
83 while (clusterArray[node].parent != -1) {
84 node = clusterArray[node].parent;
90 m->errorOut(e, "HCluster", "getUpmostParent");
94 /***********************************************************************/
95 void HCluster::printInfo(){
98 cout << "link table" << endl;
99 for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
100 cout << itActive->first << " = " << itActive->second << endl;
103 for (int i = 0; i < linkTable.size(); i++) {
105 for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
106 cout << it->first << '-' << it->second << '\t' ;
110 cout << endl << "clusterArray" << endl;
112 for (int i = 0; i < clusterArray.size(); i++) {
113 cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
119 catch(exception& e) {
120 m->errorOut(e, "HCluster", "getUpmostParent");
124 /***********************************************************************/
125 int HCluster::makeActive() {
129 itActive = activeLinks.find(smallRow);
130 it2Active = activeLinks.find(smallCol);
132 if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
133 int size = linkTable.size();
134 map<int, int> temp; map<int, int> temp2;
136 //add link to eachother
137 temp[smallRow] = 1; // 1 2
138 temp2[smallCol] = 1; // 1 0 1
140 linkTable.push_back(temp);
141 linkTable.push_back(temp2);
144 activeLinks[smallRow] = size;
145 activeLinks[smallCol] = size+1;
147 }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) { //smallRow is active, smallCol is not
148 int size = linkTable.size();
149 int alreadyActiveRow = itActive->second;
152 //add link to eachother
153 temp[smallRow] = 1; // 6 2 3 5
154 linkTable.push_back(temp); // 6 0 1 2 0
155 linkTable[alreadyActiveRow][smallCol] = 1; // 2 1 0 1 1
159 activeLinks[smallCol] = size;
161 }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) { //smallCol is active, smallRow is not
162 int size = linkTable.size();
163 int alreadyActiveCol = it2Active->second;
166 //add link to eachother
167 temp[smallCol] = 1; // 6 2 3 5
168 linkTable.push_back(temp); // 6 0 1 2 0
169 linkTable[alreadyActiveCol][smallRow] = 1; // 2 1 0 1 1
173 activeLinks[smallRow] = size;
175 }else { //both are active so add one
176 int row = itActive->second;
177 int col = it2Active->second;
180 linkTable[row][smallCol]++;
181 linkTable[col][smallRow]++;
182 linkValue = linkTable[row][smallCol];
187 catch(exception& e) {
188 m->errorOut(e, "HCluster", "makeActive");
192 /***********************************************************************/
193 void HCluster::updateArrayandLinkTable() {
195 //if cluster was made update clusterArray and linkTable
196 int size = clusterArray.size();
199 clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
200 clusterArray.push_back(temp);
203 clusterArray[smallRow].parent = size;
204 clusterArray[smallCol].parent = size;
206 if (method == "furthest") {
208 //update linkTable by merging clustered rows and columns
209 int rowSpot = activeLinks[smallRow];
210 int colSpot = activeLinks[smallCol];
213 for (int i = 0; i < linkTable.size(); i++) {
214 //check if they are in map
215 it = linkTable[i].find(smallRow);
216 it2 = linkTable[i].find(smallCol);
218 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
219 linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
220 linkTable[i].erase(smallCol); //delete col row
221 linkTable[i].erase(smallRow); //delete col row
222 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
223 linkTable[i][size] = linkTable[i][smallCol];
224 linkTable[i].erase(smallCol); //delete col
225 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
226 linkTable[i][size] = linkTable[i][smallRow];
227 linkTable[i].erase(smallRow); //delete col
232 for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
233 it2 = linkTable[colSpot].find(it->first); //does the col also have this
235 if (it2 == linkTable[colSpot].end()) { //not there so add it
236 linkTable[colSpot][it->first] = it->second;
238 linkTable[colSpot][it->first] = it->second + it2->second;
242 linkTable[colSpot].erase(size);
243 linkTable.erase(linkTable.begin()+rowSpot); //delete row
246 activeLinks.erase(smallRow);
247 activeLinks.erase(smallCol);
248 activeLinks[size] = colSpot;
250 //adjust everybody elses spot since you deleted - time vs. space
251 for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
252 if (itActive->second > rowSpot) { activeLinks[itActive->first]--; }
256 catch(exception& e) {
257 m->errorOut(e, "HCluster", "updateArrayandLinkTable");
261 /***********************************************************************/
262 bool HCluster::update(int row, int col, float distance){
264 bool cluster = false;
267 smallDist = distance;
269 //find upmost parent of row and col
270 smallRow = getUpmostParent(smallRow);
271 smallCol = getUpmostParent(smallCol);
273 //you don't want to cluster with yourself
274 if (smallRow != smallCol) {
276 if (method != "average") {
278 if (method == "nearest") { cluster = true; }
279 else{ //assume furthest
280 //are they active in the link table
281 int linkValue = makeActive(); //after this point this nodes info is active in linkTable
282 if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) { cluster = true; }
286 updateArrayandLinkTable();
292 updateArrayandLinkTable();
302 catch(exception& e) {
303 m->errorOut(e, "HCluster", "update");
307 /***********************************************************************/
308 void HCluster::setMapWanted(bool ms) {
313 for (int i = 0; i < list->getNumBins(); i++) {
316 string names = list->get(i);
317 while (names.find_first_of(',') != -1) {
319 string name = names.substr(0,names.find_first_of(','));
320 //save name and bin number
322 names = names.substr(names.find_first_of(',')+1, names.length());
330 catch(exception& e) {
331 m->errorOut(e, "HCluster", "setMapWanted");
335 /***********************************************************************/
336 void HCluster::updateMap() {
338 //update location of seqs in smallRow since they move to smallCol now
339 string names = list->get(clusterArray[smallRow].smallChild);
340 while (names.find_first_of(',') != -1) {
342 string name = names.substr(0,names.find_first_of(','));
343 //save name and bin number
344 seq2Bin[name] = clusterArray[smallCol].smallChild;
345 names = names.substr(names.find_first_of(',')+1, names.length());
349 seq2Bin[names] = clusterArray[smallCol].smallChild;
351 catch(exception& e) {
352 m->errorOut(e, "HCluster", "updateMap");
356 //**********************************************************************************************************************
357 vector<seqDist> HCluster::getSeqs(){
359 vector<seqDist> sameSeqs;
361 if(method != "average") {
362 sameSeqs = getSeqsFNNN();
364 sameSeqs = getSeqsAN();
369 catch(exception& e) {
370 m->errorOut(e, "HCluster", "getSeqs");
374 //**********************************************************************************************************************
375 vector<seqDist> HCluster::getSeqsFNNN(){
377 string firstName, secondName;
378 float distance, prevDistance;
379 vector<seqDist> sameSeqs;
382 //if you are not at the beginning of the file
384 sameSeqs.push_back(next);
385 prevDistance = next.dist;
390 while (!filehandle.eof()) {
392 filehandle >> firstName >> secondName >> distance; gobble(filehandle);
395 if (prevDistance == -1) { prevDistance = distance; }
397 map<string,int>::iterator itA = nameMap->find(firstName);
398 map<string,int>::iterator itB = nameMap->find(secondName);
399 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
400 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
403 if (distance > cutoff) { break; }
405 if (distance != -1) { //-1 means skip me
407 //are the distances the same
408 if (distance == prevDistance) { //save in vector
409 seqDist temp(itA->second, itB->second, distance);
410 sameSeqs.push_back(temp);
413 next.seq1 = itA->second;
414 next.seq2 = itB->second;
415 next.dist = distance;
422 //rndomize matching dists
423 random_shuffle(sameSeqs.begin(), sameSeqs.end());
427 catch(exception& e) {
428 m->errorOut(e, "HCluster", "getSeqsFNNN");
432 //**********************************************************************************************************************
433 //don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
434 vector<seqDist> HCluster::getSeqsAN(){
436 int firstName, secondName;
438 vector<seqDist> sameSeqs;
441 openInputFile(distfile, filehandle, "no error");
443 //is the smallest value in mergedMin or the distfile?
444 float mergedMinDist = 10000;
445 float distance = 10000;
446 if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist; }
448 if (!filehandle.eof()) {
449 filehandle >> firstName >> secondName >> distance; gobble(filehandle);
451 if (prevDistance == -1) { prevDistance = distance; }
452 if (distance != -1) { //-1 means skip me
453 seqDist temp(firstName, secondName, distance);
454 sameSeqs.push_back(temp);
455 }else{ distance = 10000; }
458 if (mergedMinDist < distance) { //get minimum distance from mergedMin
459 //remove distance we saved from file
461 prevDistance = mergedMinDist;
463 for (int i = 0; i < mergedMin.size(); i++) {
464 if (mergedMin[i].dist == prevDistance) {
465 sameSeqs.push_back(mergedMin[i]);
468 }else{ //get minimum from file
470 while (!filehandle.eof()) {
472 filehandle >> firstName >> secondName >> distance; gobble(filehandle);
474 if (prevDistance == -1) { prevDistance = distance; }
476 if (distance != -1) { //-1 means skip me
477 //are the distances the same
478 if (distance == prevDistance) { //save in vector
479 seqDist temp(firstName, secondName, distance);
480 sameSeqs.push_back(temp);
489 //randomize matching dists
490 random_shuffle(sameSeqs.begin(), sameSeqs.end());
492 //can only return one value since once these are merged the other distances in sameSeqs may have changed
493 vector<seqDist> temp;
494 if (sameSeqs.size() > 0) { temp.push_back(sameSeqs[0]); }
498 catch(exception& e) {
499 m->errorOut(e, "HCluster", "getSeqsAN");
504 /***********************************************************************/
505 int HCluster::combineFile() {
507 //int bufferSize = 64000; //512k - this should be a variable that the user can set to optimize code to their hardware
509 //inputBuffer = new char[bufferSize];
512 string tempDistFile = distfile + ".temp";
514 openOutputFile(tempDistFile, out);
517 //in = fopen(distfile.c_str(), "rb");
520 openInputFile(distfile, in);
525 vector< map<int, float> > smallRowColValues;
526 smallRowColValues.resize(2); //0 = row, 1 = col
529 //go through file pulling out distances related to rows merging
530 //if mergedMin contains distances add those back into file
533 //while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
534 //cout << "number of char read = " << numRead << endl;
535 //cout << inputBuffer << endl;
536 //if (numRead < bufferSize) { done = true; }
538 //parse input into individual distances
540 //string outputString = "";
541 //while(spot < numRead) {
542 //cout << "spot = " << spot << endl;
543 // seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
545 //you read a partial distance
546 // if (nextDist.seq1 == -1) { break; }
548 //first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
549 //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;
550 //since file is sorted and mergedMin is sorted
551 //you can put the smallest distance from each through the code below and keep the file sorted
553 in >> first >> second >> dist; gobble(in);
555 if (m->control_pressed) { in.close(); out.close(); remove(tempDistFile.c_str()); return 0; }
557 //while there are still values in mergedMin that are smaller than the distance read from file
558 while (count < mergedMin.size()) {
560 //is the distance in mergedMin smaller than from the file
561 if (mergedMin[count].dist < dist) {
562 //is this a distance related to the columns merging?
563 //if yes, save in memory
564 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
565 }else if (mergedMin[count].seq1 == smallCol) {
566 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
567 }else if (mergedMin[count].seq2 == smallCol) {
568 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
569 }else if (mergedMin[count].seq1 == smallRow) {
570 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
571 }else if (mergedMin[count].seq2 == smallRow) {
572 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
573 }else { //if no, write to temp file
574 //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
575 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
581 //is this a distance related to the columns merging?
582 //if yes, save in memory
583 if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
584 }else if (first == smallCol) {
585 smallRowColValues[1][second] = dist;
586 }else if (second == smallCol) {
587 smallRowColValues[1][first] = dist;
588 }else if (first == smallRow) {
589 smallRowColValues[0][second] = dist;
590 }else if (second == smallRow) {
591 smallRowColValues[0][first] = dist;
593 }else { //if no, write to temp file
594 //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
595 out << first << '\t' << second << '\t' << dist << endl;
599 //out << outputString;
600 //if(done) { break; }
605 //if values in mergedMin are larger than the the largest in file then
606 while (count < mergedMin.size()) {
607 //is this a distance related to the columns merging?
608 //if yes, save in memory
609 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
610 }else if (mergedMin[count].seq1 == smallCol) {
611 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
612 }else if (mergedMin[count].seq2 == smallCol) {
613 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
614 }else if (mergedMin[count].seq1 == smallRow) {
615 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
616 }else if (mergedMin[count].seq2 == smallRow) {
617 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
619 }else { //if no, write to temp file
620 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
627 //rename tempfile to distfile
628 remove(distfile.c_str());
629 rename(tempDistFile.c_str(), distfile.c_str());
630 //cout << "remove = "<< renameOK << " rename = " << ok << endl;
632 //merge clustered rows averaging the distances
633 map<int, float>::iterator itMerge;
634 map<int, float>::iterator it2Merge;
635 for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {
636 //does smallRowColValues[1] have a distance to this seq too?
637 it2Merge = smallRowColValues[1].find(itMerge->first);
640 if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
642 int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
643 average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
644 smallRowColValues[1].erase(it2Merge);
646 seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
647 mergedMin.push_back(temp);
652 sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);
656 catch(exception& e) {
657 m->errorOut(e, "HCluster", "combineFile");
661 /***********************************************************************
662 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
665 int indexBefore = index;
666 string first, second, distance;
667 first = ""; second = ""; distance = "";
669 //cout << "partial = " << partialDist << endl;
670 if (partialDist != "") { //read what you can, you know it is less than a whole distance.
671 for (int i = 0; i < partialDist.size(); i++) {
673 if (partialDist[i] == '\t') { tabCount++; }
674 else { first += partialDist[i]; }
675 }else if (tabCount == 1) {
676 if (partialDist[i] == '\t') { tabCount++; }
677 else { second += partialDist[i]; }
678 }else if (tabCount == 2) {
679 distance += partialDist[i];
685 //try to get another distance
686 bool gotDist = false;
687 while (index < size) {
688 if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
692 while (index < size) {
693 if (isspace(buffer[index])) { index++; }
699 if (buffer[index] == '\t') { tabCount++; }
700 else { first += buffer[index]; }
701 }else if (tabCount == 1) {
702 if (buffer[index] == '\t') { tabCount++; }
703 else { second += buffer[index]; }
704 }else if (tabCount == 2) {
705 distance += buffer[index];
711 //there was not a whole distance in the buffer, ie. buffer = "1 2 0.01 2 3 0."
712 //then you want to save the partial distance.
714 for (int i = indexBefore; i < size; i++) {
715 partialDist += buffer[i];
718 next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
720 int firstname, secondname;
723 convert(first, firstname);
724 convert(second, secondname);
725 convert(distance, dist);
727 next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
732 catch(exception& e) {
733 m->errorOut(e, "HCluster", "getNextDist");
737 /***********************************************************************/
738 int HCluster::processFile() {
740 string firstName, secondName;
744 openInputFile(distfile, in);
747 string outTemp = distfile + ".temp";
748 openOutputFile(outTemp, out);
752 if (m->control_pressed) { in.close(); out.close(); remove(outTemp.c_str()); return 0; }
754 in >> firstName >> secondName >> distance; gobble(in);
756 map<string,int>::iterator itA = nameMap->find(firstName);
757 map<string,int>::iterator itB = nameMap->find(secondName);
758 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
759 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
762 if (distance > cutoff) { break; }
764 if (distance != -1) { //-1 means skip me
765 out << itA->second << '\t' << itB->second << '\t' << distance << endl;
772 remove(distfile.c_str());
773 rename(outTemp.c_str(), distfile.c_str());
777 catch(exception& e) {
778 m->errorOut(e, "HCluster", "processFile");
782 /***********************************************************************/