5 * Created by westcott on 10/13/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
14 /***********************************************************************/
15 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) : rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) {
17 m = MothurOut::getInstance();
20 numSeqs = list->getNumSeqs();
22 //initialize cluster array
23 for (int i = 0; i < numSeqs; i++) {
24 clusterNode temp(1, -1, i);
25 clusterArray.push_back(temp);
28 if ((method == "furthest") || (method == "nearest")) {
29 m->openInputFile(distfile, filehandle);
35 m->errorOut(e, "HCluster", "HCluster");
39 /***********************************************************************/
41 void HCluster::clusterBins(){
43 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
45 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));
46 rabund->set(clusterArray[smallRow].smallChild, 0);
47 rabund->setLabel(toString(smallDist));
49 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
52 m->errorOut(e, "HCluster", "clusterBins");
59 /***********************************************************************/
61 void HCluster::clusterNames(){
63 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
64 if (mapWanted) { updateMap(); }
66 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
67 list->set(clusterArray[smallRow].smallChild, "");
68 list->setLabel(toString(smallDist));
70 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
74 m->errorOut(e, "HCluster", "clusterNames");
79 /***********************************************************************/
80 int HCluster::getUpmostParent(int node){
82 while (clusterArray[node].parent != -1) {
83 node = clusterArray[node].parent;
89 m->errorOut(e, "HCluster", "getUpmostParent");
93 /***********************************************************************/
94 void HCluster::printInfo(){
97 cout << "link table" << endl;
98 for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
99 cout << itActive->first << " = " << itActive->second << endl;
102 for (int i = 0; i < linkTable.size(); i++) {
104 for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
105 cout << it->first << '-' << it->second << '\t' ;
109 cout << endl << "clusterArray" << endl;
111 for (int i = 0; i < clusterArray.size(); i++) {
112 cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
118 catch(exception& e) {
119 m->errorOut(e, "HCluster", "getUpmostParent");
123 /***********************************************************************/
124 int HCluster::makeActive() {
128 itActive = activeLinks.find(smallRow);
129 it2Active = activeLinks.find(smallCol);
131 if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
132 int size = linkTable.size();
133 map<int, int> temp; map<int, int> temp2;
135 //add link to eachother
136 temp[smallRow] = 1; // 1 2
137 temp2[smallCol] = 1; // 1 0 1
139 linkTable.push_back(temp);
140 linkTable.push_back(temp2);
143 activeLinks[smallRow] = size;
144 activeLinks[smallCol] = size+1;
146 }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) { //smallRow is active, smallCol is not
147 int size = linkTable.size();
148 int alreadyActiveRow = itActive->second;
151 //add link to eachother
152 temp[smallRow] = 1; // 6 2 3 5
153 linkTable.push_back(temp); // 6 0 1 2 0
154 linkTable[alreadyActiveRow][smallCol] = 1; // 2 1 0 1 1
158 activeLinks[smallCol] = size;
160 }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) { //smallCol is active, smallRow is not
161 int size = linkTable.size();
162 int alreadyActiveCol = it2Active->second;
165 //add link to eachother
166 temp[smallCol] = 1; // 6 2 3 5
167 linkTable.push_back(temp); // 6 0 1 2 0
168 linkTable[alreadyActiveCol][smallRow] = 1; // 2 1 0 1 1
172 activeLinks[smallRow] = size;
174 }else { //both are active so add one
175 int row = itActive->second;
176 int col = it2Active->second;
179 linkTable[row][smallCol]++;
180 linkTable[col][smallRow]++;
181 linkValue = linkTable[row][smallCol];
186 catch(exception& e) {
187 m->errorOut(e, "HCluster", "makeActive");
191 /***********************************************************************/
192 void HCluster::updateArrayandLinkTable() {
194 //if cluster was made update clusterArray and linkTable
195 int size = clusterArray.size();
198 clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
199 clusterArray.push_back(temp);
202 clusterArray[smallRow].parent = size;
203 clusterArray[smallCol].parent = size;
205 if (method == "furthest") {
207 //update linkTable by merging clustered rows and columns
208 int rowSpot = activeLinks[smallRow];
209 int colSpot = activeLinks[smallCol];
212 for (int i = 0; i < linkTable.size(); i++) {
213 //check if they are in map
214 it = linkTable[i].find(smallRow);
215 it2 = linkTable[i].find(smallCol);
217 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
218 linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
219 linkTable[i].erase(smallCol); //delete col row
220 linkTable[i].erase(smallRow); //delete col row
221 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
222 linkTable[i][size] = linkTable[i][smallCol];
223 linkTable[i].erase(smallCol); //delete col
224 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
225 linkTable[i][size] = linkTable[i][smallRow];
226 linkTable[i].erase(smallRow); //delete col
231 for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
232 it2 = linkTable[colSpot].find(it->first); //does the col also have this
234 if (it2 == linkTable[colSpot].end()) { //not there so add it
235 linkTable[colSpot][it->first] = it->second;
237 linkTable[colSpot][it->first] = it->second + it2->second;
241 linkTable[colSpot].erase(size);
242 linkTable.erase(linkTable.begin()+rowSpot); //delete row
245 activeLinks.erase(smallRow);
246 activeLinks.erase(smallCol);
247 activeLinks[size] = colSpot;
249 //adjust everybody elses spot since you deleted - time vs. space
250 for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
251 if (itActive->second > rowSpot) { activeLinks[itActive->first]--; }
255 catch(exception& e) {
256 m->errorOut(e, "HCluster", "updateArrayandLinkTable");
260 /***********************************************************************/
261 double HCluster::update(int row, int col, float distance){
263 bool cluster = false;
266 smallDist = distance;
268 //find upmost parent of row and col
269 smallRow = getUpmostParent(smallRow);
270 smallCol = getUpmostParent(smallCol);
272 //you don't want to cluster with yourself
273 if (smallRow != smallCol) {
275 if ((method == "furthest") || (method == "nearest")) {
277 if (method == "nearest") { cluster = true; }
278 else{ //assume furthest
279 //are they active in the link table
280 int linkValue = makeActive(); //after this point this nodes info is active in linkTable
281 if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) { cluster = true; }
285 updateArrayandLinkTable();
291 updateArrayandLinkTable();
301 catch(exception& e) {
302 m->errorOut(e, "HCluster", "update");
306 /***********************************************************************/
307 void HCluster::setMapWanted(bool ms) {
312 for (int i = 0; i < list->getNumBins(); i++) {
315 string names = list->get(i);
316 while (names.find_first_of(',') != -1) {
318 string name = names.substr(0,names.find_first_of(','));
319 //save name and bin number
321 names = names.substr(names.find_first_of(',')+1, names.length());
329 catch(exception& e) {
330 m->errorOut(e, "HCluster", "setMapWanted");
334 /***********************************************************************/
335 void HCluster::updateMap() {
337 //update location of seqs in smallRow since they move to smallCol now
338 string names = list->get(clusterArray[smallRow].smallChild);
339 while (names.find_first_of(',') != -1) {
341 string name = names.substr(0,names.find_first_of(','));
342 //save name and bin number
343 seq2Bin[name] = clusterArray[smallCol].smallChild;
344 names = names.substr(names.find_first_of(',')+1, names.length());
348 seq2Bin[names] = clusterArray[smallCol].smallChild;
350 catch(exception& e) {
351 m->errorOut(e, "HCluster", "updateMap");
355 //**********************************************************************************************************************
356 vector<seqDist> HCluster::getSeqs(){
358 vector<seqDist> sameSeqs;
360 if ((method == "furthest") || (method == "nearest")) {
361 sameSeqs = getSeqsFNNN();
363 sameSeqs = getSeqsAN();
368 catch(exception& e) {
369 m->errorOut(e, "HCluster", "getSeqs");
373 //**********************************************************************************************************************
374 vector<seqDist> HCluster::getSeqsFNNN(){
376 string firstName, secondName;
377 float distance, prevDistance;
378 vector<seqDist> sameSeqs;
381 //if you are not at the beginning of the file
383 sameSeqs.push_back(next);
384 prevDistance = next.dist;
389 while (!filehandle.eof()) {
391 filehandle >> firstName >> secondName >> distance; m->gobble(filehandle);
394 if (prevDistance == -1) { prevDistance = distance; }
396 map<string,int>::iterator itA = nameMap->find(firstName);
397 map<string,int>::iterator itB = nameMap->find(secondName);
398 if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
399 if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
402 if (distance > cutoff) { break; }
404 if (distance != -1) { //-1 means skip me
406 //are the distances the same
407 if (distance == prevDistance) { //save in vector
408 seqDist temp(itA->second, itB->second, distance);
409 sameSeqs.push_back(temp);
412 next.seq1 = itA->second;
413 next.seq2 = itB->second;
414 next.dist = distance;
421 //rndomize matching dists
422 random_shuffle(sameSeqs.begin(), sameSeqs.end());
426 catch(exception& e) {
427 m->errorOut(e, "HCluster", "getSeqsFNNN");
431 //**********************************************************************************************************************
432 vector<seqDist> HCluster::getSeqsAN(){
434 int firstName, secondName;
436 vector<seqDist> sameSeqs;
439 m->openInputFile(distfile, filehandle, "no error");
441 //is the smallest value in mergedMin or the distfile?
442 float mergedMinDist = 10000;
443 float distance = 10000;
444 if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist; }
446 if (!filehandle.eof()) {
447 filehandle >> firstName >> secondName >> distance; m->gobble(filehandle);
449 if (prevDistance == -1) { prevDistance = distance; }
450 if (distance != -1) { //-1 means skip me
451 seqDist temp(firstName, secondName, distance);
452 sameSeqs.push_back(temp);
453 }else{ distance = 10000; }
456 if (mergedMinDist < distance) { //get minimum distance from mergedMin
457 //remove distance we saved from file
459 prevDistance = mergedMinDist;
461 for (int i = 0; i < mergedMin.size(); i++) {
462 if (mergedMin[i].dist == prevDistance) {
463 sameSeqs.push_back(mergedMin[i]);
466 }else{ //get minimum from file
468 while (!filehandle.eof()) {
470 filehandle >> firstName >> secondName >> distance; m->gobble(filehandle);
472 if (prevDistance == -1) { prevDistance = distance; }
474 if (distance != -1) { //-1 means skip me
475 //are the distances the same
476 if (distance == prevDistance) { //save in vector
477 seqDist temp(firstName, secondName, distance);
478 sameSeqs.push_back(temp);
487 //randomize matching dists
488 random_shuffle(sameSeqs.begin(), sameSeqs.end());
490 //can only return one value since once these are merged the other distances in sameSeqs may have changed
491 vector<seqDist> temp;
492 if (sameSeqs.size() > 0) { temp.push_back(sameSeqs[0]); }
496 catch(exception& e) {
497 m->errorOut(e, "HCluster", "getSeqsAN");
502 /***********************************************************************/
503 int HCluster::combineFile() {
505 //int bufferSize = 64000; //512k - this should be a variable that the user can set to optimize code to their hardware
507 //inputBuffer = new char[bufferSize];
510 string tempDistFile = distfile + ".temp";
512 m->openOutputFile(tempDistFile, out);
515 //in = fopen(distfile.c_str(), "rb");
518 m->openInputFile(distfile, in, "no error");
523 vector< map<int, float> > smallRowColValues;
524 smallRowColValues.resize(2); //0 = row, 1 = col
527 //go through file pulling out distances related to rows merging
528 //if mergedMin contains distances add those back into file
531 //while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
532 //cout << "number of char read = " << numRead << endl;
533 //cout << inputBuffer << endl;
534 //if (numRead < bufferSize) { done = true; }
536 //parse input into individual distances
538 //string outputString = "";
539 //while(spot < numRead) {
540 //cout << "spot = " << spot << endl;
541 // seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
543 //you read a partial distance
544 // if (nextDist.seq1 == -1) { break; }
546 //first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
547 //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;
548 //since file is sorted and mergedMin is sorted
549 //you can put the smallest distance from each through the code below and keep the file sorted
551 in >> first >> second >> dist; m->gobble(in);
553 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempDistFile); return 0; }
555 //while there are still values in mergedMin that are smaller than the distance read from file
556 while (count < mergedMin.size()) {
558 //is the distance in mergedMin smaller than from the file
559 if (mergedMin[count].dist < dist) {
560 //is this a distance related to the columns merging?
561 //if yes, save in memory
562 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
563 }else if (mergedMin[count].seq1 == smallCol) {
564 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
565 }else if (mergedMin[count].seq2 == smallCol) {
566 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
567 }else if (mergedMin[count].seq1 == smallRow) {
568 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
569 }else if (mergedMin[count].seq2 == smallRow) {
570 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
571 }else { //if no, write to temp file
572 //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
573 //if (mergedMin[count].dist < cutoff) {
574 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
581 //is this a distance related to the columns merging?
582 //if yes, save in memory
583 if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
584 }else if (first == smallCol) {
585 smallRowColValues[1][second] = dist;
586 }else if (second == smallCol) {
587 smallRowColValues[1][first] = dist;
588 }else if (first == smallRow) {
589 smallRowColValues[0][second] = dist;
590 }else if (second == smallRow) {
591 smallRowColValues[0][first] = dist;
593 }else { //if no, write to temp file
594 //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
595 //if (dist < cutoff) {
596 out << first << '\t' << second << '\t' << dist << endl;
601 //out << outputString;
602 //if(done) { break; }
607 //if values in mergedMin are larger than the the largest in file then
608 while (count < mergedMin.size()) {
609 //is this a distance related to the columns merging?
610 //if yes, save in memory
611 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
612 }else if (mergedMin[count].seq1 == smallCol) {
613 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
614 }else if (mergedMin[count].seq2 == smallCol) {
615 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
616 }else if (mergedMin[count].seq1 == smallRow) {
617 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
618 }else if (mergedMin[count].seq2 == smallRow) {
619 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
621 }else { //if no, write to temp file
622 //if (mergedMin[count].dist < cutoff) {
623 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
631 //rename tempfile to distfile
632 m->mothurRemove(distfile);
633 rename(tempDistFile.c_str(), distfile.c_str());
634 //cout << "remove = "<< renameOK << " rename = " << ok << endl;
636 //merge clustered rows averaging the distances
637 map<int, float>::iterator itMerge;
638 map<int, float>::iterator it2Merge;
639 for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {
640 //does smallRowColValues[1] have a distance to this seq too?
641 it2Merge = smallRowColValues[1].find(itMerge->first);
644 if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
646 if (method == "average") {
647 int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
648 average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
650 average = ((itMerge->second * 1.0) + (it2Merge->second * 1.0)) / (float) 2.0;
653 smallRowColValues[1].erase(it2Merge);
655 seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
656 mergedMin.push_back(temp);
658 //can't find value so update cutoff
659 if (cutoff > itMerge->second) { cutoff = itMerge->second; }
664 for(itMerge = smallRowColValues[1].begin(); itMerge != smallRowColValues[1].end(); itMerge++) {
665 if (cutoff > itMerge->second) { cutoff = itMerge->second; }
669 sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);
673 catch(exception& e) {
674 m->errorOut(e, "HCluster", "combineFile");
678 /***********************************************************************
679 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
682 int indexBefore = index;
683 string first, second, distance;
684 first = ""; second = ""; distance = "";
686 //cout << "partial = " << partialDist << endl;
687 if (partialDist != "") { //read what you can, you know it is less than a whole distance.
688 for (int i = 0; i < partialDist.size(); i++) {
690 if (partialDist[i] == '\t') { tabCount++; }
691 else { first += partialDist[i]; }
692 }else if (tabCount == 1) {
693 if (partialDist[i] == '\t') { tabCount++; }
694 else { second += partialDist[i]; }
695 }else if (tabCount == 2) {
696 distance += partialDist[i];
702 //try to get another distance
703 bool gotDist = false;
704 while (index < size) {
705 if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
709 while (index < size) {
710 if (isspace(buffer[index])) { index++; }
716 if (buffer[index] == '\t') { tabCount++; }
717 else { first += buffer[index]; }
718 }else if (tabCount == 1) {
719 if (buffer[index] == '\t') { tabCount++; }
720 else { second += buffer[index]; }
721 }else if (tabCount == 2) {
722 distance += buffer[index];
728 //there was not a whole distance in the buffer, ie. buffer = "1 2 0.01 2 3 0."
729 //then you want to save the partial distance.
731 for (int i = indexBefore; i < size; i++) {
732 partialDist += buffer[i];
735 next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
737 int firstname, secondname;
740 convert(first, firstname);
741 convert(second, secondname);
742 convert(distance, dist);
744 next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
749 catch(exception& e) {
750 m->errorOut(e, "HCluster", "getNextDist");
754 ***********************************************************************/
755 int HCluster::processFile() {
757 string firstName, secondName;
761 m->openInputFile(distfile, in, "no error");
764 string outTemp = distfile + ".temp";
765 m->openOutputFile(outTemp, out);
769 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outTemp); return 0; }
771 in >> firstName >> secondName >> distance; m->gobble(in);
773 map<string,int>::iterator itA = nameMap->find(firstName);
774 map<string,int>::iterator itB = nameMap->find(secondName);
775 if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
776 if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
779 if (distance > cutoff) { break; }
781 if (distance != -1) { //-1 means skip me
782 out << itA->second << '\t' << itB->second << '\t' << distance << endl;
789 m->mothurRemove(distfile);
790 rename(outTemp.c_str(), distfile.c_str());
794 catch(exception& e) {
795 m->errorOut(e, "HCluster", "processFile");
799 /***********************************************************************/