5 * Created by westcott on 10/13/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13 #include "sparsematrix.hpp"
15 /***********************************************************************/
16 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAssignment* n, float c) : rabund(rav), list(lv), method(m), distfile(d), nameMap(n), cutoff(c) {
20 numSeqs = list->getNumSeqs();
22 //initialize cluster array
23 for (int i = 0; i < numSeqs; i++) {
24 clusterNode temp(1, -1, i);
25 clusterArray.push_back(temp);
28 if (method != "average") {
29 openInputFile(distfile, filehandle);
30 }else{ firstRead = true; }
33 errorOut(e, "HCluster", "HCluster");
37 /***********************************************************************/
39 void HCluster::clusterBins(){
41 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
43 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));
44 rabund->set(clusterArray[smallRow].smallChild, 0);
45 rabund->setLabel(toString(smallDist));
47 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
50 errorOut(e, "HCluster", "clusterBins");
57 /***********************************************************************/
59 void HCluster::clusterNames(){
61 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
62 if (mapWanted) { updateMap(); }
64 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
65 list->set(clusterArray[smallRow].smallChild, "");
66 list->setLabel(toString(smallDist));
68 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
72 errorOut(e, "HCluster", "clusterNames");
77 /***********************************************************************/
78 int HCluster::getUpmostParent(int node){
80 while (clusterArray[node].parent != -1) {
81 node = clusterArray[node].parent;
87 errorOut(e, "HCluster", "getUpmostParent");
91 /***********************************************************************/
92 void HCluster::printInfo(){
95 cout << "link table" << endl;
96 for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
97 cout << itActive->first << " = " << itActive->second << endl;
100 for (int i = 0; i < linkTable.size(); i++) {
102 for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
103 cout << it->first << '-' << it->second << '\t' ;
107 cout << endl << "clusterArray" << endl;
109 for (int i = 0; i < clusterArray.size(); i++) {
110 cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
116 catch(exception& e) {
117 errorOut(e, "HCluster", "getUpmostParent");
121 /***********************************************************************/
122 int HCluster::makeActive() {
126 itActive = activeLinks.find(smallRow);
127 it2Active = activeLinks.find(smallCol);
129 if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
130 int size = linkTable.size();
131 map<int, int> temp; map<int, int> temp2;
133 //add link to eachother
134 temp[smallRow] = 1; // 1 2
135 temp2[smallCol] = 1; // 1 0 1
137 linkTable.push_back(temp);
138 linkTable.push_back(temp2);
141 activeLinks[smallRow] = size;
142 activeLinks[smallCol] = size+1;
144 }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) { //smallRow is active, smallCol is not
145 int size = linkTable.size();
146 int alreadyActiveRow = itActive->second;
149 //add link to eachother
150 temp[smallRow] = 1; // 6 2 3 5
151 linkTable.push_back(temp); // 6 0 1 2 0
152 linkTable[alreadyActiveRow][smallCol] = 1; // 2 1 0 1 1
156 activeLinks[smallCol] = size;
158 }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) { //smallCol is active, smallRow is not
159 int size = linkTable.size();
160 int alreadyActiveCol = it2Active->second;
163 //add link to eachother
164 temp[smallCol] = 1; // 6 2 3 5
165 linkTable.push_back(temp); // 6 0 1 2 0
166 linkTable[alreadyActiveCol][smallRow] = 1; // 2 1 0 1 1
170 activeLinks[smallRow] = size;
172 }else { //both are active so add one
173 int row = itActive->second;
174 int col = it2Active->second;
177 linkTable[row][smallCol]++;
178 linkTable[col][smallRow]++;
179 linkValue = linkTable[row][smallCol];
184 catch(exception& e) {
185 errorOut(e, "HCluster", "makeActive");
189 /***********************************************************************/
190 void HCluster::updateArrayandLinkTable() {
192 //if cluster was made update clusterArray and linkTable
193 int size = clusterArray.size();
196 clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
197 clusterArray.push_back(temp);
200 clusterArray[smallRow].parent = size;
201 clusterArray[smallCol].parent = size;
203 if (method == "furthest") {
205 //update linkTable by merging clustered rows and columns
206 int rowSpot = activeLinks[smallRow];
207 int colSpot = activeLinks[smallCol];
210 for (int i = 0; i < linkTable.size(); i++) {
211 //check if they are in map
212 it = linkTable[i].find(smallRow);
213 it2 = linkTable[i].find(smallCol);
215 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
216 linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
217 linkTable[i].erase(smallCol); //delete col row
218 linkTable[i].erase(smallRow); //delete col row
219 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
220 linkTable[i][size] = linkTable[i][smallCol];
221 linkTable[i].erase(smallCol); //delete col
222 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
223 linkTable[i][size] = linkTable[i][smallRow];
224 linkTable[i].erase(smallRow); //delete col
229 for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
230 it2 = linkTable[colSpot].find(it->first); //does the col also have this
232 if (it2 == linkTable[colSpot].end()) { //not there so add it
233 linkTable[colSpot][it->first] = it->second;
235 linkTable[colSpot][it->first] = it->second + it2->second;
239 linkTable[colSpot].erase(size);
240 linkTable.erase(linkTable.begin()+rowSpot); //delete row
243 activeLinks.erase(smallRow);
244 activeLinks.erase(smallCol);
245 activeLinks[size] = colSpot;
247 //adjust everybody elses spot since you deleted - time vs. space
248 for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
249 if (itActive->second > rowSpot) { activeLinks[itActive->first]--; }
253 catch(exception& e) {
254 errorOut(e, "HCluster", "updateArrayandLinkTable");
258 /***********************************************************************/
259 bool HCluster::update(int row, int col, float distance){
261 bool cluster = false;
264 smallDist = distance;
266 //find upmost parent of row and col
267 smallRow = getUpmostParent(smallRow);
268 smallCol = getUpmostParent(smallCol);
270 //you don't want to cluster with yourself
271 if (smallRow != smallCol) {
273 if (method != "average") {
275 if (method == "nearest") { cluster = true; }
276 else{ //assume furthest
277 //are they active in the link table
278 int linkValue = makeActive(); //after this point this nodes info is active in linkTable
279 if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) { cluster = true; }
283 updateArrayandLinkTable();
289 updateArrayandLinkTable();
299 catch(exception& e) {
300 errorOut(e, "HCluster", "update");
304 /***********************************************************************/
305 void HCluster::setMapWanted(bool m) {
310 for (int i = 0; i < list->getNumBins(); i++) {
313 string names = list->get(i);
314 while (names.find_first_of(',') != -1) {
316 string name = names.substr(0,names.find_first_of(','));
317 //save name and bin number
319 names = names.substr(names.find_first_of(',')+1, names.length());
327 catch(exception& e) {
328 errorOut(e, "HCluster", "setMapWanted");
332 /***********************************************************************/
333 void HCluster::updateMap() {
335 //update location of seqs in smallRow since they move to smallCol now
336 string names = list->get(clusterArray[smallRow].smallChild);
337 while (names.find_first_of(',') != -1) {
339 string name = names.substr(0,names.find_first_of(','));
340 //save name and bin number
341 seq2Bin[name] = clusterArray[smallCol].smallChild;
342 names = names.substr(names.find_first_of(',')+1, names.length());
346 seq2Bin[names] = clusterArray[smallCol].smallChild;
348 catch(exception& e) {
349 errorOut(e, "HCluster", "updateMap");
353 //**********************************************************************************************************************
354 vector<seqDist> HCluster::getSeqs(){
356 vector<seqDist> sameSeqs;
358 if(method != "average") {
359 sameSeqs = getSeqsFNNN();
361 if (firstRead) { processFile(); }
362 sameSeqs = getSeqsAN();
367 catch(exception& e) {
368 errorOut(e, "HCluster", "getSeqs");
372 //**********************************************************************************************************************
373 vector<seqDist> HCluster::getSeqsFNNN(){
375 string firstName, secondName;
376 float distance, prevDistance;
377 vector<seqDist> sameSeqs;
380 //if you are not at the beginning of the file
382 sameSeqs.push_back(next);
383 prevDistance = next.dist;
388 while (!filehandle.eof()) {
390 filehandle >> firstName >> secondName >> distance; gobble(filehandle);
393 if (prevDistance == -1) { prevDistance = distance; }
395 map<string,int>::iterator itA = nameMap->find(firstName);
396 map<string,int>::iterator itB = nameMap->find(secondName);
397 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
398 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
401 if (distance > cutoff) { break; }
403 if (distance != -1) { //-1 means skip me
405 //are the distances the same
406 if (distance == prevDistance) { //save in vector
407 seqDist temp(itA->second, itB->second, distance);
408 sameSeqs.push_back(temp);
411 next.seq1 = itA->second;
412 next.seq2 = itB->second;
413 next.dist = distance;
420 //rndomize matching dists
421 random_shuffle(sameSeqs.begin(), sameSeqs.end());
425 catch(exception& e) {
426 errorOut(e, "HCluster", "getSeqsFNNN");
430 //**********************************************************************************************************************
431 //don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
432 vector<seqDist> HCluster::getSeqsAN(){
434 int firstName, secondName;
436 vector<seqDist> sameSeqs;
439 openInputFile(distfile, filehandle, "no error");
441 //is the smallest value in mergedMin or the distfile?
442 float mergedMinDist = 10000;
443 float distance = 10000;
444 if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist; }
446 if (!filehandle.eof()) {
447 filehandle >> firstName >> secondName >> distance; gobble(filehandle);
449 if (prevDistance == -1) { prevDistance = distance; }
450 if (distance != -1) { //-1 means skip me
451 seqDist temp(firstName, secondName, distance);
452 sameSeqs.push_back(temp);
456 if (mergedMinDist < distance) { //get minimum distance from mergedMin
457 //remove distance we saved from file
459 prevDistance = mergedMinDist;
461 for (int i = 0; i < mergedMin.size(); i++) {
462 if (mergedMin[i].dist == prevDistance) {
463 sameSeqs.push_back(mergedMin[i]);
466 }else{ //get minimum from file
468 while (!filehandle.eof()) {
470 filehandle >> firstName >> secondName >> distance; gobble(filehandle);
472 if (prevDistance == -1) { prevDistance = distance; }
474 if (distance != -1) { //-1 means skip me
475 //are the distances the same
476 if (distance == prevDistance) { //save in vector
477 seqDist temp(firstName, secondName, distance);
478 sameSeqs.push_back(temp);
487 //randomize matching dists
488 random_shuffle(sameSeqs.begin(), sameSeqs.end());
490 //can only return one value since once these are merged the other distances in sameSeqs may have changed
491 vector<seqDist> temp;
492 if (sameSeqs.size() > 0) { temp.push_back(sameSeqs[0]); }
496 catch(exception& e) {
497 errorOut(e, "HCluster", "getSeqsAN");
502 /***********************************************************************/
503 void HCluster::combineFile() {
505 int bufferSize = 64000; //512k - this should be a variable that the user can set to optimize code to their hardware
507 inputBuffer = new char[bufferSize];
510 string tempDistFile = distfile + ".temp";
512 openOutputFile(tempDistFile, out);
515 in = fopen(distfile.c_str(), "rb");
520 vector< map<int, float> > smallRowColValues;
521 smallRowColValues.resize(2); //0 = row, 1 = col
524 //go through file pulling out distances related to rows merging
525 //if mergedMin contains distances add those back into file
528 while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
529 //cout << "number of char read = " << numRead << endl;
530 //cout << inputBuffer << endl;
531 if (numRead < bufferSize) { done = true; }
533 //parse input into individual distances
535 string outputString = "";
536 while(spot < numRead) {
537 //cout << "spot = " << spot << endl;
538 seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
540 //you read a partial distance
541 if (nextDist.seq1 == -1) { break; }
543 first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
544 //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;
545 //since file is sorted and mergedMin is sorted
546 //you can put the smallest distance from each through the code below and keep the file sorted
548 //while there are still values in mergedMin that are smaller than the distance read from file
549 while (count < mergedMin.size()) {
551 //is the distance in mergedMin smaller than from the file
552 if (mergedMin[count].dist < dist) {
553 //is this a distance related to the columns merging?
554 //if yes, save in memory
555 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
556 }else if (mergedMin[count].seq1 == smallCol) {
557 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
558 }else if (mergedMin[count].seq2 == smallCol) {
559 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
560 }else if (mergedMin[count].seq1 == smallRow) {
561 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
562 }else if (mergedMin[count].seq2 == smallRow) {
563 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
564 }else { //if no, write to temp file
565 outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
571 //is this a distance related to the columns merging?
572 //if yes, save in memory
573 if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
574 }else if (first == smallCol) {
575 smallRowColValues[1][second] = dist;
576 }else if (second == smallCol) {
577 smallRowColValues[1][first] = dist;
578 }else if (first == smallRow) {
579 smallRowColValues[0][second] = dist;
580 }else if (second == smallRow) {
581 smallRowColValues[0][first] = dist;
583 }else { //if no, write to temp file
584 outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
593 //if values in mergedMin are larger than the the largest in file then
594 while (count < mergedMin.size()) {
595 //is this a distance related to the columns merging?
596 //if yes, save in memory
597 if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
598 }else if (mergedMin[count].seq1 == smallCol) {
599 smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
600 }else if (mergedMin[count].seq2 == smallCol) {
601 smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
602 }else if (mergedMin[count].seq1 == smallRow) {
603 smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
604 }else if (mergedMin[count].seq2 == smallRow) {
605 smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
607 }else { //if no, write to temp file
608 out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
615 //rename tempfile to distfile
616 remove(distfile.c_str());
617 rename(tempDistFile.c_str(), distfile.c_str());
619 //merge clustered rows averaging the distances
620 map<int, float>::iterator itMerge;
621 map<int, float>::iterator it2Merge;
622 for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {
623 //does smallRowColValues[1] have a distance to this seq too?
624 it2Merge = smallRowColValues[1].find(itMerge->first);
627 if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
629 int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
630 average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
631 smallRowColValues[1].erase(it2Merge);
633 seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
634 mergedMin.push_back(temp);
639 sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);
641 catch(exception& e) {
642 errorOut(e, "HCluster", "combineFile");
646 /***********************************************************************/
647 seqDist HCluster::getNextDist(char* buffer, int& index, int size){
650 int indexBefore = index;
651 string first, second, distance;
652 first = ""; second = ""; distance = "";
654 //cout << "partial = " << partialDist << endl;
655 if (partialDist != "") { //read what you can, you know it is less than a whole distance.
656 for (int i = 0; i < partialDist.size(); i++) {
658 if (partialDist[i] == '\t') { tabCount++; }
659 else { first += partialDist[i]; }
660 }else if (tabCount == 1) {
661 if (partialDist[i] == '\t') { tabCount++; }
662 else { second += partialDist[i]; }
663 }else if (tabCount == 2) {
664 distance += partialDist[i];
670 //try to get another distance
671 bool gotDist = false;
672 while (index < size) {
673 if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
677 while (index < size) {
678 if (isspace(buffer[index])) { index++; }
684 if (buffer[index] == '\t') { tabCount++; }
685 else { first += buffer[index]; }
686 }else if (tabCount == 1) {
687 if (buffer[index] == '\t') { tabCount++; }
688 else { second += buffer[index]; }
689 }else if (tabCount == 2) {
690 distance += buffer[index];
696 //there was not a whole distance in the buffer, ie. buffer = "1 2 0.01 2 3 0."
697 //then you want to save the partial distance.
699 for (int i = indexBefore; i < size; i++) {
700 partialDist += buffer[i];
703 next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
705 int firstname, secondname;
708 convert(first, firstname);
709 convert(second, secondname);
710 convert(distance, dist);
712 next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
717 catch(exception& e) {
718 errorOut(e, "HCluster", "getNextDist");
722 /***********************************************************************/
723 void HCluster::processFile() {
725 string firstName, secondName;
729 openInputFile(distfile, in);
732 string outTemp = distfile + ".temp";
733 openOutputFile(outTemp, out);
738 in >> firstName >> secondName >> distance; gobble(in);
740 map<string,int>::iterator itA = nameMap->find(firstName);
741 map<string,int>::iterator itB = nameMap->find(secondName);
742 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
743 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
746 if (distance > cutoff) { break; }
748 if (distance != -1) { //-1 means skip me
749 out << itA->second << '\t' << itB->second << '\t' << distance << endl;
756 remove(distfile.c_str());
757 rename(outTemp.c_str(), distfile.c_str());
761 catch(exception& e) {
762 errorOut(e, "HCluster", "processFile");
766 /***********************************************************************/