5 * Created by westcott on 10/29/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clusterclassic.h"
11 #include "progress.hpp"
13 /***********************************************************************/
14 ClusterClassic::ClusterClassic(float c, string f, bool s) : method(f), smallDist(1e6), nseqs(0), sim(s) {
16 mapWanted = false; //set to true by mgcluster to speed up overlap merge
18 //save so you can modify as it changes in average neighbor
20 aboveCutoff = cutoff + 10000.0;
21 m = MothurOut::getInstance();
22 if(method == "furthest") { tag = "fn"; }
23 else if (method == "average") { tag = "an"; }
24 else if (method == "weighted") { tag = "wn"; }
25 else if (method == "nearest") { tag = "nn"; }
28 m->errorOut(e, "ClusterClassic", "ClusterClassic");
32 /***********************************************************************/
33 int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
38 vector<string> matrixNames;
41 m->openInputFile(filename, fileHandle);
44 fileHandle >> numTest >> name;
46 if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
47 else { convert(numTest, nseqs); }
50 matrixNames.push_back(name);
53 list = new ListVector(nseqs);
57 list = new ListVector(nameMap->getListVector());
58 if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
61 //initialize distance matrix to cutoff
62 dMatrix.resize(nseqs);
63 //colDist temp(0, 0, aboveCutoff);
64 //rowSmallDists.resize(nseqs, temp);
65 for (int i = 1; i < nseqs; i++) {
66 dMatrix[i].resize(i, aboveCutoff);
71 while((d=fileHandle.get()) != EOF){
75 fileHandle.putback(d);
76 for(int i=0;i<nseqs;i++){
77 fileHandle >> distance;
91 reading = new Progress("Reading matrix: ", nseqs * (nseqs - 1) / 2);
95 for(int i=1;i<nseqs;i++){
96 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
99 matrixNames.push_back(name);
102 //there's A LOT of repeated code throughout this method...
106 for(int j=0;j<i;j++){
108 if (m->control_pressed) { delete reading; fileHandle.close(); return 0; }
110 fileHandle >> distance;
112 if (distance == -1) { distance = 1000000; }
113 else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
115 //if(distance < cutoff){
116 dMatrix[i][j] = distance;
117 if (distance < smallDist) { smallDist = distance; }
118 //if (rowSmallDists[i].dist > distance) { rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
119 //if (rowSmallDists[j].dist > distance) { rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
122 reading->update(index);
127 if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
129 for(int j=0;j<i;j++){
130 fileHandle >> distance;
132 if (m->control_pressed) { delete reading; fileHandle.close(); return 0; }
134 if (distance == -1) { distance = 1000000; }
135 else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
137 //if(distance < cutoff){
138 if (distance < smallDist) { smallDist = distance; }
140 int row = nameMap->get(matrixNames[i]);
141 int col = nameMap->get(matrixNames[j]);
143 if (row < col) { dMatrix[col][row] = distance; }
144 else { dMatrix[row][col] = distance; }
146 //if (rowSmallDists[row].dist > distance) { rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
147 //if (rowSmallDists[col].dist > distance) { rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
150 reading->update(index);
157 reading = new Progress("Reading matrix: ", nseqs * nseqs);
161 for(int i=1;i<nseqs;i++){
163 matrixNames.push_back(name);
167 for(int j=0;j<nseqs;j++){
168 fileHandle >> distance;
170 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
172 if (distance == -1) { distance = 1000000; }
173 else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
176 if (distance < smallDist) { smallDist = distance; }
178 dMatrix[i][j] = distance;
179 //if (rowSmallDists[i].dist > distance) { rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
180 //if (rowSmallDists[j].dist > distance) { rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
183 reading->update(index);
188 if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
190 for(int j=0;j<nseqs;j++){
191 fileHandle >> distance;
193 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
195 if (distance == -1) { distance = 1000000; }
196 else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
199 if (distance < smallDist) { smallDist = distance; }
201 int row = nameMap->get(matrixNames[i]);
202 int col = nameMap->get(matrixNames[j]);
204 if (row < col) { dMatrix[col][row] = distance; }
205 else { dMatrix[row][col] = distance; }
207 //if (rowSmallDists[row].dist > distance) { rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
208 //if (rowSmallDists[col].dist > distance) { rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
211 reading->update(index);
217 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
223 rabund = new RAbundVector(list->getRAbundVector());
229 catch(exception& e) {
230 m->errorOut(e, "ClusterClassic", "readPhylipFile");
235 /***********************************************************************/
236 int ClusterClassic::readPhylipFile(string filename, CountTable* countTable) {
241 vector<string> matrixNames;
244 m->openInputFile(filename, fileHandle);
247 fileHandle >> numTest >> name;
249 if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
250 else { convert(numTest, nseqs); }
253 matrixNames.push_back(name);
255 if(countTable == NULL){
256 list = new ListVector(nseqs);
259 else{ list = new ListVector(countTable->getListVector()); }
262 //initialize distance matrix to cutoff
263 dMatrix.resize(nseqs);
264 //rowSmallDists.resize(nseqs, temp);
265 for (int i = 1; i < nseqs; i++) {
266 dMatrix[i].resize(i, aboveCutoff);
271 while((d=fileHandle.get()) != EOF){
275 fileHandle.putback(d);
276 for(int i=0;i<nseqs;i++){
277 fileHandle >> distance;
291 reading = new Progress("Reading matrix: ", nseqs * (nseqs - 1) / 2);
295 for(int i=1;i<nseqs;i++){
296 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
299 matrixNames.push_back(name);
302 //there's A LOT of repeated code throughout this method...
303 if(countTable == NULL){
306 for(int j=0;j<i;j++){
308 if (m->control_pressed) { delete reading; fileHandle.close(); return 0; }
310 fileHandle >> distance;
312 if (distance == -1) { distance = 1000000; }
313 else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
315 //if(distance < cutoff){
316 dMatrix[i][j] = distance;
317 if (distance < smallDist) { smallDist = distance; }
318 //if (rowSmallDists[i].dist > distance) { rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
319 //if (rowSmallDists[j].dist > distance) { rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
322 reading->update(index);
327 for(int j=0;j<i;j++){
328 fileHandle >> distance;
330 if (m->control_pressed) { delete reading; fileHandle.close(); return 0; }
332 if (distance == -1) { distance = 1000000; }
333 else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
335 if (distance < smallDist) { smallDist = distance; }
337 int row = countTable->get(matrixNames[i]);
338 int col = countTable->get(matrixNames[j]);
340 if (row < col) { dMatrix[col][row] = distance; }
341 else { dMatrix[row][col] = distance; }
344 reading->update(index);
351 reading = new Progress("Reading matrix: ", nseqs * nseqs);
355 for(int i=1;i<nseqs;i++){
357 matrixNames.push_back(name);
359 if(countTable == NULL){
361 for(int j=0;j<nseqs;j++){
362 fileHandle >> distance;
364 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
366 if (distance == -1) { distance = 1000000; }
367 else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
370 if (distance < smallDist) { smallDist = distance; }
372 dMatrix[i][j] = distance;
375 reading->update(index);
381 for(int j=0;j<nseqs;j++){
382 fileHandle >> distance;
384 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
386 if (distance == -1) { distance = 1000000; }
387 else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
390 if (distance < smallDist) { smallDist = distance; }
392 int row = countTable->get(matrixNames[i]);
393 int col = countTable->get(matrixNames[j]);
395 if (row < col) { dMatrix[col][row] = distance; }
396 else { dMatrix[row][col] = distance; }
399 reading->update(index);
405 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
411 rabund = new RAbundVector();
412 rabund->setLabel(list->getLabel());
414 for(int i = 0; i < list->getNumBins(); i++) {
415 if (m->control_pressed) { break; }
416 vector<string> binNames;
417 string bin = list->get(i);
418 m->splitAtComma(bin, binNames);
420 for (int j = 0; j < binNames.size(); j++) { total += countTable->getNumSeqs(binNames[j]); }
421 rabund->push_back(total);
428 catch(exception& e) {
429 m->errorOut(e, "ClusterClassic", "readPhylipFile");
434 /***********************************************************************/
435 //sets smallCol and smallRow, returns distance
436 double ClusterClassic::getSmallCell() {
439 smallDist = aboveCutoff;
443 vector<colDist> mins;
445 for(int i=1;i<nseqs;i++){
446 for(int j=0;j<i;j++){
447 if (dMatrix[i][j] < smallDist) {
449 colDist temp(i, j, dMatrix[i][j]);
450 mins.push_back(temp);
451 smallDist = dMatrix[i][j];
452 }else if (dMatrix[i][j] == smallDist) {
453 colDist temp(i, j, dMatrix[i][j]);
454 mins.push_back(temp);
460 if (mins.size() > 0) {
462 if (mins.size() > 1) {
463 //pick random number between 0 and mins.size()
464 zrand = (int)((float)(rand()) / (RAND_MAX / (mins.size()-1) + 1));
467 smallRow = mins[zrand].row;
468 smallCol = mins[zrand].col;
471 //cout << smallRow << '\t' << smallCol << '\t' << smallDist << endl;
472 //eliminate smallCell
473 if (smallRow < smallCol) { dMatrix[smallCol][smallRow] = aboveCutoff; }
474 else { dMatrix[smallRow][smallCol] = aboveCutoff; }
479 catch(exception& e) {
480 m->errorOut(e, "ClusterClassic", "getSmallCell");
484 /***********************************************************************/
485 void ClusterClassic::clusterBins(){
487 // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
489 rabund->set(smallRow, rabund->get(smallRow)+rabund->get(smallCol));
490 rabund->set(smallCol, 0);
491 /*for (int i = smallCol+1; i < rabund->size(); i++) {
492 rabund->set((i-1), rabund->get(i));
494 rabund->resize((rabund->size()-1));*/
495 rabund->setLabel(toString(smallDist));
497 // cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
499 catch(exception& e) {
500 m->errorOut(e, "ClusterClassic", "clusterBins");
504 /***********************************************************************/
505 void ClusterClassic::clusterNames(){
507 // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
508 if (mapWanted) { updateMap(); }
510 list->set(smallRow, list->get(smallRow)+','+list->get(smallCol));
511 list->set(smallCol, "");
512 /*for (int i = smallCol+1; i < list->size(); i++) {
513 list->set((i-1), list->get(i));
515 list->resize((list->size()-1));*/
516 list->setLabel(toString(smallDist));
518 // cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
520 catch(exception& e) {
521 m->errorOut(e, "ClusterClassic", "clusterNames");
525 /***********************************************************************/
526 void ClusterClassic::update(double& cutOFF){
532 r = smallRow; c = smallCol;
534 for(int i=0;i<nseqs;i++){
535 if(i != r && i != c){
536 double distRow, distCol, newDist;
537 if (i > r) { distRow = dMatrix[i][r]; }
538 else { distRow = dMatrix[r][i]; }
540 if (i > c) { distCol = dMatrix[i][c]; dMatrix[i][c] = aboveCutoff; } //like removeCell
541 else { distCol = dMatrix[c][i]; dMatrix[c][i] = aboveCutoff; }
543 if(method == "furthest"){
544 newDist = max(distRow, distCol);
546 else if (method == "average"){
547 int rowBin = rabund->get(r);
548 int colBin = rabund->get(c);
549 newDist = (colBin * distCol + rowBin * distRow) / (rowBin + colBin);
551 else if (method == "weighted"){
552 newDist = (distCol + distRow) / 2.0;
554 else if (method == "nearest"){
555 newDist = min(distRow, distCol);
557 //cout << "newDist = " << newDist << endl;
558 if (i > r) { dMatrix[i][r] = newDist; }
559 else { dMatrix[r][i] = newDist; }
568 /*for(int i=0;i<nseqs;i++){
569 for(int j=c+1;j<dMatrix[i].size();j++){
570 dMatrix[i][j-1]=dMatrix[i][j];
575 for(int i=c+1;i<nseqs;i++){
576 for(int j=0;j < dMatrix[i-1].size();j++){
577 dMatrix[i-1][j]=dMatrix[i][j];
582 dMatrix.pop_back();*/
585 catch(exception& e) {
586 m->errorOut(e, "ClusterClassic", "update");
590 /***********************************************************************/
591 void ClusterClassic::setMapWanted(bool f) {
596 for (int i = 0; i < list->getNumBins(); i++) {
599 string names = list->get(i);
600 vector<string> binnames;
601 m->splitAtComma(names, binnames);
602 for (int j = 0; j < binnames.size(); j++) {
603 //save name and bin number
604 seq2Bin[binnames[j]] = i;
609 catch(exception& e) {
610 m->errorOut(e, "ClusterClassic", "setMapWanted");
614 /***********************************************************************/
615 void ClusterClassic::updateMap() {
617 //update location of seqs in smallRow since they move to smallCol now
618 string names = list->get(smallRow);
619 vector<string> binnames;
620 m->splitAtComma(names, binnames);
621 for (int j = 0; j < binnames.size(); j++) {
622 //save name and bin number
623 seq2Bin[binnames[j]] = smallCol;
627 catch(exception& e) {
628 m->errorOut(e, "ClusterClassic", "updateMap");
632 /***********************************************************************/
633 void ClusterClassic::print() {
635 //update location of seqs in smallRow since they move to smallCol now
636 for (int i = 0; i < dMatrix.size(); i++) {
637 m->mothurOut("row = " + toString(i) + "\t");
638 for (int j = 0; j < dMatrix[i].size(); j++) {
639 m->mothurOut(toString(dMatrix[i][j]) + "\t");
641 m->mothurOutEndLine();
644 catch(exception& e) {
645 m->errorOut(e, "ClusterClassic", "updateMap");
649 /***********************************************************************/