5 * Created by westcott on 10/29/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clusterclassic.h"
11 #include "progress.hpp"
13 /***********************************************************************/
14 ClusterClassic::ClusterClassic(float c, string f) : method(f), smallDist(1e6), nseqs(0) {
16 mapWanted = false; //set to true by mgcluster to speed up overlap merge
18 //save so you can modify as it changes in average neighbor
20 aboveCutoff = cutoff + 10000.0;
21 m = MothurOut::getInstance();
22 globaldata = GlobalData::getInstance();
25 m->errorOut(e, "ClusterClassic", "ClusterClassic");
29 /***********************************************************************/
30 int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
35 vector<string> matrixNames;
38 m->openInputFile(filename, fileHandle);
40 fileHandle >> nseqs >> name;
42 matrixNames.push_back(name);
45 list = new ListVector(nseqs);
49 list = new ListVector(nameMap->getListVector());
50 if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
53 //initialize distance matrix to cutoff
54 dMatrix.resize(nseqs);
55 //colDist temp(0, 0, aboveCutoff);
56 //rowSmallDists.resize(nseqs, temp);
57 for (int i = 1; i < nseqs; i++) {
58 dMatrix[i].resize(i, aboveCutoff);
63 while((d=fileHandle.get()) != EOF){
67 fileHandle.putback(d);
68 for(int i=0;i<nseqs;i++){
69 fileHandle >> distance;
83 reading = new Progress("Reading matrix: ", nseqs * (nseqs - 1) / 2);
87 for(int i=1;i<nseqs;i++){
88 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
91 matrixNames.push_back(name);
94 //there's A LOT of repeated code throughout this method...
100 if (m->control_pressed) { delete reading; fileHandle.close(); return 0; }
102 fileHandle >> distance;
104 if (distance == -1) { distance = 1000000; }
105 else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
107 //if(distance < cutoff){
108 dMatrix[i][j] = distance;
109 if (distance < smallDist) { smallDist = distance; }
110 //if (rowSmallDists[i].dist > distance) { rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
111 //if (rowSmallDists[j].dist > distance) { rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
114 reading->update(index);
119 if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
121 for(int j=0;j<i;j++){
122 fileHandle >> distance;
124 if (m->control_pressed) { delete reading; fileHandle.close(); return 0; }
126 if (distance == -1) { distance = 1000000; }
127 else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
129 //if(distance < cutoff){
130 if (distance < smallDist) { smallDist = distance; }
132 int row = nameMap->get(matrixNames[i]);
133 int col = nameMap->get(matrixNames[j]);
135 if (row < col) { dMatrix[col][row] = distance; }
136 else { dMatrix[row][col] = distance; }
138 //if (rowSmallDists[row].dist > distance) { rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
139 //if (rowSmallDists[col].dist > distance) { rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
142 reading->update(index);
149 reading = new Progress("Reading matrix: ", nseqs * nseqs);
153 for(int i=1;i<nseqs;i++){
155 matrixNames.push_back(name);
159 for(int j=0;j<nseqs;j++){
160 fileHandle >> distance;
162 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
164 if (distance == -1) { distance = 1000000; }
165 else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
168 if (distance < smallDist) { smallDist = distance; }
170 dMatrix[i][j] = distance;
171 //if (rowSmallDists[i].dist > distance) { rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
172 //if (rowSmallDists[j].dist > distance) { rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
175 reading->update(index);
180 if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
182 for(int j=0;j<nseqs;j++){
183 fileHandle >> distance;
185 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
187 if (distance == -1) { distance = 1000000; }
188 else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
191 if (distance < smallDist) { smallDist = distance; }
193 int row = nameMap->get(matrixNames[i]);
194 int col = nameMap->get(matrixNames[j]);
196 if (row < col) { dMatrix[col][row] = distance; }
197 else { dMatrix[row][col] = distance; }
199 //if (rowSmallDists[row].dist > distance) { rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
200 //if (rowSmallDists[col].dist > distance) { rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
203 reading->update(index);
209 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
215 rabund = new RAbundVector(list->getRAbundVector());
219 catch(exception& e) {
220 m->errorOut(e, "ClusterClassic", "readPhylipFile");
225 /***********************************************************************/
226 //sets smallCol and smallRow, returns distance
227 double ClusterClassic::getSmallCell() {
230 smallDist = aboveCutoff;
234 vector<colDist> mins;
236 for(int i=1;i<nseqs;i++){
237 for(int j=0;j<i;j++){
238 if (dMatrix[i][j] < smallDist) {
240 colDist temp(i, j, dMatrix[i][j]);
241 mins.push_back(temp);
242 smallDist = dMatrix[i][j];
243 }else if (dMatrix[i][j] == smallDist) {
244 colDist temp(i, j, dMatrix[i][j]);
245 mins.push_back(temp);
251 if (mins.size() > 0) {
253 if (mins.size() > 1) {
254 //pick random number between 0 and mins.size()
255 zrand = (int)((float)(rand()) / (RAND_MAX / (mins.size()-1) + 1));
258 smallRow = mins[zrand].row;
259 smallCol = mins[zrand].col;
262 //cout << smallRow << '\t' << smallCol << '\t' << smallDist << endl;
263 //eliminate smallCell
264 if (smallRow < smallCol) { dMatrix[smallCol][smallRow] = aboveCutoff; }
265 else { dMatrix[smallRow][smallCol] = aboveCutoff; }
270 catch(exception& e) {
271 m->errorOut(e, "ClusterClassic", "getSmallCell");
275 /***********************************************************************/
276 void ClusterClassic::clusterBins(){
278 // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
280 rabund->set(smallRow, rabund->get(smallRow)+rabund->get(smallCol));
281 rabund->set(smallCol, 0);
282 rabund->setLabel(toString(smallDist));
284 // cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
286 catch(exception& e) {
287 m->errorOut(e, "ClusterClassic", "clusterBins");
291 /***********************************************************************/
292 void ClusterClassic::clusterNames(){
294 // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
295 if (mapWanted) { updateMap(); }
297 list->set(smallRow, list->get(smallRow)+','+list->get(smallCol));
298 list->set(smallCol, "");
299 list->setLabel(toString(smallDist));
301 // cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
303 catch(exception& e) {
304 m->errorOut(e, "ClusterClassic", "clusterNames");
308 /***********************************************************************/
309 void ClusterClassic::update(double& cutOFF){
311 //cout << "before update " << endl;
316 r = smallRow; c = smallCol;
317 //because we only store lt, we need to make sure we grab the right location
318 //if (smallRow < smallCol) { c = smallRow; r = smallCol; } //smallRow is really our column value
319 //else { r = smallRow; c = smallCol; } //smallRow is the row value
321 //reset rows smallest distance
322 //rowSmallDists[r].dist = aboveCutoff; rowSmallDists[r].row = 0; rowSmallDists[r].col = 0;
323 //rowSmallDists[c].dist = aboveCutoff; rowSmallDists[c].row = 0; rowSmallDists[c].col = 0;
325 //if your rows smallest distance is from smallRow or smallCol, reset
326 //for(int i=0;i<nseqs;i++){
327 //if ((rowSmallDists[i].row == r) || (rowSmallDists[i].row == c) || (rowSmallDists[i].col == r) || (rowSmallDists[i].col == c)) {
328 // rowSmallDists[i].dist = aboveCutoff; rowSmallDists[i].row = 0; rowSmallDists[i].col = 0;
332 for(int i=0;i<nseqs;i++){
333 if(i != r && i != c){
334 double distRow, distCol, newDist;
335 if (i > r) { distRow = dMatrix[i][r]; }
336 else { distRow = dMatrix[r][i]; }
338 if (i > c) { distCol = dMatrix[i][c]; dMatrix[i][c] = aboveCutoff; } //like removeCell
339 else { distCol = dMatrix[c][i]; dMatrix[c][i] = aboveCutoff; }
341 if(method == "furthest"){
342 newDist = max(distRow, distCol);
344 else if (method == "average"){
345 int rowBin = rabund->get(r);
346 int colBin = rabund->get(c);
347 newDist = (colBin * distCol + rowBin * distRow) / (rowBin + colBin);
349 else if (method == "weighted"){
350 newDist = (distCol + distRow) / 2.0;
352 else if (method == "nearest"){
353 newDist = min(distRow, distCol);
356 if (i > r) { dMatrix[i][r] = newDist; }
357 else { dMatrix[r][i] = newDist; }
359 //if (newDist < rowSmallDists[i].dist) { rowSmallDists[i].dist = newDist; rowSmallDists[i].col = r; rowSmallDists[i].row = i; }
361 //cout << "rowsmall = " << i << '\t' << rowSmallDists[i].dist << endl;
367 //find new small for 2 rows we just merged
368 //colDist temp(0,0,100.0);
369 //rowSmallDists[r] = temp;
371 //for (int i = 0; i < dMatrix[r].size(); i++) {
372 // if (dMatrix[r][i] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[r][i]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; }
374 //for (int i = dMatrix[r].size()+1; i < dMatrix.size(); i++) {
375 // if (dMatrix[i][dMatrix[r].size()] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[i][dMatrix[r].size()]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; }
378 //rowSmallDists[c] = temp;
379 //for (int i = 0; i < dMatrix[c].size(); i++) {
380 // if (dMatrix[c][i] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[c][i]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; }
382 //for (int i = dMatrix[c].size()+1; i < dMatrix.size(); i++) {
383 // if (dMatrix[i][dMatrix[c].size()] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[i][dMatrix[c].size()]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; }
386 //cout << "after update " << endl;
389 catch(exception& e) {
390 m->errorOut(e, "ClusterClassic", "update");
394 /***********************************************************************/
395 void ClusterClassic::setMapWanted(bool f) {
400 for (int i = 0; i < list->getNumBins(); i++) {
403 string names = list->get(i);
404 while (names.find_first_of(',') != -1) {
406 string name = names.substr(0,names.find_first_of(','));
407 //save name and bin number
409 names = names.substr(names.find_first_of(',')+1, names.length());
417 catch(exception& e) {
418 m->errorOut(e, "ClusterClassic", "setMapWanted");
422 /***********************************************************************/
423 void ClusterClassic::updateMap() {
425 //update location of seqs in smallRow since they move to smallCol now
426 string names = list->get(smallRow);
427 while (names.find_first_of(',') != -1) {
429 string name = names.substr(0,names.find_first_of(','));
430 //save name and bin number
431 seq2Bin[name] = smallCol;
432 names = names.substr(names.find_first_of(',')+1, names.length());
436 seq2Bin[names] = smallCol;
439 catch(exception& e) {
440 m->errorOut(e, "ClusterClassic", "updateMap");
444 /***********************************************************************/
445 void ClusterClassic::print() {
447 //update location of seqs in smallRow since they move to smallCol now
448 for (int i = 0; i < dMatrix.size(); i++) {
449 cout << "row = " << i << '\t';
450 for (int j = 0; j < dMatrix[i].size(); j++) {
451 cout << dMatrix[i][j] << '\t';
456 catch(exception& e) {
457 m->errorOut(e, "ClusterClassic", "updateMap");
461 /***********************************************************************/