5 * Created by westcott on 10/29/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clusterclassic.h"
11 #include "progress.hpp"
13 /***********************************************************************/
14 ClusterClassic::ClusterClassic(float c, string f) : method(f), smallDist(1e6), nseqs(0) {
16 mapWanted = false; //set to true by mgcluster to speed up overlap merge
18 //save so you can modify as it changes in average neighbor
20 m = MothurOut::getInstance();
21 globaldata = GlobalData::getInstance();
24 m->errorOut(e, "ClusterClassic", "ClusterClassic");
28 /***********************************************************************/
29 int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
34 vector<string> matrixNames;
37 m->openInputFile(filename, fileHandle);
39 fileHandle >> nseqs >> name;
41 matrixNames.push_back(name);
44 list = new ListVector(nseqs);
48 list = new ListVector(nameMap->getListVector());
49 if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
52 //initialize distance matrix to cutoff
53 dMatrix.resize(nseqs);
54 colDist temp(0, 0, cutoff);
55 rowSmallDists.resize(nseqs, temp);
56 for (int i = 1; i < nseqs; i++) {
57 dMatrix[i].resize(i, cutoff);
62 while((d=fileHandle.get()) != EOF){
66 fileHandle.putback(d);
67 for(int i=0;i<nseqs;i++){
68 fileHandle >> distance;
82 reading = new Progress("Reading matrix: ", nseqs * (nseqs - 1) / 2);
86 for(int i=1;i<nseqs;i++){
87 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
90 matrixNames.push_back(name);
93 //there's A LOT of repeated code throughout this method...
99 if (m->control_pressed) { delete reading; fileHandle.close(); return 0; }
101 fileHandle >> distance;
103 if (distance == -1) { distance = 1000000; }
104 else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
106 if(distance < cutoff){
107 dMatrix[i][j] = distance;
108 if (distance < smallDist) { smallDist = distance; }
109 if (rowSmallDists[i].dist > distance) { rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
110 if (rowSmallDists[j].dist > distance) { rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
113 reading->update(index);
118 if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
120 for(int j=0;j<i;j++){
121 fileHandle >> distance;
123 if (m->control_pressed) { delete reading; fileHandle.close(); return 0; }
125 if (distance == -1) { distance = 1000000; }
126 else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
128 if(distance < cutoff){
129 if (distance < smallDist) { smallDist = distance; }
131 int row = nameMap->get(matrixNames[i]);
132 int col = nameMap->get(matrixNames[j]);
134 if (row < col) { dMatrix[col][row] = distance; }
135 else { dMatrix[row][col] = distance; }
137 if (rowSmallDists[row].dist > distance) { rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
138 if (rowSmallDists[col].dist > distance) { rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
141 reading->update(index);
148 reading = new Progress("Reading matrix: ", nseqs * nseqs);
152 for(int i=1;i<nseqs;i++){
154 matrixNames.push_back(name);
158 for(int j=0;j<nseqs;j++){
159 fileHandle >> distance;
161 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
163 if (distance == -1) { distance = 1000000; }
164 else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
166 if(distance < cutoff && j < i){
167 if (distance < smallDist) { smallDist = distance; }
169 dMatrix[i][j] = distance;
170 if (rowSmallDists[i].dist > distance) { rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
171 if (rowSmallDists[j].dist > distance) { rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
174 reading->update(index);
179 if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
181 for(int j=0;j<nseqs;j++){
182 fileHandle >> distance;
184 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
186 if (distance == -1) { distance = 1000000; }
187 else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
189 if(distance < cutoff && j < i){
190 if (distance < smallDist) { smallDist = distance; }
192 int row = nameMap->get(matrixNames[i]);
193 int col = nameMap->get(matrixNames[j]);
195 if (row < col) { dMatrix[col][row] = distance; }
196 else { dMatrix[row][col] = distance; }
198 if (rowSmallDists[row].dist > distance) { rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
199 if (rowSmallDists[col].dist > distance) { rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
202 reading->update(index);
208 if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
214 rabund = new RAbundVector(list->getRAbundVector());
218 catch(exception& e) {
219 m->errorOut(e, "ClusterClassic", "readPhylipFile");
224 /***********************************************************************/
225 //sets smallCol and smallRow, returns distance
226 double ClusterClassic::getSmallCell() {
233 vector<colDist> mins;
235 for(int i=0;i<nseqs;i++){
237 if (rowSmallDists[i].dist < smallDist) {
239 mins.push_back(rowSmallDists[i]);
240 smallDist = rowSmallDists[i].dist;
241 }else if (rowSmallDists[i].dist == smallDist) {
242 mins.push_back(rowSmallDists[i]);
246 if (mins.size() > 0) {
248 if (mins.size() > 1) {
249 //pick random number between 0 and mins.size()
250 zrand = (int)((float)(rand()) / (RAND_MAX / (mins.size()-1) + 1));
253 smallRow = mins[zrand].row;
254 smallCol = mins[zrand].col;
257 //cout << smallRow << '\t' << smallCol << '\t' << smallDist << endl;
259 if (smallRow < smallCol) { dMatrix[smallCol][smallRow] = cutoff; }
260 else { dMatrix[smallRow][smallCol] = cutoff; }
265 catch(exception& e) {
266 m->errorOut(e, "ClusterClassic", "getSmallCell");
270 /***********************************************************************/
271 void ClusterClassic::clusterBins(){
273 // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
275 rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol));
276 rabund->set(smallRow, 0);
277 rabund->setLabel(toString(smallDist));
279 // cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
281 catch(exception& e) {
282 m->errorOut(e, "ClusterClassic", "clusterBins");
286 /***********************************************************************/
287 void ClusterClassic::clusterNames(){
289 // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
290 if (mapWanted) { updateMap(); }
292 list->set(smallCol, list->get(smallRow)+','+list->get(smallCol));
293 list->set(smallRow, "");
294 list->setLabel(toString(smallDist));
296 // cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
298 catch(exception& e) {
299 m->errorOut(e, "ClusterClassic", "clusterNames");
303 /***********************************************************************/
304 void ClusterClassic::update(double& cutOFF){
306 //cout << "before update " << endl;
311 //because we only store lt, we need to make sure we grab the right location
312 if (smallRow < smallCol) { c = smallRow; r = smallCol; } //smallRow is really our column value
313 else { r = smallRow; c = smallCol; } //smallRow is the row value
315 //reset rows smallest distance
316 rowSmallDists[r].dist = cutoff; rowSmallDists[r].row = 0; rowSmallDists[r].col = 0;
317 rowSmallDists[c].dist = cutoff; rowSmallDists[c].row = 0; rowSmallDists[c].col = 0;
319 //if your rows smallest distance is from smallRow or smallCol, reset
320 for(int i=0;i<nseqs;i++){
321 if ((rowSmallDists[i].row == r) || (rowSmallDists[i].row == c) || (rowSmallDists[i].col == r) || (rowSmallDists[i].col == c)) {
322 rowSmallDists[i].dist = cutoff; rowSmallDists[i].row = 0; rowSmallDists[i].col = 0;
326 for(int i=0;i<nseqs;i++){
327 if(i != r && i != c){
328 double distRow, distCol, newDist;
329 if (i > r) { distRow = dMatrix[i][r]; }
330 else { distRow = dMatrix[r][i]; }
332 if (i > c) { distCol = dMatrix[i][c]; dMatrix[i][c] = cutoff; } //like removeCell
333 else { distCol = dMatrix[c][i]; dMatrix[c][i] = cutoff; }
335 if(method == "furthest"){
336 newDist = max(distRow, distCol);
338 else if (method == "average"){
339 if ((distRow == cutoff) && (distCol == cutoff)) { //you are merging with a value above cutoff
340 newDist = cutoff; //eliminate value
341 }else if ((distRow == cutoff) && (distCol != cutoff)) { //you are merging with a value above cutoff
342 newDist = cutoff; //eliminate value
343 if (cutOFF > distCol) { cutOFF = distCol; }
344 }else if ((distRow != cutoff) && (distCol == cutoff)) { //you are merging with a value above cutoff
345 newDist = cutoff; //eliminate value
346 if (cutOFF > distRow) { cutOFF = distRow; }
348 int rowBin = rabund->get(r);
349 int colBin = rabund->get(c);
350 newDist = (colBin * distCol + rowBin * distRow) / (rowBin + colBin);
353 else if (method == "weighted"){
354 if ((distRow == cutoff) && (distCol == cutoff)) { //you are merging with a value above cutoff
355 newDist = cutoff; //eliminate value
356 }else if ((distRow == cutoff) && (distCol != cutoff)) { //you are merging with a value above cutoff
357 newDist = cutoff; //eliminate value
358 if (cutOFF > distCol) { cutOFF = distCol; }
359 }else if ((distRow != cutoff) && (distCol == cutoff)) { //you are merging with a value above cutoff
360 newDist = cutoff; //eliminate value
361 if (cutOFF > distRow) { cutOFF = distRow; }
363 newDist = (distCol + distRow) / 2.0;
366 else if (method == "nearest"){
367 newDist = min(distRow, distCol);
370 if (i > r) { dMatrix[i][r] = newDist; }
371 else { dMatrix[r][i] = newDist; }
373 if (newDist < rowSmallDists[i].dist) { rowSmallDists[i].dist = newDist; rowSmallDists[i].col = r; rowSmallDists[i].row = i; }
375 //cout << "rowsmall = " << i << '\t' << rowSmallDists[i].dist << endl;
381 //find new small for 2 rows we just merged
382 colDist temp(0,0,100.0);
383 rowSmallDists[r] = temp;
385 for (int i = 0; i < dMatrix[r].size(); i++) {
386 if (dMatrix[r][i] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[r][i]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; }
388 for (int i = dMatrix[r].size()+1; i < dMatrix.size(); i++) {
389 if (dMatrix[i][dMatrix[r].size()] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[i][dMatrix[r].size()]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; }
392 rowSmallDists[c] = temp;
393 for (int i = 0; i < dMatrix[c].size(); i++) {
394 if (dMatrix[c][i] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[c][i]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; }
396 for (int i = dMatrix[c].size()+1; i < dMatrix.size(); i++) {
397 if (dMatrix[i][dMatrix[c].size()] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[i][dMatrix[c].size()]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; }
400 //cout << "after update " << endl;
403 catch(exception& e) {
404 m->errorOut(e, "ClusterClassic", "update");
408 /***********************************************************************/
409 void ClusterClassic::setMapWanted(bool f) {
414 for (int i = 0; i < list->getNumBins(); i++) {
417 string names = list->get(i);
418 while (names.find_first_of(',') != -1) {
420 string name = names.substr(0,names.find_first_of(','));
421 //save name and bin number
423 names = names.substr(names.find_first_of(',')+1, names.length());
431 catch(exception& e) {
432 m->errorOut(e, "ClusterClassic", "setMapWanted");
436 /***********************************************************************/
437 void ClusterClassic::updateMap() {
439 //update location of seqs in smallRow since they move to smallCol now
440 string names = list->get(smallRow);
441 while (names.find_first_of(',') != -1) {
443 string name = names.substr(0,names.find_first_of(','));
444 //save name and bin number
445 seq2Bin[name] = smallCol;
446 names = names.substr(names.find_first_of(',')+1, names.length());
450 seq2Bin[names] = smallCol;
453 catch(exception& e) {
454 m->errorOut(e, "ClusterClassic", "updateMap");
458 /***********************************************************************/
459 void ClusterClassic::print() {
461 //update location of seqs in smallRow since they move to smallCol now
462 for (int i = 0; i < dMatrix.size(); i++) {
463 cout << "row = " << i << '\t';
464 for (int j = 0; j < dMatrix[i].size(); j++) {
465 cout << dMatrix[i][j] << '\t';
470 catch(exception& e) {
471 m->errorOut(e, "ClusterClassic", "updateMap");
475 /***********************************************************************/