]> git.donarmstrong.com Git - mothur.git/blob - clusterclassic.cpp
changed cluster.classic so that it does not adjust the cutoff
[mothur.git] / clusterclassic.cpp
1 /*
2  *  clusterclassic.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/29/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clusterclassic.h"
11 #include "progress.hpp"
12
13 /***********************************************************************/
14 ClusterClassic::ClusterClassic(float c, string f) : method(f), smallDist(1e6), nseqs(0) {
15         try {
16                 mapWanted = false;  //set to true by mgcluster to speed up overlap merge
17         
18                 //save so you can modify as it changes in average neighbor
19                 cutoff = c;
20                 aboveCutoff = cutoff + 10000.0;
21                 m = MothurOut::getInstance();
22                 globaldata = GlobalData::getInstance();
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "ClusterClassic", "ClusterClassic");
26                 exit(1);
27         }
28 }
29 /***********************************************************************/
30 int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
31         try {
32                 double distance;
33                 int square;
34                 string name;
35                 vector<string> matrixNames;
36                 
37                 ifstream fileHandle;
38                 m->openInputFile(filename, fileHandle);
39                 
40                 fileHandle >> nseqs >> name;
41
42                 matrixNames.push_back(name);
43
44                 if(nameMap == NULL){
45                                 list = new ListVector(nseqs);
46                                 list->set(0, name);
47                 }
48                 else{
49                                 list = new ListVector(nameMap->getListVector());
50                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
51                 }
52                 
53                 //initialize distance matrix to cutoff
54                 dMatrix.resize(nseqs);
55                 //colDist temp(0, 0, aboveCutoff);
56                 //rowSmallDists.resize(nseqs, temp);
57                 for (int i = 1; i < nseqs; i++) {                       
58                         dMatrix[i].resize(i, aboveCutoff);              
59                 }                                                                                               
60                                                                                                                 
61                 
62                 char d;
63                 while((d=fileHandle.get()) != EOF){
64
65                                 if(isalnum(d)){
66                                                 square = 1;
67                                                 fileHandle.putback(d);
68                                                 for(int i=0;i<nseqs;i++){
69                                                                 fileHandle >> distance;
70                                                 }
71                                                 break;
72                                 }
73                                 if(d == '\n'){
74                                                 square = 0;
75                                                 break;
76                                 }
77                 }
78
79                 Progress* reading;
80
81                 if(square == 0){
82
83                                 reading = new Progress("Reading matrix:     ", nseqs * (nseqs - 1) / 2);
84
85                                 int        index = 0;
86
87                                 for(int i=1;i<nseqs;i++){
88                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
89                                                 
90                                                 fileHandle >> name;
91                                                 matrixNames.push_back(name);
92                 
93
94                                                 //there's A LOT of repeated code throughout this method...
95                                                 if(nameMap == NULL){
96                                                                 list->set(i, name);
97                                                 
98                                                                 for(int j=0;j<i;j++){
99                                                                 
100                                                                                 if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
101                                                                                 
102                                                                                 fileHandle >> distance;
103                                                         
104                                                                                 if (distance == -1) { distance = 1000000; }
105                                                                                 else if (globaldata->sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
106                                                                 
107                                                                                 //if(distance < cutoff){
108                                                                                         dMatrix[i][j] = distance;
109                                                                                         if (distance < smallDist) { smallDist = distance; }
110                                                                                         //if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
111                                                                                         //if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
112                                                                                 //}
113                                                                                 index++;
114                                                                                 reading->update(index);
115                                                                 }
116                                 
117                                                 }
118                                                 else{
119                                                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
120                                 
121                                                                 for(int j=0;j<i;j++){
122                                                                                 fileHandle >> distance;
123                                                                                 
124                                                                                 if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
125                                 
126                                                                                 if (distance == -1) { distance = 1000000; }
127                                                                                 else if (globaldata->sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
128                                                                                 
129                                                                                 //if(distance < cutoff){
130                                                                                         if (distance < smallDist) { smallDist = distance; }
131                                                                                         
132                                                                                         int row = nameMap->get(matrixNames[i]);
133                                                                                         int col = nameMap->get(matrixNames[j]);
134                                                                                         
135                                                                                         if (row < col) {  dMatrix[col][row] = distance; }
136                                                                                         else { dMatrix[row][col] = distance; }
137                                                                                         
138                                                                                         //if (rowSmallDists[row].dist > distance) {  rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
139                                                                                         //if (rowSmallDists[col].dist > distance) {  rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
140                                                                                 //}
141                                                                                 index++;
142                                                                                 reading->update(index);
143                                                                 }
144                                                 }
145                                 }
146                 }
147                 else{
148
149                                 reading = new Progress("Reading matrix:     ", nseqs * nseqs);
150                 
151                                 int index = nseqs;
152
153                                 for(int i=1;i<nseqs;i++){
154                                                 fileHandle >> name;                
155                                                 matrixNames.push_back(name);
156                                                 
157                                                 if(nameMap == NULL){
158                                                                 list->set(i, name);
159                                                                 for(int j=0;j<nseqs;j++){
160                                                                                 fileHandle >> distance;
161                                                                                 
162                                                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
163                                                                                 
164                                                                                 if (distance == -1) { distance = 1000000; }
165                                                                                 else if (globaldata->sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
166                                                                                 
167                                                                                 if(j < i){
168                                                                                         if (distance < smallDist) { smallDist = distance; }
169                                                                                         
170                                                                                         dMatrix[i][j] = distance;
171                                                                                         //if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
172                                                                                         //if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
173                                                                                 }
174                                                                                 index++;
175                                                                                 reading->update(index);
176                                                                 }
177                                                 
178                                                 }
179                                                 else{
180                                                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
181                                 
182                                                                 for(int j=0;j<nseqs;j++){
183                                                                                 fileHandle >> distance;
184                                                                                 
185                                                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
186                                                                                 
187                                                                            if (distance == -1) { distance = 1000000; }
188                                                                                 else if (globaldata->sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.                                                        
189                                                                                 
190                                                                                 if(j < i){
191                                                                                         if (distance < smallDist) { smallDist = distance; }
192                                                                                         
193                                                                                         int row = nameMap->get(matrixNames[i]);
194                                                                                         int col = nameMap->get(matrixNames[j]);
195                                                                                         
196                                                                                         if (row < col) {  dMatrix[col][row] = distance; }
197                                                                                         else { dMatrix[row][col] = distance; }
198                                                                                         
199                                                                                         //if (rowSmallDists[row].dist > distance) {  rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
200                                                                                         //if (rowSmallDists[col].dist > distance) {  rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
201                                                                                 }
202                                                                                 index++;
203                                                                                 reading->update(index);
204                                                                 }
205                                                 }
206                                 }
207                 }
208                 
209                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
210                 
211                 reading->finish();
212                 delete reading;
213
214                 list->setLabel("0");
215                 rabund = new RAbundVector(list->getRAbundVector());
216                 
217                 fileHandle.close();
218         }
219         catch(exception& e) {
220                 m->errorOut(e, "ClusterClassic", "readPhylipFile");
221                 exit(1);
222         }
223
224 }
225 /***********************************************************************/
226 //sets smallCol and smallRow, returns distance
227 double ClusterClassic::getSmallCell() {
228         try {
229                 
230                 smallDist = aboveCutoff;
231                 smallRow = 1;
232                 smallCol = 0;
233                 
234                 vector<colDist> mins;
235                 
236                 for(int i=1;i<nseqs;i++){
237                         for(int j=0;j<i;j++){ 
238                                 if (dMatrix[i][j] < smallDist) {
239                                         mins.clear();
240                                         colDist temp(i, j, dMatrix[i][j]);
241                                         mins.push_back(temp); 
242                                         smallDist = dMatrix[i][j];
243                                 }else if (dMatrix[i][j] == smallDist) {
244                                         colDist temp(i, j, dMatrix[i][j]);
245                                         mins.push_back(temp); 
246
247                                 }
248                         }
249                 }
250                 
251                 if (mins.size() > 0) {
252                         int zrand = 0;
253                         if (mins.size() > 1) {
254                                 //pick random number between 0 and mins.size()
255                                 zrand = (int)((float)(rand()) / (RAND_MAX / (mins.size()-1) + 1));
256                         }
257                         
258                         smallRow = mins[zrand].row;
259                         smallCol = mins[zrand].col;
260                 
261                 }
262         //cout << smallRow << '\t' << smallCol << '\t' << smallDist << endl;
263                 //eliminate smallCell
264                 if (smallRow < smallCol) { dMatrix[smallCol][smallRow] = aboveCutoff; }
265                 else { dMatrix[smallRow][smallCol] = aboveCutoff; }
266                 
267                 return smallDist;
268                 
269         }
270         catch(exception& e) {
271                 m->errorOut(e, "ClusterClassic", "getSmallCell");
272                 exit(1);
273         }
274 }
275 /***********************************************************************/
276 void ClusterClassic::clusterBins(){
277         try {
278         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
279
280                 rabund->set(smallRow, rabund->get(smallRow)+rabund->get(smallCol));     
281                 rabund->set(smallCol, 0);       
282                 rabund->setLabel(toString(smallDist));
283
284         //      cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
285         }
286         catch(exception& e) {
287                 m->errorOut(e, "ClusterClassic", "clusterBins");
288                 exit(1);
289         }
290 }
291 /***********************************************************************/
292 void ClusterClassic::clusterNames(){
293         try {
294         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
295                 if (mapWanted) {  updateMap();  }
296                 
297                 list->set(smallRow, list->get(smallRow)+','+list->get(smallCol));
298                 list->set(smallCol, "");        
299                 list->setLabel(toString(smallDist));
300         
301         //      cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
302     }
303         catch(exception& e) {
304                 m->errorOut(e, "ClusterClassic", "clusterNames");
305                 exit(1);
306         }
307 }
308 /***********************************************************************/
309 void ClusterClassic::update(double& cutOFF){
310         try {
311 //cout << "before update " << endl;
312 //print();              
313                 getSmallCell();
314                 
315                 int r, c;
316                 r = smallRow; c = smallCol;
317                 //because we only store lt, we need to make sure we grab the right location
318                 //if (smallRow < smallCol) { c = smallRow; r = smallCol; } //smallRow is really our column value
319                 //else { r = smallRow; c = smallCol; } //smallRow is the row value
320                 
321                 //reset rows smallest distance
322                 //rowSmallDists[r].dist = aboveCutoff; rowSmallDists[r].row = 0; rowSmallDists[r].col = 0;
323                 //rowSmallDists[c].dist = aboveCutoff; rowSmallDists[c].row = 0; rowSmallDists[c].col = 0;
324                 
325                 //if your rows smallest distance is from smallRow or smallCol, reset
326                 //for(int i=0;i<nseqs;i++){
327                         //if ((rowSmallDists[i].row == r) || (rowSmallDists[i].row == c) || (rowSmallDists[i].col == r) || (rowSmallDists[i].col == c)) {
328                         //      rowSmallDists[i].dist = aboveCutoff; rowSmallDists[i].row = 0; rowSmallDists[i].col = 0;
329                         //}
330                 //}
331                 
332                 for(int i=0;i<nseqs;i++){
333                         if(i != r && i != c){
334                                 double distRow, distCol, newDist;
335                                 if (i > r) { distRow = dMatrix[i][r]; }
336                                 else { distRow =  dMatrix[r][i]; }
337                                 
338                                 if (i > c) { distCol = dMatrix[i][c]; dMatrix[i][c] = aboveCutoff; } //like removeCell
339                                 else { distCol =  dMatrix[c][i]; dMatrix[c][i] = aboveCutoff; }
340                                         
341                                 if(method == "furthest"){
342                                         newDist = max(distRow, distCol);
343                                 }
344                                 else if (method == "average"){
345                                         int rowBin = rabund->get(r);
346                                         int colBin = rabund->get(c);
347                                         newDist = (colBin * distCol + rowBin * distRow) / (rowBin + colBin);
348                                 }
349                                 else if (method == "weighted"){
350                                         newDist = (distCol + distRow) / 2.0;
351                                 }
352                                 else if (method == "nearest"){
353                                         newDist = min(distRow, distCol);
354                                 }
355                                 
356                                 if (i > r) { dMatrix[i][r] = newDist; }
357                                 else { dMatrix[r][i] = newDist; }
358                                 
359                                 //if (newDist < rowSmallDists[i].dist) {  rowSmallDists[i].dist = newDist; rowSmallDists[i].col = r; rowSmallDists[i].row = i;  }
360                         }
361                         //cout << "rowsmall = " << i << '\t' << rowSmallDists[i].dist << endl;  
362                 }
363                         
364                 clusterBins();
365                 clusterNames();
366         
367                 //find new small for 2 rows we just merged
368                 //colDist temp(0,0,100.0);
369                 //rowSmallDists[r] = temp;
370
371                 //for (int i = 0; i < dMatrix[r].size(); i++) {
372                 //      if (dMatrix[r][i] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[r][i]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; }
373                 //}
374                 //for (int i = dMatrix[r].size()+1; i < dMatrix.size(); i++) {
375                 //      if (dMatrix[i][dMatrix[r].size()] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[i][dMatrix[r].size()]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; }
376                 //}
377                 
378                 //rowSmallDists[c] = temp;
379                 //for (int i = 0; i < dMatrix[c].size(); i++) {
380                 //      if (dMatrix[c][i] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[c][i]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; }
381                 //}
382                 //for (int i = dMatrix[c].size()+1; i < dMatrix.size(); i++) {
383                 //      if (dMatrix[i][dMatrix[c].size()] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[i][dMatrix[c].size()]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; }
384                 //}
385                 
386                 //cout << "after update " << endl;
387                 //print();
388         }
389         catch(exception& e) {
390                 m->errorOut(e, "ClusterClassic", "update");
391                 exit(1);
392         }
393 }
394 /***********************************************************************/
395 void ClusterClassic::setMapWanted(bool f)  {  
396         try {
397                 mapWanted = f;
398                 
399                 //initialize map
400                 for (int i = 0; i < list->getNumBins(); i++) {
401                         
402                         //parse bin 
403                         string names = list->get(i);
404                         while (names.find_first_of(',') != -1) { 
405                                 //get name from bin
406                                 string name = names.substr(0,names.find_first_of(','));
407                                 //save name and bin number
408                                 seq2Bin[name] = i;
409                                 names = names.substr(names.find_first_of(',')+1, names.length());
410                         }
411                         
412                         //get last name
413                         seq2Bin[names] = i;
414                 }
415                 
416         }
417         catch(exception& e) {
418                 m->errorOut(e, "ClusterClassic", "setMapWanted");
419                 exit(1);
420         }
421 }
422 /***********************************************************************/
423 void ClusterClassic::updateMap() {
424 try {
425                 //update location of seqs in smallRow since they move to smallCol now
426                 string names = list->get(smallRow);
427                 while (names.find_first_of(',') != -1) { 
428                         //get name from bin
429                         string name = names.substr(0,names.find_first_of(','));
430                         //save name and bin number
431                         seq2Bin[name] = smallCol;
432                         names = names.substr(names.find_first_of(',')+1, names.length());
433                 }
434                         
435                 //get last name
436                 seq2Bin[names] = smallCol;
437                 
438         }
439         catch(exception& e) {
440                 m->errorOut(e, "ClusterClassic", "updateMap");
441                 exit(1);
442         }
443 }
444 /***********************************************************************/
445 void ClusterClassic::print() {
446 try {
447                 //update location of seqs in smallRow since they move to smallCol now
448                 for (int i = 0; i < dMatrix.size(); i++) {
449                         cout << "row = " << i << '\t';
450                         for (int j = 0; j < dMatrix[i].size(); j++) {
451                                 cout << dMatrix[i][j] << '\t';
452                         }
453                         cout << endl;
454                 }
455         }
456         catch(exception& e) {
457                 m->errorOut(e, "ClusterClassic", "updateMap");
458                 exit(1);
459         }
460 }
461 /***********************************************************************/
462