]> git.donarmstrong.com Git - mothur.git/blob - clusterclassic.cpp
added cluster.classic command
[mothur.git] / clusterclassic.cpp
1 /*
2  *  clusterclassic.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/29/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clusterclassic.h"
11 #include "progress.hpp"
12
13 /***********************************************************************/
14 ClusterClassic::ClusterClassic(float c, string f) : method(f), smallDist(1e6), nseqs(0) {
15         try {
16                 mapWanted = false;  //set to true by mgcluster to speed up overlap merge
17         
18                 //save so you can modify as it changes in average neighbor
19                 cutoff = c;
20                 m = MothurOut::getInstance();
21                 globaldata = GlobalData::getInstance();
22         }
23         catch(exception& e) {
24                 m->errorOut(e, "ClusterClassic", "ClusterClassic");
25                 exit(1);
26         }
27 }
28 /***********************************************************************/
29 int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
30         try {
31                 double distance;
32                 int square;
33                 string name;
34                 vector<string> matrixNames;
35                 
36                 ifstream fileHandle;
37                 m->openInputFile(filename, fileHandle);
38                 
39                 fileHandle >> nseqs >> name;
40
41                 matrixNames.push_back(name);
42
43                 if(nameMap == NULL){
44                                 list = new ListVector(nseqs);
45                                 list->set(0, name);
46                 }
47                 else{
48                                 list = new ListVector(nameMap->getListVector());
49                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
50                 }
51                 
52                 //initialize distance matrix to cutoff
53                 dMatrix.resize(nseqs);
54                 colDist temp(0, 0, cutoff);
55                 rowSmallDists.resize(nseqs, temp);
56                 for (int i = 1; i < nseqs; i++) {                       
57                         dMatrix[i].resize(i, cutoff);           
58                 }                                                                                               
59                                                                                                                 
60                 
61                 char d;
62                 while((d=fileHandle.get()) != EOF){
63
64                                 if(isalnum(d)){
65                                                 square = 1;
66                                                 fileHandle.putback(d);
67                                                 for(int i=0;i<nseqs;i++){
68                                                                 fileHandle >> distance;
69                                                 }
70                                                 break;
71                                 }
72                                 if(d == '\n'){
73                                                 square = 0;
74                                                 break;
75                                 }
76                 }
77
78                 Progress* reading;
79
80                 if(square == 0){
81
82                                 reading = new Progress("Reading matrix:     ", nseqs * (nseqs - 1) / 2);
83
84                                 int        index = 0;
85
86                                 for(int i=1;i<nseqs;i++){
87                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
88                                                 
89                                                 fileHandle >> name;
90                                                 matrixNames.push_back(name);
91                 
92
93                                                 //there's A LOT of repeated code throughout this method...
94                                                 if(nameMap == NULL){
95                                                                 list->set(i, name);
96                                                 
97                                                                 for(int j=0;j<i;j++){
98                                                                 
99                                                                                 if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
100                                                                                 
101                                                                                 fileHandle >> distance;
102                                                         
103                                                                                 if (distance == -1) { distance = 1000000; }
104                                                                                 else if (globaldata->sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
105                                                                 
106                                                                                 if(distance < cutoff){
107                                                                                         dMatrix[i][j] = distance;
108                                                                                         if (distance < smallDist) { smallDist = distance; }
109                                                                                         if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
110                                                                                         if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
111                                                                                 }
112                                                                                 index++;
113                                                                                 reading->update(index);
114                                                                 }
115                                 
116                                                 }
117                                                 else{
118                                                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
119                                 
120                                                                 for(int j=0;j<i;j++){
121                                                                                 fileHandle >> distance;
122                                                                                 
123                                                                                 if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
124                                 
125                                                                                 if (distance == -1) { distance = 1000000; }
126                                                                                 else if (globaldata->sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
127                                                                                 
128                                                                                 if(distance < cutoff){
129                                                                                         if (distance < smallDist) { smallDist = distance; }
130                                                                                         
131                                                                                         int row = nameMap->get(matrixNames[i]);
132                                                                                         int col = nameMap->get(matrixNames[j]);
133                                                                                         
134                                                                                         if (row < col) {  dMatrix[col][row] = distance; }
135                                                                                         else { dMatrix[row][col] = distance; }
136                                                                                         
137                                                                                         if (rowSmallDists[row].dist > distance) {  rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
138                                                                                         if (rowSmallDists[col].dist > distance) {  rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
139                                                                                 }
140                                                                                 index++;
141                                                                                 reading->update(index);
142                                                                 }
143                                                 }
144                                 }
145                 }
146                 else{
147
148                                 reading = new Progress("Reading matrix:     ", nseqs * nseqs);
149                 
150                                 int index = nseqs;
151
152                                 for(int i=1;i<nseqs;i++){
153                                                 fileHandle >> name;                
154                                                 matrixNames.push_back(name);
155                                                 
156                                                 if(nameMap == NULL){
157                                                                 list->set(i, name);
158                                                                 for(int j=0;j<nseqs;j++){
159                                                                                 fileHandle >> distance;
160                                                                                 
161                                                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
162                                                                                 
163                                                                                 if (distance == -1) { distance = 1000000; }
164                                                                                 else if (globaldata->sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
165                                                                                 
166                                                                                 if(distance < cutoff && j < i){
167                                                                                         if (distance < smallDist) { smallDist = distance; }
168                                                                                         
169                                                                                         dMatrix[i][j] = distance;
170                                                                                         if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
171                                                                                         if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
172                                                                                 }
173                                                                                 index++;
174                                                                                 reading->update(index);
175                                                                 }
176                                                 
177                                                 }
178                                                 else{
179                                                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
180                                 
181                                                                 for(int j=0;j<nseqs;j++){
182                                                                                 fileHandle >> distance;
183                                                                                 
184                                                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
185                                                                                 
186                                                                            if (distance == -1) { distance = 1000000; }
187                                                                                 else if (globaldata->sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.                                                        
188                                                                                 
189                                                                                 if(distance < cutoff && j < i){
190                                                                                         if (distance < smallDist) { smallDist = distance; }
191                                                                                         
192                                                                                         int row = nameMap->get(matrixNames[i]);
193                                                                                         int col = nameMap->get(matrixNames[j]);
194                                                                                         
195                                                                                         if (row < col) {  dMatrix[col][row] = distance; }
196                                                                                         else { dMatrix[row][col] = distance; }
197                                                                                         
198                                                                                         if (rowSmallDists[row].dist > distance) {  rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
199                                                                                         if (rowSmallDists[col].dist > distance) {  rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
200                                                                                 }
201                                                                                 index++;
202                                                                                 reading->update(index);
203                                                                 }
204                                                 }
205                                 }
206                 }
207                 
208                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
209                 
210                 reading->finish();
211                 delete reading;
212
213                 list->setLabel("0");
214                 rabund = new RAbundVector(list->getRAbundVector());
215                 
216                 fileHandle.close();
217         }
218         catch(exception& e) {
219                 m->errorOut(e, "ClusterClassic", "readPhylipFile");
220                 exit(1);
221         }
222
223 }
224 /***********************************************************************/
225 //sets smallCol and smallRow, returns distance
226 double ClusterClassic::getSmallCell() {
227         try {
228                 
229                 smallDist = cutoff;
230                 smallRow = 1;
231                 smallCol = 0;
232                 
233                 vector<colDist> mins;
234                 
235                 for(int i=0;i<nseqs;i++){ 
236
237                         if (rowSmallDists[i].dist < smallDist) {
238                                 mins.clear();
239                                 mins.push_back(rowSmallDists[i]); 
240                                 smallDist = rowSmallDists[i].dist;
241                         }else if (rowSmallDists[i].dist == smallDist) {
242                                 mins.push_back(rowSmallDists[i]);
243                         }
244                 }
245                 
246                 if (mins.size() > 0) {
247                         int zrand = 0;
248                         if (mins.size() > 1) {
249                                 //pick random number between 0 and mins.size()
250                                 zrand = (int)((float)(rand()) / (RAND_MAX / (mins.size()-1) + 1));
251                         }
252                         
253                         smallRow = mins[zrand].row;
254                         smallCol = mins[zrand].col;
255                 
256                 }
257         //cout << smallRow << '\t' << smallCol << '\t' << smallDist << endl;
258         
259                 if (smallRow < smallCol) { dMatrix[smallCol][smallRow] = cutoff; }
260                 else { dMatrix[smallRow][smallCol] = cutoff; }
261                 
262                 return smallDist;
263                 
264         }
265         catch(exception& e) {
266                 m->errorOut(e, "ClusterClassic", "getSmallCell");
267                 exit(1);
268         }
269 }
270 /***********************************************************************/
271 void ClusterClassic::clusterBins(){
272         try {
273         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
274
275                 rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol));     
276                 rabund->set(smallRow, 0);       
277                 rabund->setLabel(toString(smallDist));
278
279         //      cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
280         }
281         catch(exception& e) {
282                 m->errorOut(e, "ClusterClassic", "clusterBins");
283                 exit(1);
284         }
285 }
286 /***********************************************************************/
287 void ClusterClassic::clusterNames(){
288         try {
289         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
290                 if (mapWanted) {  updateMap();  }
291                 
292                 list->set(smallCol, list->get(smallRow)+','+list->get(smallCol));
293                 list->set(smallRow, "");        
294                 list->setLabel(toString(smallDist));
295         
296         //      cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
297     }
298         catch(exception& e) {
299                 m->errorOut(e, "ClusterClassic", "clusterNames");
300                 exit(1);
301         }
302 }
303 /***********************************************************************/
304 void ClusterClassic::update(double& cutOFF){
305         try {
306 //cout << "before update " << endl;
307 //print();              
308                 getSmallCell();
309                 
310                 int r, c;
311                 //because we only store lt, we need to make sure we grab the right location
312                 if (smallRow < smallCol) { c = smallRow; r = smallCol; } //smallRow is really our column value
313                 else { r = smallRow; c = smallCol; } //smallRow is the row value
314                 
315                 //reset rows smallest distance
316                 rowSmallDists[r].dist = cutoff; rowSmallDists[r].row = 0; rowSmallDists[r].col = 0;
317                 rowSmallDists[c].dist = cutoff; rowSmallDists[c].row = 0; rowSmallDists[c].col = 0;
318                 
319                 //if your rows smallest distance is from smallRow or smallCol, reset
320                 for(int i=0;i<nseqs;i++){
321                         if ((rowSmallDists[i].row == r) || (rowSmallDists[i].row == c) || (rowSmallDists[i].col == r) || (rowSmallDists[i].col == c)) {
322                                 rowSmallDists[i].dist = cutoff; rowSmallDists[i].row = 0; rowSmallDists[i].col = 0;
323                         }
324                 }
325                 
326                 for(int i=0;i<nseqs;i++){
327                         if(i != r && i != c){
328                                 double distRow, distCol, newDist;
329                                 if (i > r) { distRow = dMatrix[i][r]; }
330                                 else { distRow =  dMatrix[r][i]; }
331                                 
332                                 if (i > c) { distCol = dMatrix[i][c]; dMatrix[i][c] = cutoff; } //like removeCell
333                                 else { distCol =  dMatrix[c][i]; dMatrix[c][i] = cutoff; }
334                                         
335                                 if(method == "furthest"){
336                                         newDist = max(distRow, distCol);
337                                 }
338                                 else if (method == "average"){
339                                         if ((distRow == cutoff) && (distCol == cutoff)) { //you are merging with a value above cutoff
340                                                 newDist = cutoff; //eliminate value
341                                         }else if ((distRow == cutoff) && (distCol != cutoff)) { //you are merging with a value above cutoff
342                                                 newDist = cutoff; //eliminate value
343                                                 if (cutOFF > distCol) { cutOFF = distCol; }
344                                         }else if ((distRow != cutoff) && (distCol == cutoff)) { //you are merging with a value above cutoff
345                                                 newDist = cutoff; //eliminate value
346                                                 if (cutOFF > distRow) { cutOFF = distRow; }
347                                         }else {
348                                                 int rowBin = rabund->get(r);
349                                                 int colBin = rabund->get(c);
350                                                 newDist = (colBin * distCol + rowBin * distRow) / (rowBin + colBin);
351                                         }
352                                 }
353                                 else if (method == "weighted"){
354                                         if ((distRow == cutoff) && (distCol == cutoff)) { //you are merging with a value above cutoff
355                                                 newDist = cutoff; //eliminate value
356                                         }else if ((distRow == cutoff) && (distCol != cutoff)) { //you are merging with a value above cutoff
357                                                 newDist = cutoff; //eliminate value
358                                                 if (cutOFF > distCol) { cutOFF = distCol; }
359                                         }else if ((distRow != cutoff) && (distCol == cutoff)) { //you are merging with a value above cutoff
360                                                 newDist = cutoff; //eliminate value
361                                                 if (cutOFF > distRow) { cutOFF = distRow; }
362                                         }else {
363                                                 newDist = (distCol + distRow) / 2.0;
364                                         }
365                                 }
366                                 else if (method == "nearest"){
367                                         newDist = min(distRow, distCol);
368                                 }
369                                 
370                                 if (i > r) { dMatrix[i][r] = newDist; }
371                                 else { dMatrix[r][i] = newDist; }
372                                 
373                                 if (newDist < rowSmallDists[i].dist) {  rowSmallDists[i].dist = newDist; rowSmallDists[i].col = r; rowSmallDists[i].row = i;  }
374                         }
375                         //cout << "rowsmall = " << i << '\t' << rowSmallDists[i].dist << endl;  
376                 }
377                         
378                 clusterBins();
379                 clusterNames();
380         
381                 //find new small for 2 rows we just merged
382                 colDist temp(0,0,100.0);
383                 rowSmallDists[r] = temp;
384
385                 for (int i = 0; i < dMatrix[r].size(); i++) {
386                         if (dMatrix[r][i] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[r][i]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; }
387                 }
388                 for (int i = dMatrix[r].size()+1; i < dMatrix.size(); i++) {
389                         if (dMatrix[i][dMatrix[r].size()] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[i][dMatrix[r].size()]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; }
390                 }
391                 
392                 rowSmallDists[c] = temp;
393                 for (int i = 0; i < dMatrix[c].size(); i++) {
394                         if (dMatrix[c][i] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[c][i]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; }
395                 }
396                 for (int i = dMatrix[c].size()+1; i < dMatrix.size(); i++) {
397                         if (dMatrix[i][dMatrix[c].size()] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[i][dMatrix[c].size()]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; }
398                 }
399                 
400                 //cout << "after update " << endl;
401                 //print();
402         }
403         catch(exception& e) {
404                 m->errorOut(e, "ClusterClassic", "update");
405                 exit(1);
406         }
407 }
408 /***********************************************************************/
409 void ClusterClassic::setMapWanted(bool f)  {  
410         try {
411                 mapWanted = f;
412                 
413                 //initialize map
414                 for (int i = 0; i < list->getNumBins(); i++) {
415                         
416                         //parse bin 
417                         string names = list->get(i);
418                         while (names.find_first_of(',') != -1) { 
419                                 //get name from bin
420                                 string name = names.substr(0,names.find_first_of(','));
421                                 //save name and bin number
422                                 seq2Bin[name] = i;
423                                 names = names.substr(names.find_first_of(',')+1, names.length());
424                         }
425                         
426                         //get last name
427                         seq2Bin[names] = i;
428                 }
429                 
430         }
431         catch(exception& e) {
432                 m->errorOut(e, "ClusterClassic", "setMapWanted");
433                 exit(1);
434         }
435 }
436 /***********************************************************************/
437 void ClusterClassic::updateMap() {
438 try {
439                 //update location of seqs in smallRow since they move to smallCol now
440                 string names = list->get(smallRow);
441                 while (names.find_first_of(',') != -1) { 
442                         //get name from bin
443                         string name = names.substr(0,names.find_first_of(','));
444                         //save name and bin number
445                         seq2Bin[name] = smallCol;
446                         names = names.substr(names.find_first_of(',')+1, names.length());
447                 }
448                         
449                 //get last name
450                 seq2Bin[names] = smallCol;
451                 
452         }
453         catch(exception& e) {
454                 m->errorOut(e, "ClusterClassic", "updateMap");
455                 exit(1);
456         }
457 }
458 /***********************************************************************/
459 void ClusterClassic::print() {
460 try {
461                 //update location of seqs in smallRow since they move to smallCol now
462                 for (int i = 0; i < dMatrix.size(); i++) {
463                         cout << "row = " << i << '\t';
464                         for (int j = 0; j < dMatrix[i].size(); j++) {
465                                 cout << dMatrix[i][j] << '\t';
466                         }
467                         cout << endl;
468                 }
469         }
470         catch(exception& e) {
471                 m->errorOut(e, "ClusterClassic", "updateMap");
472                 exit(1);
473         }
474 }
475 /***********************************************************************/
476