]> git.donarmstrong.com Git - mothur.git/blob - clusterclassic.cpp
changing command name classify.shared to classifyrf.shared
[mothur.git] / clusterclassic.cpp
1 /*
2  *  clusterclassic.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/29/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clusterclassic.h"
11 #include "progress.hpp"
12
13 /***********************************************************************/
14 ClusterClassic::ClusterClassic(float c, string f, bool s) : method(f), smallDist(1e6), nseqs(0), sim(s) {
15         try {
16                 mapWanted = false;  //set to true by mgcluster to speed up overlap merge
17         
18                 //save so you can modify as it changes in average neighbor
19                 cutoff = c;
20                 aboveCutoff = cutoff + 10000.0;
21                 m = MothurOut::getInstance();
22         if(method == "furthest")        {   tag = "fn";   }
23         else if (method == "average")   {   tag = "an";   }
24         else if (method == "weighted")  {   tag = "wn";   }        
25         else if (method == "nearest")   {   tag = "nn";   }
26         }
27         catch(exception& e) {
28                 m->errorOut(e, "ClusterClassic", "ClusterClassic");
29                 exit(1);
30         }
31 }
32 /***********************************************************************/
33 int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
34         try {
35                 double distance;
36                 int square;
37                 string name;
38                 vector<string> matrixNames;
39                 
40                 ifstream fileHandle;
41                 m->openInputFile(filename, fileHandle);
42                 
43         string numTest;
44                 fileHandle >> numTest >> name;
45         
46         if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
47         else { convert(numTest, nseqs); }
48
49
50                 matrixNames.push_back(name);
51
52                 if(nameMap == NULL){
53                                 list = new ListVector(nseqs);
54                                 list->set(0, name);
55                 }
56                 else{
57                                 list = new ListVector(nameMap->getListVector());
58                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
59                 }
60                 
61                 //initialize distance matrix to cutoff
62                 dMatrix.resize(nseqs);
63                 //colDist temp(0, 0, aboveCutoff);
64                 //rowSmallDists.resize(nseqs, temp);
65                 for (int i = 1; i < nseqs; i++) {                       
66                         dMatrix[i].resize(i, aboveCutoff);              
67                 }                                                                                               
68                                                                                                                 
69                 
70                 char d;
71                 while((d=fileHandle.get()) != EOF){
72
73                                 if(isalnum(d)){
74                                                 square = 1;
75                                                 fileHandle.putback(d);
76                                                 for(int i=0;i<nseqs;i++){
77                                                                 fileHandle >> distance;
78                                                 }
79                                                 break;
80                                 }
81                                 if(d == '\n'){
82                                                 square = 0;
83                                                 break;
84                                 }
85                 }
86
87                 Progress* reading;
88
89                 if(square == 0){
90
91                                 reading = new Progress("Reading matrix:     ", nseqs * (nseqs - 1) / 2);
92
93                                 int        index = 0;
94
95                                 for(int i=1;i<nseqs;i++){
96                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
97                                                 
98                                                 fileHandle >> name;
99                                                 matrixNames.push_back(name);
100                 
101
102                                                 //there's A LOT of repeated code throughout this method...
103                                                 if(nameMap == NULL){
104                                                                 list->set(i, name);
105                                                 
106                                                                 for(int j=0;j<i;j++){
107                                                                 
108                                                                                 if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
109                                                                                 
110                                                                                 fileHandle >> distance;
111                                                         
112                                                                                 if (distance == -1) { distance = 1000000; }
113                                                                                 else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
114                                                                 
115                                                                                 //if(distance < cutoff){
116                                                                                         dMatrix[i][j] = distance;
117                                                                                         if (distance < smallDist) { smallDist = distance; }
118                                                                                         //if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
119                                                                                         //if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
120                                                                                 //}
121                                                                                 index++;
122                                                                                 reading->update(index);
123                                                                 }
124                                 
125                                                 }
126                                                 else{
127                                                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
128                                 
129                                                                 for(int j=0;j<i;j++){
130                                                                                 fileHandle >> distance;
131                                                                                 
132                                                                                 if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
133                                 
134                                                                                 if (distance == -1) { distance = 1000000; }
135                                                                                 else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
136                                                                                 
137                                                                                 //if(distance < cutoff){
138                                                                                         if (distance < smallDist) { smallDist = distance; }
139                                                                                         
140                                                                                         int row = nameMap->get(matrixNames[i]);
141                                                                                         int col = nameMap->get(matrixNames[j]);
142                                                                                         
143                                                                                         if (row < col) {  dMatrix[col][row] = distance; }
144                                                                                         else { dMatrix[row][col] = distance; }
145                                                                                         
146                                                                                         //if (rowSmallDists[row].dist > distance) {  rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
147                                                                                         //if (rowSmallDists[col].dist > distance) {  rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
148                                                                                 //}
149                                                                                 index++;
150                                                                                 reading->update(index);
151                                                                 }
152                                                 }
153                                 }
154                 }
155                 else{
156
157                                 reading = new Progress("Reading matrix:     ", nseqs * nseqs);
158                 
159                                 int index = nseqs;
160
161                                 for(int i=1;i<nseqs;i++){
162                                                 fileHandle >> name;                
163                                                 matrixNames.push_back(name);
164                                                 
165                                                 if(nameMap == NULL){
166                                                                 list->set(i, name);
167                                                                 for(int j=0;j<nseqs;j++){
168                                                                                 fileHandle >> distance;
169                                                                                 
170                                                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
171                                                                                 
172                                                                                 if (distance == -1) { distance = 1000000; }
173                                                                                 else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
174                                                                                 
175                                                                                 if(j < i){
176                                                                                         if (distance < smallDist) { smallDist = distance; }
177                                                                                         
178                                                                                         dMatrix[i][j] = distance;
179                                                                                         //if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
180                                                                                         //if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
181                                                                                 }
182                                                                                 index++;
183                                                                                 reading->update(index);
184                                                                 }
185                                                 
186                                                 }
187                                                 else{
188                                                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
189                                 
190                                                                 for(int j=0;j<nseqs;j++){
191                                                                                 fileHandle >> distance;
192                                                                                 
193                                                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
194                                                                                 
195                                                                            if (distance == -1) { distance = 1000000; }
196                                                                                 else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.                                                        
197                                                                                 
198                                                                                 if(j < i){
199                                                                                         if (distance < smallDist) { smallDist = distance; }
200                                                                                         
201                                                                                         int row = nameMap->get(matrixNames[i]);
202                                                                                         int col = nameMap->get(matrixNames[j]);
203                                                                                         
204                                                                                         if (row < col) {  dMatrix[col][row] = distance; }
205                                                                                         else { dMatrix[row][col] = distance; }
206                                                                                         
207                                                                                         //if (rowSmallDists[row].dist > distance) {  rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
208                                                                                         //if (rowSmallDists[col].dist > distance) {  rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
209                                                                                 }
210                                                                                 index++;
211                                                                                 reading->update(index);
212                                                                 }
213                                                 }
214                                 }
215                 }
216                 
217                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
218                 
219                 reading->finish();
220                 delete reading;
221
222                 list->setLabel("0");
223                 rabund = new RAbundVector(list->getRAbundVector());
224                 
225                 fileHandle.close();
226                 
227                 return 0;
228         }
229         catch(exception& e) {
230                 m->errorOut(e, "ClusterClassic", "readPhylipFile");
231                 exit(1);
232         }
233
234 }
235 /***********************************************************************/
236 int ClusterClassic::readPhylipFile(string filename, CountTable* countTable) {
237         try {
238                 double distance;
239                 int square;
240                 string name;
241                 vector<string> matrixNames;
242                 
243                 ifstream fileHandle;
244                 m->openInputFile(filename, fileHandle);
245                 
246         string numTest;
247                 fileHandle >> numTest >> name;
248         
249         if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
250         else { convert(numTest, nseqs); }
251         
252         
253                 matrixNames.push_back(name);
254         
255                 if(countTable == NULL){
256             list = new ListVector(nseqs);
257             list->set(0, name);
258         }
259         else{  list = new ListVector(countTable->getListVector()); }
260
261                 
262                 //initialize distance matrix to cutoff
263                 dMatrix.resize(nseqs);
264                 //rowSmallDists.resize(nseqs, temp);
265                 for (int i = 1; i < nseqs; i++) {                       
266                         dMatrix[i].resize(i, aboveCutoff);              
267                 }                                                                                               
268         
269                 
270                 char d;
271                 while((d=fileHandle.get()) != EOF){
272             
273             if(isalnum(d)){
274                 square = 1;
275                 fileHandle.putback(d);
276                 for(int i=0;i<nseqs;i++){
277                     fileHandle >> distance;
278                 }
279                 break;
280             }
281             if(d == '\n'){
282                 square = 0;
283                 break;
284             }
285                 }
286         
287                 Progress* reading;
288         
289                 if(square == 0){
290             
291             reading = new Progress("Reading matrix:     ", nseqs * (nseqs - 1) / 2);
292             
293             int        index = 0;
294             
295             for(int i=1;i<nseqs;i++){
296                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
297                 
298                 fileHandle >> name;
299                 matrixNames.push_back(name);
300                 
301                 
302                 //there's A LOT of repeated code throughout this method...
303                  if(countTable == NULL){
304                     list->set(i, name);
305                     
306                     for(int j=0;j<i;j++){
307                         
308                         if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
309                         
310                         fileHandle >> distance;
311                         
312                         if (distance == -1) { distance = 1000000; }
313                         else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
314                         
315                         //if(distance < cutoff){
316                         dMatrix[i][j] = distance;
317                         if (distance < smallDist) { smallDist = distance; }
318                         //if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
319                         //if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
320                         //}
321                         index++;
322                         reading->update(index);
323                     }
324                     
325                 }
326                 else{
327                     for(int j=0;j<i;j++){
328                         fileHandle >> distance;
329                         
330                         if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
331                         
332                         if (distance == -1) { distance = 1000000; }
333                         else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
334                         
335                         if (distance < smallDist) { smallDist = distance; }
336                         
337                         int row = countTable->get(matrixNames[i]);
338                         int col = countTable->get(matrixNames[j]);
339                        
340                         if (row < col) {  dMatrix[col][row] = distance; }
341                         else { dMatrix[row][col] = distance; }
342                         
343                         index++;
344                         reading->update(index);
345                     }
346                 }
347             }
348                 }
349                 else{
350             
351             reading = new Progress("Reading matrix:     ", nseqs * nseqs);
352             
353             int index = nseqs;
354             
355             for(int i=1;i<nseqs;i++){
356                 fileHandle >> name;                
357                 matrixNames.push_back(name);
358                 
359                 if(countTable == NULL){
360                     list->set(i, name);
361                     for(int j=0;j<nseqs;j++){
362                         fileHandle >> distance;
363                         
364                         if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
365                         
366                         if (distance == -1) { distance = 1000000; }
367                         else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
368                         
369                         if(j < i){
370                             if (distance < smallDist) { smallDist = distance; }
371                             
372                             dMatrix[i][j] = distance;
373                         }
374                         index++;
375                         reading->update(index);
376                     }
377                     
378                 }
379                 else{
380                     
381                     for(int j=0;j<nseqs;j++){
382                         fileHandle >> distance;
383                         
384                         if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
385                         
386                         if (distance == -1) { distance = 1000000; }
387                         else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.                                                        
388                         
389                         if(j < i){
390                             if (distance < smallDist) { smallDist = distance; }
391                             
392                             int row = countTable->get(matrixNames[i]);
393                             int col = countTable->get(matrixNames[j]);
394                             
395                             if (row < col) {  dMatrix[col][row] = distance; }
396                             else { dMatrix[row][col] = distance; }
397                         }
398                         index++;
399                         reading->update(index);
400                     }
401                 }
402             }
403                 }
404                 
405                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
406                 
407                 reading->finish();
408                 delete reading;
409         
410                 list->setLabel("0");
411                 rabund = new RAbundVector();
412         rabund->setLabel(list->getLabel());  
413         
414         for(int i = 0; i < list->getNumBins(); i++) { 
415             if (m->control_pressed) { break; }
416             vector<string> binNames;
417             string bin = list->get(i);
418             m->splitAtComma(bin, binNames);
419             int total = 0;
420             for (int j = 0; j < binNames.size(); j++) { total += countTable->getNumSeqs(binNames[j]);  }
421             rabund->push_back(total);   
422         }
423                 
424                 fileHandle.close();
425                 
426                 return 0;
427         }
428         catch(exception& e) {
429                 m->errorOut(e, "ClusterClassic", "readPhylipFile");
430                 exit(1);
431         }
432     
433 }
434 /***********************************************************************/
435 //sets smallCol and smallRow, returns distance
436 double ClusterClassic::getSmallCell() {
437         try {
438                         
439                 smallDist = aboveCutoff;
440                 smallRow = 1;
441                 smallCol = 0;
442                 
443                 vector<colDist> mins;
444                 
445                 for(int i=1;i<nseqs;i++){
446                         for(int j=0;j<i;j++){ 
447                                 if (dMatrix[i][j] < smallDist) {
448                                         mins.clear();
449                                         colDist temp(i, j, dMatrix[i][j]);
450                                         mins.push_back(temp); 
451                                         smallDist = dMatrix[i][j];
452                                 }else if (dMatrix[i][j] == smallDist) {
453                                         colDist temp(i, j, dMatrix[i][j]);
454                                         mins.push_back(temp); 
455
456                                 }
457                         }
458                 }
459                 
460                 if (mins.size() > 0) {
461                         int zrand = 0;
462                         if (mins.size() > 1) {
463                                 //pick random number between 0 and mins.size()
464                                 zrand = (int)((float)(rand()) / (RAND_MAX / (mins.size()-1) + 1));
465                         }
466                         
467                         smallRow = mins[zrand].row;
468                         smallCol = mins[zrand].col;
469                 
470                 }
471         //cout << smallRow << '\t' << smallCol << '\t' << smallDist << endl;
472                 //eliminate smallCell
473                 if (smallRow < smallCol) { dMatrix[smallCol][smallRow] = aboveCutoff; }
474                 else { dMatrix[smallRow][smallCol] = aboveCutoff; }
475                 
476                 return smallDist;
477                 
478         }
479         catch(exception& e) {
480                 m->errorOut(e, "ClusterClassic", "getSmallCell");
481                 exit(1);
482         }
483 }
484 /***********************************************************************/
485 void ClusterClassic::clusterBins(){
486         try {
487         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
488
489                 rabund->set(smallRow, rabund->get(smallRow)+rabund->get(smallCol));     
490                 rabund->set(smallCol, 0);       
491                 /*for (int i = smallCol+1; i < rabund->size(); i++) {
492                         rabund->set((i-1), rabund->get(i));
493                 }
494                 rabund->resize((rabund->size()-1));*/
495                 rabund->setLabel(toString(smallDist));
496
497         //      cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
498         }
499         catch(exception& e) {
500                 m->errorOut(e, "ClusterClassic", "clusterBins");
501                 exit(1);
502         }
503 }
504 /***********************************************************************/
505 void ClusterClassic::clusterNames(){
506         try {
507         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
508                 if (mapWanted) {  updateMap();  }
509                 
510                 list->set(smallRow, list->get(smallRow)+','+list->get(smallCol));
511                 list->set(smallCol, "");        
512                 /*for (int i = smallCol+1; i < list->size(); i++) {
513                         list->set((i-1), list->get(i));
514                 }
515                 list->resize((list->size()-1));*/
516                 list->setLabel(toString(smallDist));
517         
518         //      cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
519     }
520         catch(exception& e) {
521                 m->errorOut(e, "ClusterClassic", "clusterNames");
522                 exit(1);
523         }
524 }
525 /***********************************************************************/
526 void ClusterClassic::update(double& cutOFF){
527         try {
528 //print();              
529                 getSmallCell();
530                 
531                 int r, c;
532                 r = smallRow; c = smallCol;
533                                 
534                 for(int i=0;i<nseqs;i++){
535                         if(i != r && i != c){
536                                 double distRow, distCol, newDist;
537                                 if (i > r) { distRow = dMatrix[i][r]; }
538                                 else { distRow =  dMatrix[r][i]; }
539
540                                 if (i > c) { distCol = dMatrix[i][c]; dMatrix[i][c] = aboveCutoff; } //like removeCell
541                                 else { distCol =  dMatrix[c][i]; dMatrix[c][i] = aboveCutoff; }
542                                 
543                                 if(method == "furthest"){
544                                         newDist = max(distRow, distCol);
545                                 }
546                                 else if (method == "average"){
547                                         int rowBin = rabund->get(r);
548                                         int colBin = rabund->get(c);
549                                         newDist = (colBin * distCol + rowBin * distRow) / (rowBin + colBin);
550                                 }
551                                 else if (method == "weighted"){
552                                         newDist = (distCol + distRow) / 2.0;
553                                 }
554                                 else if (method == "nearest"){
555                                         newDist = min(distRow, distCol);
556                                 }
557                                 //cout << "newDist = " << newDist << endl;      
558                                 if (i > r) { dMatrix[i][r] = newDist; }
559                                 else { dMatrix[r][i] = newDist; }
560                                 
561                         }
562                 }
563                         
564                 clusterBins();
565                 clusterNames();
566                 
567                 //resize each row
568                 /*for(int i=0;i<nseqs;i++){
569                         for(int j=c+1;j<dMatrix[i].size();j++){
570                                 dMatrix[i][j-1]=dMatrix[i][j];
571                         }
572                 }                       
573                 
574                 //resize each col
575                 for(int i=c+1;i<nseqs;i++){
576                         for(int j=0;j < dMatrix[i-1].size();j++){
577                                 dMatrix[i-1][j]=dMatrix[i][j];
578                         }
579                 }       
580                 
581                 nseqs--;
582                 dMatrix.pop_back();*/
583
584         }
585         catch(exception& e) {
586                 m->errorOut(e, "ClusterClassic", "update");
587                 exit(1);
588         }
589 }
590 /***********************************************************************/
591 void ClusterClassic::setMapWanted(bool f)  {  
592         try {
593                 mapWanted = f;
594                 
595                 //initialize map
596                 for (int i = 0; i < list->getNumBins(); i++) {
597                         
598                         //parse bin 
599                         string names = list->get(i);
600                         vector<string> binnames;
601             m->splitAtComma(names, binnames);
602             for (int j = 0; j < binnames.size(); j++) {
603                                 //save name and bin number
604                                 seq2Bin[binnames[j]] = i;
605                         }
606                 }
607                 
608         }
609         catch(exception& e) {
610                 m->errorOut(e, "ClusterClassic", "setMapWanted");
611                 exit(1);
612         }
613 }
614 /***********************************************************************/
615 void ClusterClassic::updateMap() {
616 try {
617                 //update location of seqs in smallRow since they move to smallCol now
618         string names = list->get(smallRow);
619         vector<string> binnames;
620         m->splitAtComma(names, binnames);
621         for (int j = 0; j < binnames.size(); j++) {
622             //save name and bin number
623             seq2Bin[binnames[j]] = smallCol;
624         }
625                 
626         }
627         catch(exception& e) {
628                 m->errorOut(e, "ClusterClassic", "updateMap");
629                 exit(1);
630         }
631 }
632 /***********************************************************************/
633 void ClusterClassic::print() {
634 try {
635                 //update location of seqs in smallRow since they move to smallCol now
636                 for (int i = 0; i < dMatrix.size(); i++) {
637                         m->mothurOut("row = " + toString(i) + "\t");
638                         for (int j = 0; j < dMatrix[i].size(); j++) {
639                                 m->mothurOut(toString(dMatrix[i][j]) + "\t");
640                         }
641                         m->mothurOutEndLine();
642                 }
643         }
644         catch(exception& e) {
645                 m->errorOut(e, "ClusterClassic", "updateMap");
646                 exit(1);
647         }
648 }
649 /***********************************************************************/
650