]> git.donarmstrong.com Git - mothur.git/blob - clusterclassic.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / clusterclassic.cpp
1 /*
2  *  clusterclassic.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/29/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clusterclassic.h"
11 #include "progress.hpp"
12
13 /***********************************************************************/
14 ClusterClassic::ClusterClassic(float c, string f, bool s) : method(f), smallDist(1e6), nseqs(0), sim(s) {
15         try {
16                 mapWanted = false;  //set to true by mgcluster to speed up overlap merge
17         
18                 //save so you can modify as it changes in average neighbor
19                 cutoff = c;
20                 aboveCutoff = cutoff + 10000.0;
21                 m = MothurOut::getInstance();
22         if(method == "furthest")        {   tag = "fn";   }
23         else if (method == "average")   {   tag = "an";   }
24         else if (method == "weighted")  {   tag = "wn";   }        
25         else if (method == "nearest")   {   tag = "nn";   }
26         }
27         catch(exception& e) {
28                 m->errorOut(e, "ClusterClassic", "ClusterClassic");
29                 exit(1);
30         }
31 }
32 /***********************************************************************/
33 int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
34         try {
35                 double distance;
36                 int square;
37                 string name;
38                 vector<string> matrixNames;
39                 
40                 ifstream fileHandle;
41                 m->openInputFile(filename, fileHandle);
42                 
43         string numTest;
44                 fileHandle >> numTest >> name;
45         
46         if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
47         else { convert(numTest, nseqs); }
48
49
50                 matrixNames.push_back(name);
51
52                 if(nameMap == NULL){
53                                 list = new ListVector(nseqs);
54                                 list->set(0, name);
55                 }
56                 else{
57                                 list = new ListVector(nameMap->getListVector());
58                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
59                 }
60                 
61                 //initialize distance matrix to cutoff
62                 dMatrix.resize(nseqs);
63                 //colDist temp(0, 0, aboveCutoff);
64                 //rowSmallDists.resize(nseqs, temp);
65                 for (int i = 1; i < nseqs; i++) {                       
66                         dMatrix[i].resize(i, aboveCutoff);              
67                 }                                                                                               
68                                                                                                                 
69                 
70                 char d;
71                 while((d=fileHandle.get()) != EOF){
72
73                                 if(isalnum(d)){
74                                                 square = 1;
75                                                 fileHandle.putback(d);
76                                                 for(int i=0;i<nseqs;i++){
77                                                                 fileHandle >> distance;
78                                                 }
79                                                 break;
80                                 }
81                                 if(d == '\n'){
82                                                 square = 0;
83                                                 break;
84                                 }
85                 }
86
87                 Progress* reading;
88
89                 if(square == 0){
90
91                                 reading = new Progress("Reading matrix:     ", nseqs * (nseqs - 1) / 2);
92
93                                 int        index = 0;
94
95                                 for(int i=1;i<nseqs;i++){
96                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
97                                                 
98                                                 fileHandle >> name;
99                                                 matrixNames.push_back(name);
100                 
101
102                                                 //there's A LOT of repeated code throughout this method...
103                                                 if(nameMap == NULL){
104                                                                 list->set(i, name);
105                                                 
106                                                                 for(int j=0;j<i;j++){
107                                                                 
108                                                                                 if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
109                                                                                 
110                                                                                 fileHandle >> distance;
111                                                         
112                                                                                 if (distance == -1) { distance = 1000000; }
113                                                                                 else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
114                                                                 
115                                                                                 //if(distance < cutoff){
116                                                                                         dMatrix[i][j] = distance;
117                                                                                         if (distance < smallDist) { smallDist = distance; }
118                                                                                         //if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
119                                                                                         //if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
120                                                                                 //}
121                                                                                 index++;
122                                                                                 reading->update(index);
123                                                                 }
124                                 
125                                                 }
126                                                 else{
127                                                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
128                                 
129                                                                 for(int j=0;j<i;j++){
130                                                                                 fileHandle >> distance;
131                                                                                 
132                                                                                 if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
133                                 
134                                                                                 if (distance == -1) { distance = 1000000; }
135                                                                                 else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
136                                                                                 
137                                                                                 //if(distance < cutoff){
138                                                                                         if (distance < smallDist) { smallDist = distance; }
139                                                                                         
140                                                                                         int row = nameMap->get(matrixNames[i]);
141                                                                                         int col = nameMap->get(matrixNames[j]);
142                                                                                         
143                                                                                         if (row < col) {  dMatrix[col][row] = distance; }
144                                                                                         else { dMatrix[row][col] = distance; }
145                                                                                         
146                                                                                         //if (rowSmallDists[row].dist > distance) {  rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
147                                                                                         //if (rowSmallDists[col].dist > distance) {  rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
148                                                                                 //}
149                                                                                 index++;
150                                                                                 reading->update(index);
151                                                                 }
152                                                 }
153                                 }
154                 }
155                 else{
156
157                                 reading = new Progress("Reading matrix:     ", nseqs * nseqs);
158                 
159                                 int index = nseqs;
160
161                                 for(int i=1;i<nseqs;i++){
162                                                 fileHandle >> name;                
163                                                 matrixNames.push_back(name);
164                                                 
165                                                 if(nameMap == NULL){
166                                                                 list->set(i, name);
167                                                                 for(int j=0;j<nseqs;j++){
168                                                                                 fileHandle >> distance;
169                                                                                 
170                                                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
171                                                                                 
172                                                                                 if (distance == -1) { distance = 1000000; }
173                                                                                 else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
174                                                                                 
175                                                                                 if(j < i){
176                                                                                         if (distance < smallDist) { smallDist = distance; }
177                                                                                         
178                                                                                         dMatrix[i][j] = distance;
179                                                                                         //if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
180                                                                                         //if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
181                                                                                 }
182                                                                                 index++;
183                                                                                 reading->update(index);
184                                                                 }
185                                                 
186                                                 }
187                                                 else{
188                                                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
189                                 
190                                                                 for(int j=0;j<nseqs;j++){
191                                                                                 fileHandle >> distance;
192                                                                                 
193                                                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
194                                                                                 
195                                                                            if (distance == -1) { distance = 1000000; }
196                                                                                 else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.                                                        
197                                                                                 
198                                                                                 if(j < i){
199                                                                                         if (distance < smallDist) { smallDist = distance; }
200                                                                                         
201                                                                                         int row = nameMap->get(matrixNames[i]);
202                                                                                         int col = nameMap->get(matrixNames[j]);
203                                                                                         
204                                                                                         if (row < col) {  dMatrix[col][row] = distance; }
205                                                                                         else { dMatrix[row][col] = distance; }
206                                                                                         
207                                                                                         //if (rowSmallDists[row].dist > distance) {  rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
208                                                                                         //if (rowSmallDists[col].dist > distance) {  rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
209                                                                                 }
210                                                                                 index++;
211                                                                                 reading->update(index);
212                                                                 }
213                                                 }
214                                 }
215                 }
216                 
217                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
218                 
219                 reading->finish();
220                 delete reading;
221
222                 list->setLabel("0");
223                 rabund = new RAbundVector(list->getRAbundVector());
224                 
225                 fileHandle.close();
226                 
227                 return 0;
228         }
229         catch(exception& e) {
230                 m->errorOut(e, "ClusterClassic", "readPhylipFile");
231                 exit(1);
232         }
233
234 }
235 /***********************************************************************/
236 //sets smallCol and smallRow, returns distance
237 double ClusterClassic::getSmallCell() {
238         try {
239                         
240                 smallDist = aboveCutoff;
241                 smallRow = 1;
242                 smallCol = 0;
243                 
244                 vector<colDist> mins;
245                 
246                 for(int i=1;i<nseqs;i++){
247                         for(int j=0;j<i;j++){ 
248                                 if (dMatrix[i][j] < smallDist) {
249                                         mins.clear();
250                                         colDist temp(i, j, dMatrix[i][j]);
251                                         mins.push_back(temp); 
252                                         smallDist = dMatrix[i][j];
253                                 }else if (dMatrix[i][j] == smallDist) {
254                                         colDist temp(i, j, dMatrix[i][j]);
255                                         mins.push_back(temp); 
256
257                                 }
258                         }
259                 }
260                 
261                 if (mins.size() > 0) {
262                         int zrand = 0;
263                         if (mins.size() > 1) {
264                                 //pick random number between 0 and mins.size()
265                                 zrand = (int)((float)(rand()) / (RAND_MAX / (mins.size()-1) + 1));
266                         }
267                         
268                         smallRow = mins[zrand].row;
269                         smallCol = mins[zrand].col;
270                 
271                 }
272         //cout << smallRow << '\t' << smallCol << '\t' << smallDist << endl;
273                 //eliminate smallCell
274                 if (smallRow < smallCol) { dMatrix[smallCol][smallRow] = aboveCutoff; }
275                 else { dMatrix[smallRow][smallCol] = aboveCutoff; }
276                 
277                 return smallDist;
278                 
279         }
280         catch(exception& e) {
281                 m->errorOut(e, "ClusterClassic", "getSmallCell");
282                 exit(1);
283         }
284 }
285 /***********************************************************************/
286 void ClusterClassic::clusterBins(){
287         try {
288         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
289
290                 rabund->set(smallRow, rabund->get(smallRow)+rabund->get(smallCol));     
291                 rabund->set(smallCol, 0);       
292                 /*for (int i = smallCol+1; i < rabund->size(); i++) {
293                         rabund->set((i-1), rabund->get(i));
294                 }
295                 rabund->resize((rabund->size()-1));*/
296                 rabund->setLabel(toString(smallDist));
297
298         //      cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
299         }
300         catch(exception& e) {
301                 m->errorOut(e, "ClusterClassic", "clusterBins");
302                 exit(1);
303         }
304 }
305 /***********************************************************************/
306 void ClusterClassic::clusterNames(){
307         try {
308         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
309                 if (mapWanted) {  updateMap();  }
310                 
311                 list->set(smallRow, list->get(smallRow)+','+list->get(smallCol));
312                 list->set(smallCol, "");        
313                 /*for (int i = smallCol+1; i < list->size(); i++) {
314                         list->set((i-1), list->get(i));
315                 }
316                 list->resize((list->size()-1));*/
317                 list->setLabel(toString(smallDist));
318         
319         //      cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
320     }
321         catch(exception& e) {
322                 m->errorOut(e, "ClusterClassic", "clusterNames");
323                 exit(1);
324         }
325 }
326 /***********************************************************************/
327 void ClusterClassic::update(double& cutOFF){
328         try {
329 //print();              
330                 getSmallCell();
331                 
332                 int r, c;
333                 r = smallRow; c = smallCol;
334                                 
335                 for(int i=0;i<nseqs;i++){
336                         if(i != r && i != c){
337                                 double distRow, distCol, newDist;
338                                 if (i > r) { distRow = dMatrix[i][r]; }
339                                 else { distRow =  dMatrix[r][i]; }
340
341                                 if (i > c) { distCol = dMatrix[i][c]; dMatrix[i][c] = aboveCutoff; } //like removeCell
342                                 else { distCol =  dMatrix[c][i]; dMatrix[c][i] = aboveCutoff; }
343                                 
344                                 if(method == "furthest"){
345                                         newDist = max(distRow, distCol);
346                                 }
347                                 else if (method == "average"){
348                                         int rowBin = rabund->get(r);
349                                         int colBin = rabund->get(c);
350                                         newDist = (colBin * distCol + rowBin * distRow) / (rowBin + colBin);
351                                 }
352                                 else if (method == "weighted"){
353                                         newDist = (distCol + distRow) / 2.0;
354                                 }
355                                 else if (method == "nearest"){
356                                         newDist = min(distRow, distCol);
357                                 }
358                                 //cout << "newDist = " << newDist << endl;      
359                                 if (i > r) { dMatrix[i][r] = newDist; }
360                                 else { dMatrix[r][i] = newDist; }
361                                 
362                         }
363                 }
364                         
365                 clusterBins();
366                 clusterNames();
367                 
368                 //resize each row
369                 /*for(int i=0;i<nseqs;i++){
370                         for(int j=c+1;j<dMatrix[i].size();j++){
371                                 dMatrix[i][j-1]=dMatrix[i][j];
372                         }
373                 }                       
374                 
375                 //resize each col
376                 for(int i=c+1;i<nseqs;i++){
377                         for(int j=0;j < dMatrix[i-1].size();j++){
378                                 dMatrix[i-1][j]=dMatrix[i][j];
379                         }
380                 }       
381                 
382                 nseqs--;
383                 dMatrix.pop_back();*/
384
385         }
386         catch(exception& e) {
387                 m->errorOut(e, "ClusterClassic", "update");
388                 exit(1);
389         }
390 }
391 /***********************************************************************/
392 void ClusterClassic::setMapWanted(bool f)  {  
393         try {
394                 mapWanted = f;
395                 
396                 //initialize map
397                 for (int i = 0; i < list->getNumBins(); i++) {
398                         
399                         //parse bin 
400                         string names = list->get(i);
401                         while (names.find_first_of(',') != -1) { 
402                                 //get name from bin
403                                 string name = names.substr(0,names.find_first_of(','));
404                                 //save name and bin number
405                                 seq2Bin[name] = i;
406                                 names = names.substr(names.find_first_of(',')+1, names.length());
407                         }
408                         
409                         //get last name
410                         seq2Bin[names] = i;
411                 }
412                 
413         }
414         catch(exception& e) {
415                 m->errorOut(e, "ClusterClassic", "setMapWanted");
416                 exit(1);
417         }
418 }
419 /***********************************************************************/
420 void ClusterClassic::updateMap() {
421 try {
422                 //update location of seqs in smallRow since they move to smallCol now
423                 string names = list->get(smallRow);
424                 while (names.find_first_of(',') != -1) { 
425                         //get name from bin
426                         string name = names.substr(0,names.find_first_of(','));
427                         //save name and bin number
428                         seq2Bin[name] = smallCol;
429                         names = names.substr(names.find_first_of(',')+1, names.length());
430                 }
431                         
432                 //get last name
433                 seq2Bin[names] = smallCol;
434                 
435         }
436         catch(exception& e) {
437                 m->errorOut(e, "ClusterClassic", "updateMap");
438                 exit(1);
439         }
440 }
441 /***********************************************************************/
442 void ClusterClassic::print() {
443 try {
444                 //update location of seqs in smallRow since they move to smallCol now
445                 for (int i = 0; i < dMatrix.size(); i++) {
446                         m->mothurOut("row = " + toString(i) + "\t");
447                         for (int j = 0; j < dMatrix[i].size(); j++) {
448                                 m->mothurOut(toString(dMatrix[i][j]) + "\t");
449                         }
450                         m->mothurOutEndLine();
451                 }
452         }
453         catch(exception& e) {
454                 m->errorOut(e, "ClusterClassic", "updateMap");
455                 exit(1);
456         }
457 }
458 /***********************************************************************/
459