]> git.donarmstrong.com Git - mothur.git/blob - clusterclassic.cpp
added classic parameter to cluster.split. working on make.contigs command. fixed...
[mothur.git] / clusterclassic.cpp
1 /*
2  *  clusterclassic.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/29/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clusterclassic.h"
11 #include "progress.hpp"
12
13 /***********************************************************************/
14 ClusterClassic::ClusterClassic(float c, string f, bool s) : method(f), smallDist(1e6), nseqs(0), sim(s) {
15         try {
16                 mapWanted = false;  //set to true by mgcluster to speed up overlap merge
17         
18                 //save so you can modify as it changes in average neighbor
19                 cutoff = c;
20                 aboveCutoff = cutoff + 10000.0;
21                 m = MothurOut::getInstance();
22         if(method == "furthest")        {   tag = "fn";   }
23         else if (method == "average")   {   tag = "an";   }
24         else if (method == "weighted")  {   tag = "wn";   }        
25         else if (method == "nearest")   {   tag = "nn";   }
26         }
27         catch(exception& e) {
28                 m->errorOut(e, "ClusterClassic", "ClusterClassic");
29                 exit(1);
30         }
31 }
32 /***********************************************************************/
33 int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
34         try {
35                 double distance;
36                 int square;
37                 string name;
38                 vector<string> matrixNames;
39                 
40                 ifstream fileHandle;
41                 m->openInputFile(filename, fileHandle);
42                 
43                 fileHandle >> nseqs >> name;
44
45                 matrixNames.push_back(name);
46
47                 if(nameMap == NULL){
48                                 list = new ListVector(nseqs);
49                                 list->set(0, name);
50                 }
51                 else{
52                                 list = new ListVector(nameMap->getListVector());
53                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
54                 }
55                 
56                 //initialize distance matrix to cutoff
57                 dMatrix.resize(nseqs);
58                 //colDist temp(0, 0, aboveCutoff);
59                 //rowSmallDists.resize(nseqs, temp);
60                 for (int i = 1; i < nseqs; i++) {                       
61                         dMatrix[i].resize(i, aboveCutoff);              
62                 }                                                                                               
63                                                                                                                 
64                 
65                 char d;
66                 while((d=fileHandle.get()) != EOF){
67
68                                 if(isalnum(d)){
69                                                 square = 1;
70                                                 fileHandle.putback(d);
71                                                 for(int i=0;i<nseqs;i++){
72                                                                 fileHandle >> distance;
73                                                 }
74                                                 break;
75                                 }
76                                 if(d == '\n'){
77                                                 square = 0;
78                                                 break;
79                                 }
80                 }
81
82                 Progress* reading;
83
84                 if(square == 0){
85
86                                 reading = new Progress("Reading matrix:     ", nseqs * (nseqs - 1) / 2);
87
88                                 int        index = 0;
89
90                                 for(int i=1;i<nseqs;i++){
91                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
92                                                 
93                                                 fileHandle >> name;
94                                                 matrixNames.push_back(name);
95                 
96
97                                                 //there's A LOT of repeated code throughout this method...
98                                                 if(nameMap == NULL){
99                                                                 list->set(i, name);
100                                                 
101                                                                 for(int j=0;j<i;j++){
102                                                                 
103                                                                                 if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
104                                                                                 
105                                                                                 fileHandle >> distance;
106                                                         
107                                                                                 if (distance == -1) { distance = 1000000; }
108                                                                                 else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
109                                                                 
110                                                                                 //if(distance < cutoff){
111                                                                                         dMatrix[i][j] = distance;
112                                                                                         if (distance < smallDist) { smallDist = distance; }
113                                                                                         //if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
114                                                                                         //if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
115                                                                                 //}
116                                                                                 index++;
117                                                                                 reading->update(index);
118                                                                 }
119                                 
120                                                 }
121                                                 else{
122                                                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
123                                 
124                                                                 for(int j=0;j<i;j++){
125                                                                                 fileHandle >> distance;
126                                                                                 
127                                                                                 if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
128                                 
129                                                                                 if (distance == -1) { distance = 1000000; }
130                                                                                 else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
131                                                                                 
132                                                                                 //if(distance < cutoff){
133                                                                                         if (distance < smallDist) { smallDist = distance; }
134                                                                                         
135                                                                                         int row = nameMap->get(matrixNames[i]);
136                                                                                         int col = nameMap->get(matrixNames[j]);
137                                                                                         
138                                                                                         if (row < col) {  dMatrix[col][row] = distance; }
139                                                                                         else { dMatrix[row][col] = distance; }
140                                                                                         
141                                                                                         //if (rowSmallDists[row].dist > distance) {  rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
142                                                                                         //if (rowSmallDists[col].dist > distance) {  rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
143                                                                                 //}
144                                                                                 index++;
145                                                                                 reading->update(index);
146                                                                 }
147                                                 }
148                                 }
149                 }
150                 else{
151
152                                 reading = new Progress("Reading matrix:     ", nseqs * nseqs);
153                 
154                                 int index = nseqs;
155
156                                 for(int i=1;i<nseqs;i++){
157                                                 fileHandle >> name;                
158                                                 matrixNames.push_back(name);
159                                                 
160                                                 if(nameMap == NULL){
161                                                                 list->set(i, name);
162                                                                 for(int j=0;j<nseqs;j++){
163                                                                                 fileHandle >> distance;
164                                                                                 
165                                                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
166                                                                                 
167                                                                                 if (distance == -1) { distance = 1000000; }
168                                                                                 else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
169                                                                                 
170                                                                                 if(j < i){
171                                                                                         if (distance < smallDist) { smallDist = distance; }
172                                                                                         
173                                                                                         dMatrix[i][j] = distance;
174                                                                                         //if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
175                                                                                         //if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
176                                                                                 }
177                                                                                 index++;
178                                                                                 reading->update(index);
179                                                                 }
180                                                 
181                                                 }
182                                                 else{
183                                                                 if(nameMap->count(name)==0){        m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
184                                 
185                                                                 for(int j=0;j<nseqs;j++){
186                                                                                 fileHandle >> distance;
187                                                                                 
188                                                                                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
189                                                                                 
190                                                                            if (distance == -1) { distance = 1000000; }
191                                                                                 else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.                                                        
192                                                                                 
193                                                                                 if(j < i){
194                                                                                         if (distance < smallDist) { smallDist = distance; }
195                                                                                         
196                                                                                         int row = nameMap->get(matrixNames[i]);
197                                                                                         int col = nameMap->get(matrixNames[j]);
198                                                                                         
199                                                                                         if (row < col) {  dMatrix[col][row] = distance; }
200                                                                                         else { dMatrix[row][col] = distance; }
201                                                                                         
202                                                                                         //if (rowSmallDists[row].dist > distance) {  rowSmallDists[row].dist = distance; rowSmallDists[row].col = col; rowSmallDists[row].row = row; }
203                                                                                         //if (rowSmallDists[col].dist > distance) {  rowSmallDists[col].dist = distance; rowSmallDists[col].col = row; rowSmallDists[col].row = col; }
204                                                                                 }
205                                                                                 index++;
206                                                                                 reading->update(index);
207                                                                 }
208                                                 }
209                                 }
210                 }
211                 
212                 if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
213                 
214                 reading->finish();
215                 delete reading;
216
217                 list->setLabel("0");
218                 rabund = new RAbundVector(list->getRAbundVector());
219                 
220                 fileHandle.close();
221                 
222                 return 0;
223         }
224         catch(exception& e) {
225                 m->errorOut(e, "ClusterClassic", "readPhylipFile");
226                 exit(1);
227         }
228
229 }
230 /***********************************************************************/
231 //sets smallCol and smallRow, returns distance
232 double ClusterClassic::getSmallCell() {
233         try {
234                         
235                 smallDist = aboveCutoff;
236                 smallRow = 1;
237                 smallCol = 0;
238                 
239                 vector<colDist> mins;
240                 
241                 for(int i=1;i<nseqs;i++){
242                         for(int j=0;j<i;j++){ 
243                                 if (dMatrix[i][j] < smallDist) {
244                                         mins.clear();
245                                         colDist temp(i, j, dMatrix[i][j]);
246                                         mins.push_back(temp); 
247                                         smallDist = dMatrix[i][j];
248                                 }else if (dMatrix[i][j] == smallDist) {
249                                         colDist temp(i, j, dMatrix[i][j]);
250                                         mins.push_back(temp); 
251
252                                 }
253                         }
254                 }
255                 
256                 if (mins.size() > 0) {
257                         int zrand = 0;
258                         if (mins.size() > 1) {
259                                 //pick random number between 0 and mins.size()
260                                 zrand = (int)((float)(rand()) / (RAND_MAX / (mins.size()-1) + 1));
261                         }
262                         
263                         smallRow = mins[zrand].row;
264                         smallCol = mins[zrand].col;
265                 
266                 }
267         //cout << smallRow << '\t' << smallCol << '\t' << smallDist << endl;
268                 //eliminate smallCell
269                 if (smallRow < smallCol) { dMatrix[smallCol][smallRow] = aboveCutoff; }
270                 else { dMatrix[smallRow][smallCol] = aboveCutoff; }
271                 
272                 return smallDist;
273                 
274         }
275         catch(exception& e) {
276                 m->errorOut(e, "ClusterClassic", "getSmallCell");
277                 exit(1);
278         }
279 }
280 /***********************************************************************/
281 void ClusterClassic::clusterBins(){
282         try {
283         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
284
285                 rabund->set(smallRow, rabund->get(smallRow)+rabund->get(smallCol));     
286                 rabund->set(smallCol, 0);       
287                 /*for (int i = smallCol+1; i < rabund->size(); i++) {
288                         rabund->set((i-1), rabund->get(i));
289                 }
290                 rabund->resize((rabund->size()-1));*/
291                 rabund->setLabel(toString(smallDist));
292
293         //      cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
294         }
295         catch(exception& e) {
296                 m->errorOut(e, "ClusterClassic", "clusterBins");
297                 exit(1);
298         }
299 }
300 /***********************************************************************/
301 void ClusterClassic::clusterNames(){
302         try {
303         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
304                 if (mapWanted) {  updateMap();  }
305                 
306                 list->set(smallRow, list->get(smallRow)+','+list->get(smallCol));
307                 list->set(smallCol, "");        
308                 /*for (int i = smallCol+1; i < list->size(); i++) {
309                         list->set((i-1), list->get(i));
310                 }
311                 list->resize((list->size()-1));*/
312                 list->setLabel(toString(smallDist));
313         
314         //      cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
315     }
316         catch(exception& e) {
317                 m->errorOut(e, "ClusterClassic", "clusterNames");
318                 exit(1);
319         }
320 }
321 /***********************************************************************/
322 void ClusterClassic::update(double& cutOFF){
323         try {
324 //print();              
325                 getSmallCell();
326                 
327                 int r, c;
328                 r = smallRow; c = smallCol;
329                                 
330                 for(int i=0;i<nseqs;i++){
331                         if(i != r && i != c){
332                                 double distRow, distCol, newDist;
333                                 if (i > r) { distRow = dMatrix[i][r]; }
334                                 else { distRow =  dMatrix[r][i]; }
335
336                                 if (i > c) { distCol = dMatrix[i][c]; dMatrix[i][c] = aboveCutoff; } //like removeCell
337                                 else { distCol =  dMatrix[c][i]; dMatrix[c][i] = aboveCutoff; }
338                                 
339                                 if(method == "furthest"){
340                                         newDist = max(distRow, distCol);
341                                 }
342                                 else if (method == "average"){
343                                         int rowBin = rabund->get(r);
344                                         int colBin = rabund->get(c);
345                                         newDist = (colBin * distCol + rowBin * distRow) / (rowBin + colBin);
346                                 }
347                                 else if (method == "weighted"){
348                                         newDist = (distCol + distRow) / 2.0;
349                                 }
350                                 else if (method == "nearest"){
351                                         newDist = min(distRow, distCol);
352                                 }
353                                 //cout << "newDist = " << newDist << endl;      
354                                 if (i > r) { dMatrix[i][r] = newDist; }
355                                 else { dMatrix[r][i] = newDist; }
356                                 
357                         }
358                 }
359                         
360                 clusterBins();
361                 clusterNames();
362                 
363                 //resize each row
364                 /*for(int i=0;i<nseqs;i++){
365                         for(int j=c+1;j<dMatrix[i].size();j++){
366                                 dMatrix[i][j-1]=dMatrix[i][j];
367                         }
368                 }                       
369                 
370                 //resize each col
371                 for(int i=c+1;i<nseqs;i++){
372                         for(int j=0;j < dMatrix[i-1].size();j++){
373                                 dMatrix[i-1][j]=dMatrix[i][j];
374                         }
375                 }       
376                 
377                 nseqs--;
378                 dMatrix.pop_back();*/
379
380         }
381         catch(exception& e) {
382                 m->errorOut(e, "ClusterClassic", "update");
383                 exit(1);
384         }
385 }
386 /***********************************************************************/
387 void ClusterClassic::setMapWanted(bool f)  {  
388         try {
389                 mapWanted = f;
390                 
391                 //initialize map
392                 for (int i = 0; i < list->getNumBins(); i++) {
393                         
394                         //parse bin 
395                         string names = list->get(i);
396                         while (names.find_first_of(',') != -1) { 
397                                 //get name from bin
398                                 string name = names.substr(0,names.find_first_of(','));
399                                 //save name and bin number
400                                 seq2Bin[name] = i;
401                                 names = names.substr(names.find_first_of(',')+1, names.length());
402                         }
403                         
404                         //get last name
405                         seq2Bin[names] = i;
406                 }
407                 
408         }
409         catch(exception& e) {
410                 m->errorOut(e, "ClusterClassic", "setMapWanted");
411                 exit(1);
412         }
413 }
414 /***********************************************************************/
415 void ClusterClassic::updateMap() {
416 try {
417                 //update location of seqs in smallRow since they move to smallCol now
418                 string names = list->get(smallRow);
419                 while (names.find_first_of(',') != -1) { 
420                         //get name from bin
421                         string name = names.substr(0,names.find_first_of(','));
422                         //save name and bin number
423                         seq2Bin[name] = smallCol;
424                         names = names.substr(names.find_first_of(',')+1, names.length());
425                 }
426                         
427                 //get last name
428                 seq2Bin[names] = smallCol;
429                 
430         }
431         catch(exception& e) {
432                 m->errorOut(e, "ClusterClassic", "updateMap");
433                 exit(1);
434         }
435 }
436 /***********************************************************************/
437 void ClusterClassic::print() {
438 try {
439                 //update location of seqs in smallRow since they move to smallCol now
440                 for (int i = 0; i < dMatrix.size(); i++) {
441                         m->mothurOut("row = " + toString(i) + "\t");
442                         for (int j = 0; j < dMatrix[i].size(); j++) {
443                                 m->mothurOut(toString(dMatrix[i][j]) + "\t");
444                         }
445                         m->mothurOutEndLine();
446                 }
447         }
448         catch(exception& e) {
449                 m->errorOut(e, "ClusterClassic", "updateMap");
450                 exit(1);
451         }
452 }
453 /***********************************************************************/
454