]> git.donarmstrong.com Git - mothur.git/blob - cluster.cpp
some changes while testing 1.9
[mothur.git] / cluster.cpp
1 /*
2  *  cluster.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 8/14/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "cluster.hpp"
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13 #include "sparsematrix.hpp"
14
15 /***********************************************************************/
16
17 Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string f) :
18 rabund(rav), list(lv), dMatrix(dm), method(f)
19 {
20 /*
21         cout << "sizeof(MatData): " << sizeof(MatData) << endl;
22         cout << "sizeof(PCell*): " << sizeof(PCell*) << endl;
23
24         int nCells = dMatrix->getNNodes();
25         time_t start = time(NULL);
26
27         MatVec matvec = MatVec(nCells); 
28         int i = 0;
29         for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
30                 matvec[i++] = currentCell;
31         }
32         for (i= matvec.size();i>0;i--) {
33                 dMatrix->rmCell(matvec[i-1]);
34         }
35         MatData it = dMatrix->begin(); 
36         while (it != dMatrix->end()) { 
37                 it = dMatrix->rmCell(it);
38         }
39         cout << "Time to remove " << nCells << " cells: " << time(NULL) - start << " seconds" << endl;
40     exit(0);
41         MatData it = dMatrix->begin();
42         cout << it->row << "/" << it->column << "/" << it->dist << endl;
43         dMatrix->rmCell(dMatrix->begin());
44         cout << it->row << "/" << it->column << "/" << it->dist << endl;
45         exit(0);
46 */
47
48         // Create a data structure to quickly access the PCell information
49         // for a certain sequence. It consists of a vector of lists, where 
50         // a list contains pointers (iterators) to the all distances related
51         // to a certain sequence. The Vector is accessed via the index of a 
52         // sequence in the distance matrix.
53 //ofstream outtemp;
54 //string temp = "temp";
55 //openOutputFile(temp, outtemp);        
56 //cout << lv->size() << endl;
57         seqVec = vector<MatVec>(lv->size());
58         for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
59 //outtemp << currentCell->row << '\t' << currentCell->column  << '\t' << currentCell->dist << endl;
60                 seqVec[currentCell->row].push_back(currentCell);
61                 seqVec[currentCell->column].push_back(currentCell);
62         }
63 //outtemp.close();
64         mapWanted = false;  //set to true by mgcluster to speed up overlap merge
65         
66         //save so you can modify as it changes in average neighbor
67         cutoff = c;
68         m = MothurOut::getInstance();
69 }
70
71 /***********************************************************************/
72
73 void Cluster::getRowColCells() {
74         try {
75                 PCell* smallCell = dMatrix->getSmallestCell();  //find the smallest cell - this routine should probably not be in the SpMat class
76         
77                 smallRow = smallCell->row;              // get its row
78                 smallCol = smallCell->column;   // get its column
79                 smallDist = smallCell->dist;    // get the smallest distance
80         //cout << "small row = " << smallRow << "small col = " << smallCol << "small dist = " << smallDist << endl;
81          
82                 rowCells = seqVec[smallRow];    // all distances related to the row index
83                 colCells = seqVec[smallCol];    // all distances related to the column index
84                 nRowCells = rowCells.size();
85                 nColCells = colCells.size();
86 //cout << "num rows = " << nRowCells << "num col = " << nColCells << endl;
87                 
88                 //for (int i = 0; i < nColCells; i++) { cout << colCells[i]->row << '\t' << colCells[i]->column << endl;  }
89                 //for (int i = 0; i < nRowCells; i++) { cout << rowCells[i]->row << '\t' << rowCells[i]->column << endl;  }
90         }
91         catch(exception& e) {
92                 m->errorOut(e, "Cluster", "getRowColCells");
93                 exit(1);
94         }
95
96 }
97 /***********************************************************************/
98 // Remove the specified cell from the seqVec and from the sparse
99 // matrix
100 void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix){
101         try {
102         
103                 ull drow = cell->row;
104                         ull dcol = cell->column;
105                         if (((vrow >=0) && (drow != smallRow)) ||
106                                 ((vcol >=0) && (dcol != smallCol))) {
107                                 ull dtemp = drow;
108                                 drow = dcol;
109                                 dcol = dtemp;
110                         }
111
112                         ull crow;
113                         ull ccol;
114                         int nCells;
115                         if (vrow < 0) {
116                                 nCells = seqVec[drow].size();
117                                 for (vrow=0; vrow<nCells;vrow++) {
118                                         crow = seqVec[drow][vrow]->row;
119                                         ccol = seqVec[drow][vrow]->column;
120                                         if (((crow == drow) && (ccol == dcol)) ||
121                                                 ((ccol == drow) && (crow == dcol))) {
122                                                 break;
123                                         }
124                                 }
125                         }
126
127                         seqVec[drow].erase(seqVec[drow].begin()+vrow);
128                         if (vcol < 0) {
129                                 nCells = seqVec[dcol].size();
130                                 for (vcol=0; vcol<nCells;vcol++) {
131                                         crow = seqVec[dcol][vcol]->row;
132                                         ccol = seqVec[dcol][vcol]->column;
133                                         if (((crow == drow) && (ccol == dcol)) ||
134                                                 ((ccol == drow) && (crow == dcol))) {
135                                                 break;
136                                         }
137                                 }
138                         }
139                 
140                         seqVec[dcol].erase(seqVec[dcol].begin()+vcol);
141                 
142                         if (rmMatrix) {
143                         //cout << " removing = " << cell->row << '\t' << cell->column  << '\t' << cell->dist << endl;
144                                 dMatrix->rmCell(cell);
145                 //      cout << "done" << endl;
146                         }
147                 
148         }
149         catch(exception& e) {
150                 m->errorOut(e, "Cluster", "removeCell");
151                 exit(1);
152         }
153 }
154 /***********************************************************************/
155
156 void Cluster::clusterBins(){
157         try {
158         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
159
160                 rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol));     
161                 rabund->set(smallRow, 0);       
162                 rabund->setLabel(toString(smallDist));
163
164         //      cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
165         }
166         catch(exception& e) {
167                 m->errorOut(e, "Cluster", "clusterBins");
168                 exit(1);
169         }
170
171
172 }
173
174 /***********************************************************************/
175
176 void Cluster::clusterNames(){
177         try {
178         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
179                 if (mapWanted) {  updateMap();  }
180                 
181                 list->set(smallCol, list->get(smallRow)+','+list->get(smallCol));
182                 list->set(smallRow, "");        
183                 list->setLabel(toString(smallDist));
184         
185         //      cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
186     }
187         catch(exception& e) {
188                 m->errorOut(e, "Cluster", "clusterNames");
189                 exit(1);
190         }
191
192 }
193
194 /***********************************************************************/
195 //This function clusters based on the method of the derived class
196 //At the moment only average and complete linkage are covered, because
197 //single linkage uses a different approach.
198 void Cluster::update(double& cutOFF){
199         try {
200                 getRowColCells();       
201 //cout << "got rowcells" << endl;
202
203                 vector<int> foundCol(nColCells, 0);
204
205                 int search;
206                 bool changed;
207
208                 // The vector has to be traversed in reverse order to preserve the index
209                 // for faster removal in removeCell()
210                 for (int i=nRowCells-1;i>=0;i--) {
211                         if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) {
212                                 if (rowCells[i]->row == smallRow) {
213                                         search = rowCells[i]->column;
214                                 } else {
215                                         search = rowCells[i]->row;
216                                 }
217                                 
218                                 bool merged = false;
219                                 for (int j=0;j<nColCells;j++) {
220                                         if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) { //if you are not hte smallest distance
221                                                 if (colCells[j]->row == search || colCells[j]->column == search) {
222                                                         foundCol[j] = 1;
223                                                         merged = true;
224                                                         changed = updateDistance(colCells[j], rowCells[i]);
225                                                         // If the cell's distance changed and it had the same distance as 
226                                                         // the smallest distance, invalidate the mins vector in SparseMatrix
227                                                         if (changed) {
228                                                                 if (colCells[j]->vectorMap != NULL) {
229                                                                         *(colCells[j]->vectorMap) = NULL;
230                                                                         colCells[j]->vectorMap = NULL;
231                                                                 }
232                                                         }
233                                                         break;
234                                                 }
235                                         }               
236                                 }
237                                 //if not merged it you need it for warning 
238                                 if ((!merged) && (method == "average")) {  
239                                         //m->mothurOut("Warning: trying to merge cell " + toString(rowCells[i]->row+1) + " " + toString(rowCells[i]->column+1) + " distance " + toString(rowCells[i]->dist) + " with value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); m->mothurOutEndLine(); 
240                                         if (cutOFF > rowCells[i]->dist) {  
241                                                 cutOFF = rowCells[i]->dist;  
242                                                 //m->mothurOut("changing cutoff to " + toString(cutOFF));  m->mothurOutEndLine(); 
243                                         }
244
245                                 }
246                                 removeCell(rowCells[i], i , -1);  
247                                 
248                         }
249                 }
250                 clusterBins();
251                 clusterNames();
252
253                 // Special handling for singlelinkage case, not sure whether this
254                 // could be avoided
255                 for (int i=nColCells-1;i>=0;i--) {
256                         if (foundCol[i] == 0) {
257                                 if (method == "average") {
258                                         if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) {
259                                                 //m->mothurOut("Warning: merging cell " + toString(colCells[i]->row+1) + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); m->mothurOutEndLine();
260                                                 if (cutOFF > colCells[i]->dist) {  
261                                                         cutOFF = colCells[i]->dist;  
262                                                         //m->mothurOut("changing cutoff to " + toString(cutOFF));  m->mothurOutEndLine(); 
263                                                 }
264                                         }
265                                 }
266                                 removeCell(colCells[i], -1, i);
267                         }
268                 }
269         }
270         catch(exception& e) {
271                 m->errorOut(e, "Cluster", "update");
272                 exit(1);
273         }
274 }
275 /***********************************************************************/
276 void Cluster::setMapWanted(bool f)  {  
277         try {
278                 mapWanted = f;
279                 
280                 //initialize map
281                 for (int i = 0; i < list->getNumBins(); i++) {
282                         
283                         //parse bin 
284                         string names = list->get(i);
285                         while (names.find_first_of(',') != -1) { 
286                                 //get name from bin
287                                 string name = names.substr(0,names.find_first_of(','));
288                                 //save name and bin number
289                                 seq2Bin[name] = i;
290                                 names = names.substr(names.find_first_of(',')+1, names.length());
291                         }
292                         
293                         //get last name
294                         seq2Bin[names] = i;
295                 }
296                 
297         }
298         catch(exception& e) {
299                 m->errorOut(e, "Cluster", "setMapWanted");
300                 exit(1);
301         }
302 }
303 /***********************************************************************/
304 void Cluster::updateMap() {
305 try {
306                 //update location of seqs in smallRow since they move to smallCol now
307                 string names = list->get(smallRow);
308                 while (names.find_first_of(',') != -1) { 
309                         //get name from bin
310                         string name = names.substr(0,names.find_first_of(','));
311                         //save name and bin number
312                         seq2Bin[name] = smallCol;
313                         names = names.substr(names.find_first_of(',')+1, names.length());
314                 }
315                         
316                 //get last name
317                 seq2Bin[names] = smallCol;
318                 
319         }
320         catch(exception& e) {
321                 m->errorOut(e, "Cluster", "updateMap");
322                 exit(1);
323         }
324 }
325 /***********************************************************************/
326
327
328