]> git.donarmstrong.com Git - mothur.git/blob - cluster.cpp
Merge remote-tracking branch 'origin/master'
[mothur.git] / cluster.cpp
1 /*
2  *  cluster.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 8/14/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "cluster.hpp"
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13 #include "sparsematrix.hpp"
14
15 /***********************************************************************/
16
17 Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string f) :
18 rabund(rav), list(lv), dMatrix(dm), method(f)
19 {
20         try {
21 /*
22         cout << "sizeof(MatData): " << sizeof(MatData) << endl;
23         cout << "sizeof(PCell*): " << sizeof(PCell*) << endl;
24
25         int nCells = dMatrix->getNNodes();
26         time_t start = time(NULL);
27
28         MatVec matvec = MatVec(nCells); 
29         int i = 0;
30         for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
31                 matvec[i++] = currentCell;
32         }
33         for (i= matvec.size();i>0;i--) {
34                 dMatrix->rmCell(matvec[i-1]);
35         }
36         MatData it = dMatrix->begin(); 
37         while (it != dMatrix->end()) { 
38                 it = dMatrix->rmCell(it);
39         }
40         cout << "Time to remove " << nCells << " cells: " << time(NULL) - start << " seconds" << endl;
41     exit(0);
42         MatData it = dMatrix->begin();
43         cout << it->row << "/" << it->column << "/" << it->dist << endl;
44         dMatrix->rmCell(dMatrix->begin());
45         cout << it->row << "/" << it->column << "/" << it->dist << endl;
46         exit(0);
47 */
48
49         // Create a data structure to quickly access the PCell information
50         // for a certain sequence. It consists of a vector of lists, where 
51         // a list contains pointers (iterators) to the all distances related
52         // to a certain sequence. The Vector is accessed via the index of a 
53         // sequence in the distance matrix.
54 //ofstream outtemp;
55 //string temp = "temp";
56 //m->openOutputFile(temp, outtemp);     
57 //cout << lv->size() << endl;
58         seqVec = vector<MatVec>(lv->size());
59         for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
60 //outtemp << currentCell->row << '\t' << currentCell->column  << '\t' << currentCell->dist << endl;
61                 seqVec[currentCell->row].push_back(currentCell);
62                 seqVec[currentCell->column].push_back(currentCell);
63         }
64 //outtemp.close();
65         mapWanted = false;  //set to true by mgcluster to speed up overlap merge
66         
67         //save so you can modify as it changes in average neighbor
68         cutoff = c;
69         m = MothurOut::getInstance();
70         
71         }
72         catch(exception& e) {
73                 m->errorOut(e, "Cluster", "Cluster");
74                 exit(1);
75         }
76 }
77
78 /***********************************************************************/
79
80 void Cluster::getRowColCells() {
81         try {
82                 PCell* smallCell = dMatrix->getSmallestCell();  //find the smallest cell - this routine should probably not be in the SpMat class
83         
84                 smallRow = smallCell->row;              // get its row
85                 smallCol = smallCell->column;   // get its column
86                 smallDist = smallCell->dist;    // get the smallest distance
87         //cout << "small row = " << smallRow << "small col = " << smallCol << "small dist = " << smallDist << endl;
88          
89                 rowCells = seqVec[smallRow];    // all distances related to the row index
90                 colCells = seqVec[smallCol];    // all distances related to the column index
91                 nRowCells = rowCells.size();
92                 nColCells = colCells.size();
93 //cout << "num rows = " << nRowCells << "num col = " << nColCells << endl;
94                 
95                 //for (int i = 0; i < nColCells; i++) { cout << colCells[i]->row << '\t' << colCells[i]->column << endl;  }
96                 //for (int i = 0; i < nRowCells; i++) { cout << rowCells[i]->row << '\t' << rowCells[i]->column << endl;  }
97         }
98         catch(exception& e) {
99                 m->errorOut(e, "Cluster", "getRowColCells");
100                 exit(1);
101         }
102
103 }
104 /***********************************************************************/
105 // Remove the specified cell from the seqVec and from the sparse
106 // matrix
107 void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix){
108         try {
109         
110                 ull drow = cell->row;
111                         ull dcol = cell->column;
112                         if (((vrow >=0) && (drow != smallRow)) ||
113                                 ((vcol >=0) && (dcol != smallCol))) {
114                                 ull dtemp = drow;
115                                 drow = dcol;
116                                 dcol = dtemp;
117                         }
118
119                         ull crow;
120                         ull ccol;
121                         int nCells;
122                         if (vrow < 0) {
123                                 nCells = seqVec[drow].size();
124                                 for (vrow=0; vrow<nCells;vrow++) {
125                                         crow = seqVec[drow][vrow]->row;
126                                         ccol = seqVec[drow][vrow]->column;
127                                         if (((crow == drow) && (ccol == dcol)) ||
128                                                 ((ccol == drow) && (crow == dcol))) {
129                                                 break;
130                                         }
131                                 }
132                         }
133
134                         seqVec[drow].erase(seqVec[drow].begin()+vrow);
135                         if (vcol < 0) {
136                                 nCells = seqVec[dcol].size();
137                                 for (vcol=0; vcol<nCells;vcol++) {
138                                         crow = seqVec[dcol][vcol]->row;
139                                         ccol = seqVec[dcol][vcol]->column;
140                                         if (((crow == drow) && (ccol == dcol)) ||
141                                                 ((ccol == drow) && (crow == dcol))) {
142                                                 break;
143                                         }
144                                 }
145                         }
146                 
147                         seqVec[dcol].erase(seqVec[dcol].begin()+vcol);
148                 
149                         if (rmMatrix) {
150                         //cout << " removing = " << cell->row << '\t' << cell->column  << '\t' << cell->dist << endl;
151                                 dMatrix->rmCell(cell);
152                 //      cout << "done" << endl;
153                         }
154                 
155         }
156         catch(exception& e) {
157                 m->errorOut(e, "Cluster", "removeCell");
158                 exit(1);
159         }
160 }
161 /***********************************************************************/
162
163 void Cluster::clusterBins(){
164         try {
165         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
166
167                 rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol));     
168                 rabund->set(smallRow, 0);       
169                 rabund->setLabel(toString(smallDist));
170
171         //      cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
172         }
173         catch(exception& e) {
174                 m->errorOut(e, "Cluster", "clusterBins");
175                 exit(1);
176         }
177
178
179 }
180
181 /***********************************************************************/
182
183 void Cluster::clusterNames(){
184         try {
185         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
186                 if (mapWanted) {  updateMap();  }
187                 
188                 list->set(smallCol, list->get(smallRow)+','+list->get(smallCol));
189                 list->set(smallRow, "");        
190                 list->setLabel(toString(smallDist));
191         
192         //      cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
193     }
194         catch(exception& e) {
195                 m->errorOut(e, "Cluster", "clusterNames");
196                 exit(1);
197         }
198
199 }
200
201 /***********************************************************************/
202 //This function clusters based on the method of the derived class
203 //At the moment only average and complete linkage are covered, because
204 //single linkage uses a different approach.
205 void Cluster::update(double& cutOFF){
206         try {
207                 getRowColCells();       
208
209                 vector<int> foundCol(nColCells, 0);
210
211                 int search;
212                 bool changed;
213
214                 // The vector has to be traversed in reverse order to preserve the index
215                 // for faster removal in removeCell()
216                 for (int i=nRowCells-1;i>=0;i--) {
217                         //if you are not the smallCell
218                         if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) {
219                                 if (rowCells[i]->row == smallRow) {
220                                         search = rowCells[i]->column;
221                                 } else {
222                                         search = rowCells[i]->row;
223                                 }
224                                 
225                                 bool merged = false;
226                                 for (int j=0;j<nColCells;j++) {
227                                         if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) { //if you are not hte smallest distance
228                                                 if (colCells[j]->row == search || colCells[j]->column == search) {
229                                                         foundCol[j] = 1;
230                                                         merged = true;
231                                                         changed = updateDistance(colCells[j], rowCells[i]);
232                                                         // If the cell's distance changed and it had the same distance as 
233                                                         // the smallest distance, invalidate the mins vector in SparseMatrix
234                                                         if (changed) {
235                                                                 if (colCells[j]->vectorMap != NULL) {
236                                                                         *(colCells[j]->vectorMap) = NULL;
237                                                                         colCells[j]->vectorMap = NULL;
238                                                                 }
239                                                         }
240                                                         break;
241                                                 }
242                                         }               
243                                 }
244                                 //if not merged it you need it for warning 
245                                 if ((!merged) && (method == "average" || method == "weighted")) {  
246                                         //m->mothurOut("Warning: trying to merge cell " + toString(rowCells[i]->row+1) + " " + toString(rowCells[i]->column+1) + " distance " + toString(rowCells[i]->dist) + " with value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); m->mothurOutEndLine(); 
247                                         if (cutOFF > rowCells[i]->dist) {  
248                                                 cutOFF = rowCells[i]->dist;  
249                                                 //m->mothurOut("changing cutoff to " + toString(cutOFF));  m->mothurOutEndLine(); 
250                                         }
251
252                                 }
253                                 removeCell(rowCells[i], i , -1);  
254                                 
255                         }
256                 }
257                 clusterBins();
258                 clusterNames();
259
260                 // Special handling for singlelinkage case, not sure whether this
261                 // could be avoided
262                 for (int i=nColCells-1;i>=0;i--) {
263                         if (foundCol[i] == 0) {
264                                 if (method == "average" || method == "weighted") {
265                                         if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) {
266                                                 //m->mothurOut("Warning: merging cell " + toString(colCells[i]->row+1) + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); m->mothurOutEndLine();
267                                                 if (cutOFF > colCells[i]->dist) {  
268                                                         cutOFF = colCells[i]->dist;  
269                                                         //m->mothurOut("changing cutoff to " + toString(cutOFF));  m->mothurOutEndLine(); 
270                                                 }
271                                         }
272                                 }
273                                 removeCell(colCells[i], -1, i);
274                         }
275                 }
276         }
277         catch(exception& e) {
278                 m->errorOut(e, "Cluster", "update");
279                 exit(1);
280         }
281 }
282 /***********************************************************************/
283 void Cluster::setMapWanted(bool f)  {  
284         try {
285                 mapWanted = f;
286                 
287                 //initialize map
288                 for (int i = 0; i < list->getNumBins(); i++) {
289                         
290                         //parse bin 
291                         string names = list->get(i);
292                         while (names.find_first_of(',') != -1) { 
293                                 //get name from bin
294                                 string name = names.substr(0,names.find_first_of(','));
295                                 //save name and bin number
296                                 seq2Bin[name] = i;
297                                 names = names.substr(names.find_first_of(',')+1, names.length());
298                         }
299                         
300                         //get last name
301                         seq2Bin[names] = i;
302                 }
303                 
304         }
305         catch(exception& e) {
306                 m->errorOut(e, "Cluster", "setMapWanted");
307                 exit(1);
308         }
309 }
310 /***********************************************************************/
311 void Cluster::updateMap() {
312 try {
313                 //update location of seqs in smallRow since they move to smallCol now
314                 string names = list->get(smallRow);
315                 while (names.find_first_of(',') != -1) { 
316                         //get name from bin
317                         string name = names.substr(0,names.find_first_of(','));
318                         //save name and bin number
319                         seq2Bin[name] = smallCol;
320                         names = names.substr(names.find_first_of(',')+1, names.length());
321                 }
322                         
323                 //get last name
324                 seq2Bin[names] = smallCol;
325                 
326         }
327         catch(exception& e) {
328                 m->errorOut(e, "Cluster", "updateMap");
329                 exit(1);
330         }
331 }
332 /***********************************************************************/
333
334
335