]> git.donarmstrong.com Git - mothur.git/blob - cluster.cpp
added warning about average neighbor merges around cutoff. fixed little bugs.
[mothur.git] / cluster.cpp
1 /*
2  *  cluster.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 8/14/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "cluster.hpp"
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13 #include "sparsematrix.hpp"
14
15 /***********************************************************************/
16
17 Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c) :
18 rabund(rav), list(lv), dMatrix(dm), cutoff(c)
19 {
20 /*
21         cout << "sizeof(MatData): " << sizeof(MatData) << endl;
22         cout << "sizeof(PCell*): " << sizeof(PCell*) << endl;
23
24         int nCells = dMatrix->getNNodes();
25         time_t start = time(NULL);
26
27         MatVec matvec = MatVec(nCells); 
28         int i = 0;
29         for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
30                 matvec[i++] = currentCell;
31         }
32         for (i= matvec.size();i>0;i--) {
33                 dMatrix->rmCell(matvec[i-1]);
34         }
35         MatData it = dMatrix->begin(); 
36         while (it != dMatrix->end()) { 
37                 it = dMatrix->rmCell(it);
38         }
39         cout << "Time to remove " << nCells << " cells: " << time(NULL) - start << " seconds" << endl;
40     exit(0);
41         MatData it = dMatrix->begin();
42         cout << it->row << "/" << it->column << "/" << it->dist << endl;
43         dMatrix->rmCell(dMatrix->begin());
44         cout << it->row << "/" << it->column << "/" << it->dist << endl;
45         exit(0);
46 */
47
48         // Create a data structure to quickly access the PCell information
49         // for a certain sequence. It consists of a vector of lists, where 
50         // a list contains pointers (iterators) to the all distances related
51         // to a certain sequence. The Vector is accessed via the index of a 
52         // sequence in the distance matrix.
53         seqVec = vector<MatVec>(lv->size());
54         for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
55                 seqVec[currentCell->row].push_back(currentCell);
56                 seqVec[currentCell->column].push_back(currentCell);
57         }
58         mapWanted = false;  //set to true by mgcluster to speed up overlap merge
59 }
60
61 /***********************************************************************/
62
63 void Cluster::getRowColCells() {
64         try {
65                 PCell* smallCell = dMatrix->getSmallestCell();  //find the smallest cell - this routine should probably not be in the SpMat class
66         
67                 smallRow = smallCell->row;              // get its row
68                 smallCol = smallCell->column;   // get its column
69                 smallDist = smallCell->dist;    // get the smallest distance
70         
71                 rowCells = seqVec[smallRow];    // all distances related to the row index
72                 colCells = seqVec[smallCol];    // all distances related to the column index
73                 nRowCells = rowCells.size();
74                 nColCells = colCells.size();
75         }
76         catch(exception& e) {
77                 errorOut(e, "Cluster", "getRowColCells");
78                 exit(1);
79         }
80
81 }
82 /***********************************************************************/
83 // Remove the specified cell from the seqVec and from the sparse
84 // matrix
85 void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix)
86 {
87         ull drow = cell->row;
88         ull dcol = cell->column;
89         if (((vrow >=0) && (drow != smallRow)) ||
90                 ((vcol >=0) && (dcol != smallCol))) {
91                 ull dtemp = drow;
92                 drow = dcol;
93                 dcol = dtemp;
94         }
95
96         ull crow;
97         ull ccol;
98         int nCells;
99         if (vrow < 0) {
100                 nCells = seqVec[drow].size();
101                 for (vrow=0; vrow<nCells;vrow++) {
102                         crow = seqVec[drow][vrow]->row;
103                         ccol = seqVec[drow][vrow]->column;
104                         if (((crow == drow) && (ccol == dcol)) ||
105                                 ((ccol == drow) && (crow == dcol))) {
106                                 break;
107                         }
108                 }
109         }
110         seqVec[drow].erase(seqVec[drow].begin()+vrow);
111         if (vcol < 0) {
112                 nCells = seqVec[dcol].size();
113                 for (vcol=0; vcol<nCells;vcol++) {
114                         crow = seqVec[dcol][vcol]->row;
115                         ccol = seqVec[dcol][vcol]->column;
116                         if (((crow == drow) && (ccol == dcol)) ||
117                                 ((ccol == drow) && (crow == dcol))) {
118                                 break;
119                         }
120                 }
121         }
122         seqVec[dcol].erase(seqVec[dcol].begin()+vcol);
123         if (rmMatrix) {
124                 dMatrix->rmCell(cell);
125         }
126 }
127
128
129 /***********************************************************************/
130
131 void Cluster::clusterBins(){
132         try {
133         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
134
135                 rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol));     
136                 rabund->set(smallRow, 0);       
137                 rabund->setLabel(toString(smallDist));
138
139         //      cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
140         }
141         catch(exception& e) {
142                 errorOut(e, "Cluster", "clusterBins");
143                 exit(1);
144         }
145
146
147 }
148
149 /***********************************************************************/
150
151 void Cluster::clusterNames(){
152         try {
153         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
154                 if (mapWanted) {  updateMap();  }
155                 
156                 list->set(smallCol, list->get(smallRow)+','+list->get(smallCol));
157                 list->set(smallRow, "");        
158                 list->setLabel(toString(smallDist));
159         
160         //      cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
161     }
162         catch(exception& e) {
163                 errorOut(e, "Cluster", "clusterNames");
164                 exit(1);
165         }
166
167 }
168
169 /***********************************************************************/
170 //This function clusters based on the method of the derived class
171 //At the moment only average and complete linkage are covered, because
172 //single linkage uses a different approach.
173 void Cluster::update(){
174         try {
175                 getRowColCells();       
176         
177                 vector<int> found(nColCells, 0);
178                 int search;
179                 bool changed;
180
181                 // The vector has to be traversed in reverse order to preserve the index
182                 // for faster removal in removeCell()
183                 for (int i=nRowCells-1;i>=0;i--) {
184                         if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) {
185                                 if (rowCells[i]->row == smallRow) {
186                                         search = rowCells[i]->column;
187                                 } else {
188                                         search = rowCells[i]->row;
189                                 }
190                 
191                                 for (int j=0;j<nColCells;j++) {
192                                         if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) {
193                                                 if (colCells[j]->row == search || colCells[j]->column == search) {
194                                                         found[j] = 1;
195                                                         changed = updateDistance(colCells[j], rowCells[i]);
196                                                         // If the cell's distance changed and it had the same distance as 
197                                                         // the smallest distance, invalidate the mins vector in SparseMatrix
198                                                         if (changed) {
199                                                                 if (colCells[j]->vectorMap != NULL) {
200                                                                         *(colCells[j]->vectorMap) = NULL;
201                                                                         colCells[j]->vectorMap = NULL;
202                                                                 }
203                                                         }
204                                                         break;
205                                                 }
206                                         }
207                                 }
208                                 removeCell(rowCells[i], i , -1);
209                         }
210                 }
211                 clusterBins();
212                 clusterNames();
213
214                 // Special handling for singlelinkage case, not sure whether this
215                 // could be avoided
216                 for (int i=nColCells-1;i>=0;i--) {
217                         if (found[i] == 0) {
218                                 removeCell(colCells[i], -1, i);
219                         }
220                 }
221         }
222         catch(exception& e) {
223                 errorOut(e, "Cluster", "update");
224                 exit(1);
225         }
226 }
227 /***********************************************************************/
228 void Cluster::setMapWanted(bool m)  {  
229         try {
230                 mapWanted = m;
231                 
232                 //initialize map
233                 for (int i = 0; i < list->getNumBins(); i++) {
234                         
235                         //parse bin 
236                         string names = list->get(i);
237                         while (names.find_first_of(',') != -1) { 
238                                 //get name from bin
239                                 string name = names.substr(0,names.find_first_of(','));
240                                 //save name and bin number
241                                 seq2Bin[name] = i;
242                                 names = names.substr(names.find_first_of(',')+1, names.length());
243                         }
244                         
245                         //get last name
246                         seq2Bin[names] = i;
247                 }
248                 
249         }
250         catch(exception& e) {
251                 errorOut(e, "Cluster", "setMapWanted");
252                 exit(1);
253         }
254 }
255 /***********************************************************************/
256 void Cluster::updateMap() {
257 try {
258                 //update location of seqs in smallRow since they move to smallCol now
259                 string names = list->get(smallRow);
260                 while (names.find_first_of(',') != -1) { 
261                         //get name from bin
262                         string name = names.substr(0,names.find_first_of(','));
263                         //save name and bin number
264                         seq2Bin[name] = smallCol;
265                         names = names.substr(names.find_first_of(',')+1, names.length());
266                 }
267                         
268                 //get last name
269                 seq2Bin[names] = smallCol;
270                 
271         }
272         catch(exception& e) {
273                 errorOut(e, "Cluster", "updateMap");
274                 exit(1);
275         }
276 }
277 /***********************************************************************/
278
279
280