]> git.donarmstrong.com Git - mothur.git/blob - cluster.cpp
added warning about merging with something above cutoff to cluster. working on chimeras
[mothur.git] / cluster.cpp
1 /*
2  *  cluster.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 8/14/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "cluster.hpp"
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13 #include "sparsematrix.hpp"
14
15 /***********************************************************************/
16
17 Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c) :
18 rabund(rav), list(lv), dMatrix(dm)
19 {
20 /*
21         cout << "sizeof(MatData): " << sizeof(MatData) << endl;
22         cout << "sizeof(PCell*): " << sizeof(PCell*) << endl;
23
24         int nCells = dMatrix->getNNodes();
25         time_t start = time(NULL);
26
27         MatVec matvec = MatVec(nCells); 
28         int i = 0;
29         for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
30                 matvec[i++] = currentCell;
31         }
32         for (i= matvec.size();i>0;i--) {
33                 dMatrix->rmCell(matvec[i-1]);
34         }
35         MatData it = dMatrix->begin(); 
36         while (it != dMatrix->end()) { 
37                 it = dMatrix->rmCell(it);
38         }
39         cout << "Time to remove " << nCells << " cells: " << time(NULL) - start << " seconds" << endl;
40     exit(0);
41         MatData it = dMatrix->begin();
42         cout << it->row << "/" << it->column << "/" << it->dist << endl;
43         dMatrix->rmCell(dMatrix->begin());
44         cout << it->row << "/" << it->column << "/" << it->dist << endl;
45         exit(0);
46 */
47
48         // Create a data structure to quickly access the PCell information
49         // for a certain sequence. It consists of a vector of lists, where 
50         // a list contains pointers (iterators) to the all distances related
51         // to a certain sequence. The Vector is accessed via the index of a 
52         // sequence in the distance matrix.
53         seqVec = vector<MatVec>(lv->size());
54         for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
55                 seqVec[currentCell->row].push_back(currentCell);
56                 seqVec[currentCell->column].push_back(currentCell);
57         }
58         mapWanted = false;  //set to true by mgcluster to speed up overlap merge
59         
60         //save so you can modify as it changes in average neighbor
61         cutoff = c;
62 }
63
64 /***********************************************************************/
65
66 void Cluster::getRowColCells() {
67         try {
68                 PCell* smallCell = dMatrix->getSmallestCell();  //find the smallest cell - this routine should probably not be in the SpMat class
69         
70                 smallRow = smallCell->row;              // get its row
71                 smallCol = smallCell->column;   // get its column
72                 smallDist = smallCell->dist;    // get the smallest distance
73         
74                 rowCells = seqVec[smallRow];    // all distances related to the row index
75                 colCells = seqVec[smallCol];    // all distances related to the column index
76                 nRowCells = rowCells.size();
77                 nColCells = colCells.size();
78         }
79         catch(exception& e) {
80                 errorOut(e, "Cluster", "getRowColCells");
81                 exit(1);
82         }
83
84 }
85 /***********************************************************************/
86 // Remove the specified cell from the seqVec and from the sparse
87 // matrix
88 void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix)
89 {
90         ull drow = cell->row;
91         ull dcol = cell->column;
92         if (((vrow >=0) && (drow != smallRow)) ||
93                 ((vcol >=0) && (dcol != smallCol))) {
94                 ull dtemp = drow;
95                 drow = dcol;
96                 dcol = dtemp;
97         }
98
99         ull crow;
100         ull ccol;
101         int nCells;
102         if (vrow < 0) {
103                 nCells = seqVec[drow].size();
104                 for (vrow=0; vrow<nCells;vrow++) {
105                         crow = seqVec[drow][vrow]->row;
106                         ccol = seqVec[drow][vrow]->column;
107                         if (((crow == drow) && (ccol == dcol)) ||
108                                 ((ccol == drow) && (crow == dcol))) {
109                                 break;
110                         }
111                 }
112         }
113         seqVec[drow].erase(seqVec[drow].begin()+vrow);
114         if (vcol < 0) {
115                 nCells = seqVec[dcol].size();
116                 for (vcol=0; vcol<nCells;vcol++) {
117                         crow = seqVec[dcol][vcol]->row;
118                         ccol = seqVec[dcol][vcol]->column;
119                         if (((crow == drow) && (ccol == dcol)) ||
120                                 ((ccol == drow) && (crow == dcol))) {
121                                 break;
122                         }
123                 }
124         }
125         seqVec[dcol].erase(seqVec[dcol].begin()+vcol);
126         if (rmMatrix) {
127                 dMatrix->rmCell(cell);
128         }
129 }
130
131
132 /***********************************************************************/
133
134 void Cluster::clusterBins(){
135         try {
136         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
137
138                 rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol));     
139                 rabund->set(smallRow, 0);       
140                 rabund->setLabel(toString(smallDist));
141
142         //      cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
143         }
144         catch(exception& e) {
145                 errorOut(e, "Cluster", "clusterBins");
146                 exit(1);
147         }
148
149
150 }
151
152 /***********************************************************************/
153
154 void Cluster::clusterNames(){
155         try {
156         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
157                 if (mapWanted) {  updateMap();  }
158                 
159                 list->set(smallCol, list->get(smallRow)+','+list->get(smallCol));
160                 list->set(smallRow, "");        
161                 list->setLabel(toString(smallDist));
162         
163         //      cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
164     }
165         catch(exception& e) {
166                 errorOut(e, "Cluster", "clusterNames");
167                 exit(1);
168         }
169
170 }
171
172 /***********************************************************************/
173 //This function clusters based on the method of the derived class
174 //At the moment only average and complete linkage are covered, because
175 //single linkage uses a different approach.
176 void Cluster::update(double& cutOFF){
177         try {
178                 getRowColCells();       
179         
180                 vector<int> foundCol(nColCells, 0);
181
182                 int search;
183                 bool changed;
184
185                 // The vector has to be traversed in reverse order to preserve the index
186                 // for faster removal in removeCell()
187                 for (int i=nRowCells-1;i>=0;i--) {
188                         if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) {
189                                 if (rowCells[i]->row == smallRow) {
190                                         search = rowCells[i]->column;
191                                 } else {
192                                         search = rowCells[i]->row;
193                                 }
194                                 
195                                 bool merged = false;
196                                 for (int j=0;j<nColCells;j++) {
197                                         if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) { //if you are not hte smallest distance
198                                                 if (colCells[j]->row == search || colCells[j]->column == search) {
199                                                         foundCol[j] = 1;
200                                                         merged = true;
201                                                         changed = updateDistance(colCells[j], rowCells[i]);
202                                                         // If the cell's distance changed and it had the same distance as 
203                                                         // the smallest distance, invalidate the mins vector in SparseMatrix
204                                                         if (changed) {
205                                                                 if (colCells[j]->vectorMap != NULL) {
206                                                                         *(colCells[j]->vectorMap) = NULL;
207                                                                         colCells[j]->vectorMap = NULL;
208                                                                 }
209                                                         }
210                                                         break;
211                                                 }
212                                         }               
213                                 }
214                                 //if not merged it you need it for warning 
215                                 if (!merged) {  
216                                         mothurOut("Warning: trying to merge cell " + toString(rowCells[i]->row+1) + " " + toString(rowCells[i]->column+1) + " distance " + toString(rowCells[i]->dist) + " with value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); mothurOutEndLine(); 
217                                         if (cutOFF > rowCells[i]->dist) {  cutOFF = rowCells[i]->dist;  mothurOut("changing cutoff to " + toString(cutOFF));  mothurOutEndLine(); }
218
219                                 }
220                                 removeCell(rowCells[i], i , -1);  
221                                 
222                         }
223                 }
224                 clusterBins();
225                 clusterNames();
226
227                 // Special handling for singlelinkage case, not sure whether this
228                 // could be avoided
229                 for (int i=nColCells-1;i>=0;i--) {
230                         if (foundCol[i] == 0) {
231                                 if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) {
232                                         mothurOut("Warning: merging cell " + toString(colCells[i]->row+1) + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); mothurOutEndLine();
233                                         if (cutOFF > colCells[i]->dist) {  cutOFF = colCells[i]->dist;  mothurOut("changing cutoff to " + toString(cutOFF));  mothurOutEndLine(); }
234                                 }
235                                 removeCell(colCells[i], -1, i);
236                         }
237                 }
238         }
239         catch(exception& e) {
240                 errorOut(e, "Cluster", "update");
241                 exit(1);
242         }
243 }
244 /***********************************************************************/
245 void Cluster::setMapWanted(bool m)  {  
246         try {
247                 mapWanted = m;
248                 
249                 //initialize map
250                 for (int i = 0; i < list->getNumBins(); i++) {
251                         
252                         //parse bin 
253                         string names = list->get(i);
254                         while (names.find_first_of(',') != -1) { 
255                                 //get name from bin
256                                 string name = names.substr(0,names.find_first_of(','));
257                                 //save name and bin number
258                                 seq2Bin[name] = i;
259                                 names = names.substr(names.find_first_of(',')+1, names.length());
260                         }
261                         
262                         //get last name
263                         seq2Bin[names] = i;
264                 }
265                 
266         }
267         catch(exception& e) {
268                 errorOut(e, "Cluster", "setMapWanted");
269                 exit(1);
270         }
271 }
272 /***********************************************************************/
273 void Cluster::updateMap() {
274 try {
275                 //update location of seqs in smallRow since they move to smallCol now
276                 string names = list->get(smallRow);
277                 while (names.find_first_of(',') != -1) { 
278                         //get name from bin
279                         string name = names.substr(0,names.find_first_of(','));
280                         //save name and bin number
281                         seq2Bin[name] = smallCol;
282                         names = names.substr(names.find_first_of(',')+1, names.length());
283                 }
284                         
285                 //get last name
286                 seq2Bin[names] = smallCol;
287                 
288         }
289         catch(exception& e) {
290                 errorOut(e, "Cluster", "updateMap");
291                 exit(1);
292         }
293 }
294 /***********************************************************************/
295
296
297