]> git.donarmstrong.com Git - mothur.git/blob - cluster.cpp
added MPI code, broke up chimera.seqs into 5 separated commands, added parse.sff...
[mothur.git] / cluster.cpp
1 /*
2  *  cluster.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 8/14/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "cluster.hpp"
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13 #include "sparsematrix.hpp"
14
15 /***********************************************************************/
16
17 Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string f) :
18 rabund(rav), list(lv), dMatrix(dm), method(f)
19 {
20 /*
21         cout << "sizeof(MatData): " << sizeof(MatData) << endl;
22         cout << "sizeof(PCell*): " << sizeof(PCell*) << endl;
23
24         int nCells = dMatrix->getNNodes();
25         time_t start = time(NULL);
26
27         MatVec matvec = MatVec(nCells); 
28         int i = 0;
29         for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
30                 matvec[i++] = currentCell;
31         }
32         for (i= matvec.size();i>0;i--) {
33                 dMatrix->rmCell(matvec[i-1]);
34         }
35         MatData it = dMatrix->begin(); 
36         while (it != dMatrix->end()) { 
37                 it = dMatrix->rmCell(it);
38         }
39         cout << "Time to remove " << nCells << " cells: " << time(NULL) - start << " seconds" << endl;
40     exit(0);
41         MatData it = dMatrix->begin();
42         cout << it->row << "/" << it->column << "/" << it->dist << endl;
43         dMatrix->rmCell(dMatrix->begin());
44         cout << it->row << "/" << it->column << "/" << it->dist << endl;
45         exit(0);
46 */
47
48         // Create a data structure to quickly access the PCell information
49         // for a certain sequence. It consists of a vector of lists, where 
50         // a list contains pointers (iterators) to the all distances related
51         // to a certain sequence. The Vector is accessed via the index of a 
52         // sequence in the distance matrix.
53         
54
55         seqVec = vector<MatVec>(lv->size());
56         for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
57                 seqVec[currentCell->row].push_back(currentCell);
58                 seqVec[currentCell->column].push_back(currentCell);
59         }
60
61         mapWanted = false;  //set to true by mgcluster to speed up overlap merge
62         
63         //save so you can modify as it changes in average neighbor
64         cutoff = c;
65         m = MothurOut::getInstance();
66 }
67
68 /***********************************************************************/
69
70 void Cluster::getRowColCells() {
71         try {
72                 PCell* smallCell = dMatrix->getSmallestCell();  //find the smallest cell - this routine should probably not be in the SpMat class
73         
74                 smallRow = smallCell->row;              // get its row
75                 smallCol = smallCell->column;   // get its column
76                 smallDist = smallCell->dist;    // get the smallest distance
77         
78                 rowCells = seqVec[smallRow];    // all distances related to the row index
79                 colCells = seqVec[smallCol];    // all distances related to the column index
80                 nRowCells = rowCells.size();
81                 nColCells = colCells.size();
82         }
83         catch(exception& e) {
84                 m->errorOut(e, "Cluster", "getRowColCells");
85                 exit(1);
86         }
87
88 }
89 /***********************************************************************/
90 // Remove the specified cell from the seqVec and from the sparse
91 // matrix
92 void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix){
93         try {
94         
95                 ull drow = cell->row;
96                         ull dcol = cell->column;
97                         if (((vrow >=0) && (drow != smallRow)) ||
98                                 ((vcol >=0) && (dcol != smallCol))) {
99                                 ull dtemp = drow;
100                                 drow = dcol;
101                                 dcol = dtemp;
102                         }
103
104                         ull crow;
105                         ull ccol;
106                         int nCells;
107                         if (vrow < 0) {
108                                 nCells = seqVec[drow].size();
109                                 for (vrow=0; vrow<nCells;vrow++) {
110                                         crow = seqVec[drow][vrow]->row;
111                                         ccol = seqVec[drow][vrow]->column;
112                                         if (((crow == drow) && (ccol == dcol)) ||
113                                                 ((ccol == drow) && (crow == dcol))) {
114                                                 break;
115                                         }
116                                 }
117                         }
118
119                         seqVec[drow].erase(seqVec[drow].begin()+vrow);
120                         if (vcol < 0) {
121                                 nCells = seqVec[dcol].size();
122                                 for (vcol=0; vcol<nCells;vcol++) {
123                                         crow = seqVec[dcol][vcol]->row;
124                                         ccol = seqVec[dcol][vcol]->column;
125                                         if (((crow == drow) && (ccol == dcol)) ||
126                                                 ((ccol == drow) && (crow == dcol))) {
127                                                 break;
128                                         }
129                                 }
130                         }
131                 
132                         seqVec[dcol].erase(seqVec[dcol].begin()+vcol);
133                 
134                         if (rmMatrix) {
135                                 dMatrix->rmCell(cell);
136                         }
137                 
138         }
139         catch(exception& e) {
140                 m->errorOut(e, "Cluster", "removeCell");
141                 exit(1);
142         }
143 }
144 /***********************************************************************/
145
146 void Cluster::clusterBins(){
147         try {
148         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
149
150                 rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol));     
151                 rabund->set(smallRow, 0);       
152                 rabund->setLabel(toString(smallDist));
153
154         //      cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
155         }
156         catch(exception& e) {
157                 m->errorOut(e, "Cluster", "clusterBins");
158                 exit(1);
159         }
160
161
162 }
163
164 /***********************************************************************/
165
166 void Cluster::clusterNames(){
167         try {
168         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
169                 if (mapWanted) {  updateMap();  }
170                 
171                 list->set(smallCol, list->get(smallRow)+','+list->get(smallCol));
172                 list->set(smallRow, "");        
173                 list->setLabel(toString(smallDist));
174         
175         //      cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
176     }
177         catch(exception& e) {
178                 m->errorOut(e, "Cluster", "clusterNames");
179                 exit(1);
180         }
181
182 }
183
184 /***********************************************************************/
185 //This function clusters based on the method of the derived class
186 //At the moment only average and complete linkage are covered, because
187 //single linkage uses a different approach.
188 void Cluster::update(double& cutOFF){
189         try {
190                 getRowColCells();       
191
192                 vector<int> foundCol(nColCells, 0);
193
194                 int search;
195                 bool changed;
196
197                 // The vector has to be traversed in reverse order to preserve the index
198                 // for faster removal in removeCell()
199                 for (int i=nRowCells-1;i>=0;i--) {
200                         if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) {
201                                 if (rowCells[i]->row == smallRow) {
202                                         search = rowCells[i]->column;
203                                 } else {
204                                         search = rowCells[i]->row;
205                                 }
206                                 
207                                 bool merged = false;
208                                 for (int j=0;j<nColCells;j++) {
209                                         if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) { //if you are not hte smallest distance
210                                                 if (colCells[j]->row == search || colCells[j]->column == search) {
211                                                         foundCol[j] = 1;
212                                                         merged = true;
213                                                         changed = updateDistance(colCells[j], rowCells[i]);
214                                                         // If the cell's distance changed and it had the same distance as 
215                                                         // the smallest distance, invalidate the mins vector in SparseMatrix
216                                                         if (changed) {
217                                                                 if (colCells[j]->vectorMap != NULL) {
218                                                                         *(colCells[j]->vectorMap) = NULL;
219                                                                         colCells[j]->vectorMap = NULL;
220                                                                 }
221                                                         }
222                                                         break;
223                                                 }
224                                         }               
225                                 }
226                                 //if not merged it you need it for warning 
227                                 if ((!merged) && (method == "average")) {  
228                                         //m->mothurOut("Warning: trying to merge cell " + toString(rowCells[i]->row+1) + " " + toString(rowCells[i]->column+1) + " distance " + toString(rowCells[i]->dist) + " with value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); m->mothurOutEndLine(); 
229                                         if (cutOFF > rowCells[i]->dist) {  
230                                                 cutOFF = rowCells[i]->dist;  
231                                                 //m->mothurOut("changing cutoff to " + toString(cutOFF));  m->mothurOutEndLine(); 
232                                         }
233
234                                 }
235                                 removeCell(rowCells[i], i , -1);  
236                                 
237                         }
238                 }
239                 clusterBins();
240                 clusterNames();
241
242                 // Special handling for singlelinkage case, not sure whether this
243                 // could be avoided
244                 for (int i=nColCells-1;i>=0;i--) {
245                         if (foundCol[i] == 0) {
246                                 if (method == "average") {
247                                         if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) {
248                                                 //m->mothurOut("Warning: merging cell " + toString(colCells[i]->row+1) + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); m->mothurOutEndLine();
249                                                 if (cutOFF > colCells[i]->dist) {  
250                                                         cutOFF = colCells[i]->dist;  
251                                                         //m->mothurOut("changing cutoff to " + toString(cutOFF));  m->mothurOutEndLine(); 
252                                                 }
253                                         }
254                                 }
255                                 removeCell(colCells[i], -1, i);
256                         }
257                 }
258         }
259         catch(exception& e) {
260                 m->errorOut(e, "Cluster", "update");
261                 exit(1);
262         }
263 }
264 /***********************************************************************/
265 void Cluster::setMapWanted(bool f)  {  
266         try {
267                 mapWanted = f;
268                 
269                 //initialize map
270                 for (int i = 0; i < list->getNumBins(); i++) {
271                         
272                         //parse bin 
273                         string names = list->get(i);
274                         while (names.find_first_of(',') != -1) { 
275                                 //get name from bin
276                                 string name = names.substr(0,names.find_first_of(','));
277                                 //save name and bin number
278                                 seq2Bin[name] = i;
279                                 names = names.substr(names.find_first_of(',')+1, names.length());
280                         }
281                         
282                         //get last name
283                         seq2Bin[names] = i;
284                 }
285                 
286         }
287         catch(exception& e) {
288                 m->errorOut(e, "Cluster", "setMapWanted");
289                 exit(1);
290         }
291 }
292 /***********************************************************************/
293 void Cluster::updateMap() {
294 try {
295                 //update location of seqs in smallRow since they move to smallCol now
296                 string names = list->get(smallRow);
297                 while (names.find_first_of(',') != -1) { 
298                         //get name from bin
299                         string name = names.substr(0,names.find_first_of(','));
300                         //save name and bin number
301                         seq2Bin[name] = smallCol;
302                         names = names.substr(names.find_first_of(',')+1, names.length());
303                 }
304                         
305                 //get last name
306                 seq2Bin[names] = smallCol;
307                 
308         }
309         catch(exception& e) {
310                 m->errorOut(e, "Cluster", "updateMap");
311                 exit(1);
312         }
313 }
314 /***********************************************************************/
315
316
317