]> git.donarmstrong.com Git - mothur.git/blob - hcluster.cpp
precluster command finished
[mothur.git] / hcluster.cpp
1 /*
2  *  hcluster.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/13/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "hcluster.h"
11 #include "rabundvector.hpp"
12 #include "listvector.hpp"
13 #include "sparsematrix.hpp"
14
15
16 /***********************************************************************/
17
18 HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m) :  rabund(rav), list(lv), method(m){
19         try {
20                 mapWanted = false;
21                 exitedBreak = false; 
22                 numSeqs = list->getNumSeqs();
23                 
24                 //initialize cluster array
25                 for (int i = 0; i < numSeqs; i++) {
26                         clusterNode temp(1, -1, i);
27                         clusterArray.push_back(temp);
28                 }
29                 
30         }
31         catch(exception& e) {
32                 errorOut(e, "HCluster", "HCluster");
33                 exit(1);
34         }
35 }
36 /***********************************************************************/
37
38 void HCluster::clusterBins(){
39         try {
40                 //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
41
42                 rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));  
43                 rabund->set(clusterArray[smallRow].smallChild, 0);      
44                 rabund->setLabel(toString(smallDist));
45
46                 //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
47         }
48         catch(exception& e) {
49                 errorOut(e, "HCluster", "clusterBins");
50                 exit(1);
51         }
52
53
54 }
55
56 /***********************************************************************/
57
58 void HCluster::clusterNames(){
59         try {
60                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
61                 if (mapWanted) {  updateMap();  }
62                 
63                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
64                 list->set(clusterArray[smallRow].smallChild, "");       
65                 list->setLabel(toString(smallDist));
66         
67                 //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
68
69     }
70         catch(exception& e) {
71                 errorOut(e, "HCluster", "clusterNames");
72                 exit(1);
73         }
74
75 }
76 /***********************************************************************/
77 int HCluster::getUpmostParent(int node){
78         try {
79                 
80                 while (clusterArray[node].parent != -1) {
81                         node = clusterArray[node].parent;
82                 }
83                 
84                 return node;
85         }
86         catch(exception& e) {
87                 errorOut(e, "HCluster", "getUpmostParent");
88                 exit(1);
89         }
90 }
91 /***********************************************************************/
92 void HCluster::printInfo(){
93         try {
94                 
95                 cout << "link table" << endl;
96                 for (it = activeLinks.begin(); it!= activeLinks.end(); it++) {
97                         cout << it->first << " = " << it->second << endl;
98                 }
99                 cout << endl;
100                 for (int i = 0; i < linkTable.size(); i++) {
101                         cout << i << '\t';
102                         for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
103                                 cout << it->first << '-' << it->second << '\t';
104                         }
105                         cout << endl;
106                 }
107                 cout << endl << "clusterArray" << endl;
108                 
109                 for (int i = 0; i < clusterArray.size(); i++) {
110                         cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
111                 }
112                 cout << endl;
113                 
114                 
115         }
116         catch(exception& e) {
117                 errorOut(e, "HCluster", "getUpmostParent");
118                 exit(1);
119         }
120 }
121 /***********************************************************************/
122 int HCluster::makeActive() {
123         try {
124         
125                 int linkValue = 1; 
126 //cout << "active - here" << endl;              
127                 it = activeLinks.find(smallRow);
128                 it2 = activeLinks.find(smallCol);
129                 
130                 if ((it == activeLinks.end()) && (it2 == activeLinks.end())) { //both are not active so add them
131                         int size = linkTable.size();
132                         map<int, int> temp; map<int, int> temp2;
133                         
134                         //add link to eachother
135                         temp[smallRow] = 1;                                                     //         1    2
136                         temp2[smallCol] = 1;                                            // 1   0        1
137                                                                                                                 // 2   1        0
138                         linkTable.push_back(temp);
139                         linkTable.push_back(temp2);
140                         
141                         //add to activeLinks
142                         activeLinks[smallRow] = size;
143                         activeLinks[smallCol] = size+1;
144 //cout << "active - here1" << endl;
145                 }else if ((it != activeLinks.end()) && (it2 == activeLinks.end())) {  //smallRow is active, smallCol is not
146                          int size = linkTable.size();
147                          int alreadyActiveRow = it->second;
148                          map<int, int> temp; 
149                         
150                         //add link to eachother
151                         temp[smallRow] = 1;                                                     //         6    2       3       5
152                         linkTable.push_back(temp);                                      // 6   0        1       2       0
153                         linkTable[alreadyActiveRow][smallCol] = 1;      // 2   1        0       1       1
154                                                                                                                 // 3   2        1       0       0
155                                                                                                                 // 5   0    1   0   0   
156                         //add to activeLinks
157                         activeLinks[smallCol] = size;
158 //cout << "active - here2" << endl;                     
159                 }else if ((it == activeLinks.end()) && (it2 != activeLinks.end())) {  //smallCol is active, smallRow is not
160                          int size = linkTable.size();
161                          int alreadyActiveCol = it2->second;
162                          map<int, int> temp; 
163                         
164                         //add link to eachother
165                         temp[smallCol] = 1;                                                     //         6    2       3       5
166                         linkTable.push_back(temp);                                      // 6   0        1       2       0
167                         linkTable[alreadyActiveCol][smallRow] = 1;      // 2   1        0       1       1
168                                                                                                                 // 3   2        1       0       0
169                                                                                                                 // 5   0    1   0   0   
170                         //add to activeLinks
171                         activeLinks[smallRow] = size;
172 //cout << "active - here3" << endl;
173                 }else { //both are active so add one
174                         int row = it->second;
175                         int col = it2->second;
176 //cout << row << '\t' << col << endl;                   
177                         
178                         linkTable[row][smallCol]++;
179                         linkTable[col][smallRow]++;
180                         linkValue = linkTable[row][smallCol];
181 //cout << "active - here4" << endl;
182                 }
183                 
184                 return linkValue;
185         }
186         catch(exception& e) {
187                 errorOut(e, "HCluster", "makeActive");
188                 exit(1);
189         }
190 }
191 /***********************************************************************/
192 void HCluster::updateArrayandLinkTable() {
193         try {
194                 //if cluster was made update clusterArray and linkTable
195                         int size = clusterArray.size();
196                         
197                         //add new node
198                         clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
199                         clusterArray.push_back(temp);
200                         
201                         //update child nodes
202                         clusterArray[smallRow].parent = size;
203                         clusterArray[smallCol].parent = size;
204                         
205                         //update linkTable by merging clustered rows and columns
206                         int rowSpot = activeLinks[smallRow];
207                         int colSpot = activeLinks[smallCol];
208         //cout << "here" << endl;               
209                         //fix old rows
210                         for (int i = 0; i < linkTable.size(); i++) {
211                                 //check if they are in map
212                                 it = linkTable[i].find(smallRow);
213                                 it2 = linkTable[i].find(smallCol);
214                                 
215                                 if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
216                                         linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
217                                         linkTable[i].erase(smallCol); //delete col row
218                                         linkTable[i].erase(smallRow); //delete col row
219                                 }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
220                                         linkTable[i][size] = linkTable[i][smallCol];
221                                         linkTable[i].erase(smallCol); //delete col 
222                                 }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
223                                         linkTable[i][size] = linkTable[i][smallRow];
224                                         linkTable[i].erase(smallRow); //delete col 
225                                 }
226                         }
227         //printInfo();
228 //cout << "here2" << endl;
229                         //merge their values
230                         for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
231                                 it2 = linkTable[colSpot].find(it->first);  //does the col also have this
232                                 
233                                 if (it2 == linkTable[colSpot].end()) { //not there so add it
234                                         linkTable[colSpot][it->first] = it->second;
235                                 }else { //merge them
236                                         linkTable[colSpot][it->first] = it->second+it2->second;
237                                 }
238                         }
239 //cout << "here3" << endl;                      
240                         linkTable[colSpot].erase(size);
241                         linkTable.erase(linkTable.begin()+rowSpot);  //delete row
242         //printInfo();          
243                         //update activerows
244                         activeLinks.erase(smallRow);
245                         activeLinks.erase(smallCol);
246                         activeLinks[size] = colSpot;
247                         
248                         //adjust everybody elses spot since you deleted - time vs. space
249                         for (it = activeLinks.begin(); it != activeLinks.end(); it++) {
250                                 if (it->second > rowSpot) {  activeLinks[it->first]--;  }
251                         }
252                         
253 //cout << "here4" << endl;
254         
255         }
256         catch(exception& e) {
257                 errorOut(e, "HCluster", "updateArrayandLinkTable");
258                 exit(1);
259         }
260 }
261 /***********************************************************************/
262 void HCluster::update(int row, int col, float distance){
263         try {
264                 
265                 smallRow = row;
266                 smallCol = col;
267                 smallDist = distance;
268                 
269                 //find upmost parent of row and col
270                 smallRow = getUpmostParent(smallRow);
271                 smallCol = getUpmostParent(smallCol);
272         //cout << "row = " << row << " smallRow = " << smallRow <<  " col = " << col << " smallCol = " << smallCol << " dist = " << distance << endl;
273                 //you don't want to cluster with yourself
274                 if (smallRow != smallCol) {
275                         //are they active in the link table
276                         int linkValue = makeActive(); //after this point this nodes info is active in linkTable
277                         //printInfo();                  
278                         //cout << "linkValue = " << linkValue << " times = " << (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq) << endl;
279                         //can we cluster???
280                         bool cluster = false;
281                         
282                         if (method == "nearest") { cluster = true;  }
283                         else if (method == "average") { cout << "still working on this... " << endl; //got to figure this out 
284                         }else{ //assume furthest
285                                 if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) { cluster = true; }
286                         }
287                         
288                         if (cluster) { 
289                                 updateArrayandLinkTable();
290                                 clusterBins();
291                                 clusterNames();
292                         }
293                 }
294                 //printInfo();
295         }
296         catch(exception& e) {
297                 errorOut(e, "HCluster", "update");
298                 exit(1);
299         }
300 }
301 /***********************************************************************/
302 void HCluster::setMapWanted(bool m)  {  
303         try {
304                 mapWanted = m;
305                 
306                 //initialize map
307                 for (int i = 0; i < list->getNumBins(); i++) {
308                         
309                         //parse bin 
310                         string names = list->get(i);
311                         while (names.find_first_of(',') != -1) { 
312                                 //get name from bin
313                                 string name = names.substr(0,names.find_first_of(','));
314                                 //save name and bin number
315                                 seq2Bin[name] = i;
316                                 names = names.substr(names.find_first_of(',')+1, names.length());
317                         }
318                         
319                         //get last name
320                         seq2Bin[names] = i;
321                 }
322                 
323         }
324         catch(exception& e) {
325                 errorOut(e, "HCluster", "setMapWanted");
326                 exit(1);
327         }
328 }
329 /***********************************************************************/
330 void HCluster::updateMap() {
331 try {
332                 //update location of seqs in smallRow since they move to smallCol now
333                 string names = list->get(clusterArray[smallRow].smallChild);
334                 while (names.find_first_of(',') != -1) { 
335                         //get name from bin
336                         string name = names.substr(0,names.find_first_of(','));
337                         //save name and bin number
338                         seq2Bin[name] = clusterArray[smallCol].smallChild;
339                         names = names.substr(names.find_first_of(',')+1, names.length());
340                 }
341                         
342                 //get last name
343                 seq2Bin[names] = clusterArray[smallCol].smallChild;
344         }
345         catch(exception& e) {
346                 errorOut(e, "HCluster", "updateMap");
347                 exit(1);
348         }
349 }
350 //**********************************************************************************************************************
351 vector<seqDist> HCluster::getSeqs(ifstream& filehandle, NameAssignment* nameMap, float cutoff){
352         try {
353                 string firstName, secondName;
354                 float distance, prevDistance;
355                 vector<seqDist> sameSeqs;
356                 prevDistance = -1;
357         
358                 //if you are not at the beginning of the file
359                 if (exitedBreak) { 
360                         sameSeqs.push_back(next);
361                         prevDistance = next.dist;
362                         exitedBreak = false;
363                 }
364         
365                 //get entry
366                 while (!filehandle.eof()) {
367                         
368                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
369         
370                         //save first one
371                         if (prevDistance == -1) { prevDistance = distance; }
372                         
373                         map<string,int>::iterator itA = nameMap->find(firstName);
374                         map<string,int>::iterator itB = nameMap->find(secondName);
375                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
376                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
377                 
378                         //using cutoff
379                         if (distance > cutoff) { break; }
380                 
381                         if (distance != -1) { //-1 means skip me
382                         
383                                 //are the distances the same
384                                 if (distance == prevDistance) { //save in vector
385                                         seqDist temp(itA->second, itB->second, distance);
386                                         sameSeqs.push_back(temp);
387                                         exitedBreak = false;
388                                 }else{ 
389                                         next.seq1 = itA->second;
390                                         next.seq2 = itB->second;
391                                         next.dist = distance;
392                                         exitedBreak = true;
393                                         break;
394                                 }
395                         }
396                 }
397                 
398                 //rndomize matching dists
399                 random_shuffle(sameSeqs.begin(), sameSeqs.end());
400                 
401                 return sameSeqs;
402         }
403         catch(exception& e) {
404                 errorOut(e, "HCluster", "getSeqs");
405                 exit(1);
406         }
407 }
408 /***********************************************************************/
409
410
411