5 // Created by SarahsWork on 12/10/13.
6 // Copyright (c) 2013 Schloss Lab. All rights reserved.
10 #define DBL_EPSILON 1e-9
12 /**************************************************************************************************/
13 Pam::Pam(vector<vector<int> > c, vector<vector<double> > d, int p) : CommunityTypeFinder() {
16 numSamples = (int)d.size();
17 numOTUs = (int)c[0].size();
22 for (int i = 0; i < dists.size(); i++) {
23 for (int j = i; j < dists.size(); j++) {
24 if (m->control_pressed) { break; }
25 if (dists[i][j] > largestDist) { largestDist = dists[i][j]; }
29 buildPhase(); //choosing the medoids
30 swapPhase(); //optimize clusters
33 m->errorOut(e, "Pam", "Pam");
37 /**************************************************************************************************/
38 //sets Dp[0] does not set Dp[1]. chooses intial medoids.
39 int Pam::buildPhase() {
42 if (m->debug) { m->mothurOut("[DEBUG]: building medoids\n"); }
44 vector<double> gains; gains.resize(numSamples);
46 largestDist *= 1.1 + 1; //make this distance larger than any distance in the matrix
47 Dp.resize(numSamples);
48 for (int i = 0; i < numSamples; i++) { Dp[i].push_back(largestDist); Dp[i].push_back(largestDist); } //2 smallest dists for this sample in this partition
50 zMatrix.resize(numPartitions);
51 for(int i=0;i<numPartitions;i++){
52 zMatrix[i].assign(numSamples, 0);
55 for (int k = 0; k < numPartitions; k++) {
58 double totalGain = 0.0;
59 double clusterGain = 0.0;
61 for (int i = 0; i < numSamples; i++) { //does this need to be square?? can we do lt?
62 if (m->control_pressed) { break; }
64 if (medoids.count(i) == 0) { //is this sample is NOT a medoid?
67 for (int j = 0; j < numSamples; j++) {
68 //cout << i << '\t' << j << '\t' << Dp[i][0] << '\t' << dists[i][j] << '\t' << totalGain << endl;
69 totalGain = Dp[i][0] - dists[i][j];
70 if (totalGain > 0.0) { gains[i] += totalGain; }
72 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) + " totalGain = " + toString(totalGain) + "\n"); }
74 if (clusterGain <= gains[i]) {
75 clusterGain = gains[i];
82 medoids.insert(medoid);
84 if (m->debug) { m->mothurOut("[DEBUG]: new medoid " + toString(medoid) + "\n"); }
87 for (int i = 0; i < numSamples; i++) {
88 if (Dp[i][0] > dists[i][medoid]) { Dp[i][0] = dists[i][medoid]; }
91 if (m->debug) { m->mothurOut("[DEBUG]: done building medoids\n"); }
95 m->errorOut(e, "Pam", "buildPhase");
99 /**************************************************************************************************/
100 //goal to swap medoids with non-medoids to see if we can reduce the overall cost
101 int Pam::swapPhase() {
103 if (m->debug) { m->mothurOut("[DEBUG]: swapping medoids\n"); }
104 //calculate cost of initial choice - average distance of samples to their closest medoid
107 for (int i = 0; i < numSamples; i++) { sky += Dp[i][0]; } sky /= (double) numSamples;
110 int hbest, nbest; hbest = -1; nbest = -1;
112 if (m->control_pressed) { break; }
118 for (int h = 0; h < numSamples; h++) {
119 if (m->control_pressed) { break; }
120 if (medoids.count(h) == 0) { //this is NOT a medoid
121 for (int i = 0; i < numSamples; i++) {
122 if (medoids.count(i) != 0) { //this is a medoid
124 double dz = 0.0; //Tih sum of distances between objects and closest medoid caused by swapping i and h. Basically the change in cost. If this < 0 its a "good" swap. When all Tih are > 0, then we stop the algo, because we have the optimal medoids.
125 for (int j = 0; j < numSamples; j++) {
126 if (m->control_pressed) { break; }
127 if (dists[i][j] == Dp[j][0]) {
129 if (Dp[j][1] > dists[h][j]) { small = dists[h][j]; }
130 else { small = Dp[j][1]; }
131 dz += (small - Dp[j][0]);
132 }else if (dists[h][j] < Dp[j][0]) {
133 dz += (dists[h][j] - Dp[j][0]);
146 if (dzsky < -16 *DBL_EPSILON * fabs(sky)) {
147 medoids.insert(hbest);
148 medoids.erase(nbest);
149 if (m->debug) { m->mothurOut("[DEBUG]: swapping " + toString(hbest) + " " + toString(nbest) + "\n"); }
151 }else { done = true; } //stop algo.
157 vector<int> tempMedoids;
158 for (set<int>::iterator it = medoids.begin(); it != medoids.end(); it++) {
159 medoid2Partition[*it] = count;
160 zMatrix[count][*it] = 1; count++; //set medoid in this partition.
161 tempMedoids.push_back(*it);
164 //which partition do you belong to?
166 for (int i = 0; i < numSamples; i++) {
168 double dist = dists[i][tempMedoids[0]]; //assign to first medoid
169 for (int j = 1; j < tempMedoids.size(); j++) {
170 if (dists[i][tempMedoids[j]] < dist) { //is this medoid closer?
171 dist = dists[i][tempMedoids[j]];
175 zMatrix[partition][i] = 1;
178 laplace /= (double) numSamples;
181 for(int i=0;i<numPartitions;i++){
182 m->mothurOut("[DEBUG]: partition 1: ");
183 for (int j = 0; j < numSamples; j++) {
184 m->mothurOut(toString(zMatrix[i][j]) + " ");
188 m->mothurOut("[DEBUG]: medoids : ");
189 for (set<int>::iterator it = medoids.begin(); it != medoids.end(); it++) { m->mothurOut(toString(*it) + " ");
193 m->mothurOut("[DEBUG]: laplace : " + toString(laplace)); m->mothurOut("\n");
196 if (m->debug) { m->mothurOut("[DEBUG]: done swapping medoids\n"); }
199 catch(exception& e) {
200 m->errorOut(e, "Pam", "swapPhase");
205 /**************************************************************************************************/
206 int Pam::updateDp() {
208 for (int j = 0; j < numSamples; j++) {
209 if (m->control_pressed) { break; }
211 //initialize dp and ep
212 Dp[j][0] = largestDist; Dp[j][1] = largestDist;
214 for (int i = 0; i < numSamples; i++) {
215 if (medoids.count(i) != 0) { //is this a medoid?
216 if (Dp[j][0] > dists[j][i]) {
218 Dp[j][0] = dists[j][i];
219 }else if (Dp[j][1] > dists[j][i]) {
220 Dp[j][1] = dists[j][i];
227 catch(exception& e) {
228 m->errorOut(e, "Pam", "updateDp");
232 /**************************************************************************************************/
233 //The silhouette width S(i)of individual data points i is calculated using the following formula:
238 where a(i) is the average dissimilarity (or distance) of sample i to all other samples in the same cluster, while b(i) is the average dissimilarity (or distance) to all objects in the closest other cluster.
240 The formula implies -1 =< S(i) =< 1 . A sample which is much closer to its own cluster than to any other cluster has a high S(i) value, while S(i) close to 0 implies that the given sample lies somewhere between two clusters. Large negative S(i) values indicate that the sample was assigned to the wrong cluster.
243 vector<double> Pam::calcSilhouettes(vector<vector<double> > dists) {
245 vector<double> silhouettes; silhouettes.resize(numSamples, 0.0);
246 if (numPartitions < 2) { return silhouettes; }
249 for (int i = 0; i < numSamples; i++) { // assumes at least 2 partitions
250 vector<seqDist> tempMeds;
251 for (set<int>::iterator it = medoids.begin(); it != medoids.end(); it++) {
252 if (m->control_pressed) { break; }
253 seqDist temp(*it, *it, dists[i][*it]); //medoid, medoid, distance from medoid to sample
254 tempMeds.push_back(temp);
256 sort(tempMeds.begin(), tempMeds.end(), compareSequenceDistance); //sort by distance
257 whoDp.push_back(tempMeds[1].seq1); //second closest medoid
261 vector<double> a, b; a.resize(numSamples, 0.0); b.resize(numSamples, 0.0);
263 //calc a - all a[i] are the same in the same partition
264 for (int k = 0; k < numPartitions; k++) {
265 if (m->control_pressed) { break; }
269 for (int i = 0; i < numSamples; i++) {
270 for (int j = 0; j < numSamples; j++) {
271 if (m->control_pressed) { break; }
272 if ((zMatrix[k][i] != 0) && (zMatrix[k][j] != 0)){ //are both samples in the partition, if so add there distance
273 totalA += dists[j][i]; //distance from this sample to the other samples in the partition
278 totalA /= (double) count;
280 //set a[i] to average for cluster
281 for (int i = 0; i < numSamples; i++) {
282 if (zMatrix[k][i] != 0) { a[i] = totalA; }
287 for (int i = 0; i < numSamples; i++) {
288 if (m->control_pressed) { break; }
290 int nextClosestMedoid = whoDp[i];
291 int thisPartition = medoid2Partition[nextClosestMedoid];
294 for (int j = 0; j < numSamples; j++) {
295 if (zMatrix[thisPartition][j] != 0) { //this sample is in this partition
296 totalB += dists[i][j];
300 b[i] = totalB / (double) count;
304 for (int i = 0; i < numSamples; i++) {
305 if (m->control_pressed) { break; }
308 if (b[i] > denom) { denom = b[i]; } //max(a[i],b[i])
310 silhouettes[i] = (b[i] - a[i]) / denom;
312 //cout << "silhouettes " << i << '\t' << silhouettes[i] << endl;
317 catch(exception& e) {
318 m->errorOut(e, "Pam", "calcSilhouettes");
322 /**************************************************************************************************/
323 /*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as:
325 CHk=Bk/(k−1)/Wk/(n−k)
327 where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/
328 double Pam::calcCHIndex(vector< vector<double> > dists){ //countMatrix = [numSamples][numOtus]
332 if (numPartitions < 2) { return CH; }
334 map<int, int> clusterMap; //map sample to partition
335 for (int i = 0; i < numPartitions; i++) {
336 for (int j = 0; j < numSamples; j++) {
337 if (m->control_pressed) { return 0.0; }
338 if (zMatrix[i][j] != 0) { clusterMap[j] = i; }
342 double sumBetweenCluster = 0.0;
343 double sumWithinClusters = 0.0;
345 for (int i = 0; i < numSamples; i++) { //lt
346 for (int j = 0; j < i; j++) {
347 if (m->control_pressed) { return 0.0; }
348 int partitionI = clusterMap[i];
349 int partitionJ = clusterMap[j];
351 if (partitionI == partitionJ) { //they are from the same cluster so this distance is added to Wk
352 sumWithinClusters += (dists[i][j] * dists[i][j]);
353 }else { //they are NOT from the same cluster so this distance is added to Bk
354 sumBetweenCluster += (dists[i][j] * dists[i][j]);
358 //cout << numPartitions << '\t' << sumWithinClusters << '\t' << sumBetweenCluster << '\t' << (numSamples - numPartitions) << endl;
360 CH = (sumBetweenCluster / (double)(numPartitions - 1)) / (sumWithinClusters / (double) (numSamples - numPartitions));
365 m->errorOut(e, "Pam", "calcCHIndex");
370 /**************************************************************************************************/