5 // Created by SarahsWork on 12/10/13.
6 // Copyright (c) 2013 Schloss Lab. All rights reserved.
10 #define DBL_EPSILON 1e-9
12 /**************************************************************************************************/
13 Pam::Pam(vector<vector<int> > c, vector<vector<double> > d, int p) : CommunityTypeFinder() {
16 numSamples = (int)d.size();
17 numOTUs = (int)c[0].size();
22 for (int i = 0; i < dists.size(); i++) {
23 for (int j = i; j < dists.size(); j++) {
24 if (m->control_pressed) { break; }
25 if (dists[i][j] > largestDist) { largestDist = dists[i][j]; }
29 buildPhase(); //choosing the medoids
30 swapPhase(); //optimize clusters
33 m->errorOut(e, "Pam", "Pam");
37 /**************************************************************************************************/
38 //sets Dp[0] does not set Dp[1]. chooses intial medoids.
39 int Pam::buildPhase() {
42 if (m->debug) { m->mothurOut("[DEBUG]: building medoids\n"); }
44 vector<double> gains; gains.resize(numSamples);
46 largestDist *= 1.1 + 1; //make this distance larger than any distance in the matrix
47 Dp.resize(numSamples);
48 for (int i = 0; i < numSamples; i++) { Dp[i].push_back(largestDist); Dp[i].push_back(largestDist); } //2 smallest dists for this sample in this partition
50 zMatrix.resize(numPartitions);
51 for(int i=0;i<numPartitions;i++){
52 zMatrix[i].assign(numSamples, 0);
55 for (int k = 0; k < numPartitions; k++) {
58 double totalGain = 0.0;
59 double clusterGain = 0.0;
61 for (int i = 0; i < numSamples; i++) { //does this need to be square?? can we do lt?
62 if (m->control_pressed) { break; }
64 if (medoids.count(i) == 0) { //is this sample is NOT a medoid?
67 for (int j = 0; j < numSamples; j++) {
68 totalGain = Dp[j][0] - dists[i][j];
69 if (totalGain > 0.0) { gains[i] += totalGain; }
71 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) + " totalGain = " + toString(totalGain) + "\n"); }
73 if (clusterGain <= gains[i]) {
74 clusterGain = gains[i];
81 medoids.insert(medoid);
83 if (m->debug) { m->mothurOut("[DEBUG]: new medoid " + toString(medoid) + "\n"); }
86 for (int i = 0; i < numSamples; i++) {
87 if (Dp[i][0] > dists[i][medoid]) { Dp[i][0] = dists[i][medoid]; }
90 if (m->debug) { m->mothurOut("[DEBUG]: done building medoids\n"); }
94 m->errorOut(e, "Pam", "buildPhase");
98 /**************************************************************************************************/
99 //goal to swap medoids with non-medoids to see if we can reduce the overall cost
100 int Pam::swapPhase() {
102 if (m->debug) { m->mothurOut("[DEBUG]: swapping medoids\n"); }
103 //calculate cost of initial choice - average distance of samples to their closest medoid
106 for (int i = 0; i < numSamples; i++) { sky += Dp[i][0]; } //sky /= (double) numSamples;
109 int hbest, nbest; hbest = -1; nbest = -1;
111 if (m->control_pressed) { break; }
117 for (int h = 0; h < numSamples; h++) {
118 if (m->control_pressed) { break; }
119 if (medoids.count(h) == 0) { //this is NOT a medoid
120 for (int i = 0; i < numSamples; i++) {
121 if (medoids.count(i) != 0) { //this is a medoid
123 double dz = 0.0; //Tih sum of distances between objects and closest medoid caused by swapping i and h. Basically the change in cost. If this < 0 its a "good" swap. When all Tih are > 0, then we stop the algo, because we have the optimal medoids.
124 for (int j = 0; j < numSamples; j++) {
125 if (m->control_pressed) { break; }
126 if (dists[i][j] == Dp[j][0]) {
128 if (Dp[j][1] > dists[h][j]) { small = dists[h][j]; }
129 else { small = Dp[j][1]; }
130 dz += (- Dp[j][0]+ small);
131 }else if (dists[h][j] < Dp[j][0]) {
132 dz += (- Dp[j][0] + dists[h][j]);
145 if (dzsky < -16 *DBL_EPSILON * fabs(sky)) {
146 medoids.insert(hbest);
147 medoids.erase(nbest);
148 if (m->debug) { m->mothurOut("[DEBUG]: swapping " + toString(hbest) + " " + toString(nbest) + "\n"); }
150 }else { done = true; } //stop algo.
156 vector<int> tempMedoids;
157 for (set<int>::iterator it = medoids.begin(); it != medoids.end(); it++) {
158 medoid2Partition[*it] = count;
159 zMatrix[count][*it] = 1; count++; //set medoid in this partition.
160 tempMedoids.push_back(*it);
163 //which partition do you belong to?
165 for (int i = 0; i < numSamples; i++) {
167 double dist = dists[i][tempMedoids[0]]; //assign to first medoid
168 for (int j = 1; j < tempMedoids.size(); j++) {
169 if (dists[i][tempMedoids[j]] < dist) { //is this medoid closer?
170 dist = dists[i][tempMedoids[j]];
174 zMatrix[partition][i] = 1;
177 laplace /= (double) numSamples;
180 for(int i=0;i<numPartitions;i++){
181 m->mothurOut("[DEBUG]: partition 1: ");
182 for (int j = 0; j < numSamples; j++) {
183 m->mothurOut(toString(zMatrix[i][j]) + " ");
187 m->mothurOut("[DEBUG]: medoids : ");
188 for (set<int>::iterator it = medoids.begin(); it != medoids.end(); it++) { m->mothurOut(toString(*it) + " ");
192 m->mothurOut("[DEBUG]: laplace : " + toString(laplace)); m->mothurOut("\n");
195 if (m->debug) { m->mothurOut("[DEBUG]: done swapping medoids\n"); }
198 catch(exception& e) {
199 m->errorOut(e, "Pam", "swapPhase");
204 /**************************************************************************************************/
205 int Pam::updateDp() {
207 for (int j = 0; j < numSamples; j++) {
208 if (m->control_pressed) { break; }
210 //initialize dp and ep
211 Dp[j][0] = largestDist; Dp[j][1] = largestDist;
213 for (int i = 0; i < numSamples; i++) {
214 if (medoids.count(i) != 0) { //is this a medoid?
215 if (Dp[j][0] > dists[j][i]) {
217 Dp[j][0] = dists[j][i];
218 }else if (Dp[j][1] > dists[j][i]) {
219 Dp[j][1] = dists[j][i];
227 catch(exception& e) {
228 m->errorOut(e, "Pam", "updateDp");
233 /**************************************************************************************************/
234 /*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as:
236 CHk=Bk/(k−1)/Wk/(n−k)
238 where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/
239 //based on R index.G1.r function
240 double Pam::calcCHIndex(vector< vector<double> > dists){ //countMatrix = [numSamples][numOtus]
244 if (numPartitions < 2) { return CH; }
246 map<int, int> clusterMap; //map sample to partition
247 for (int i = 0; i < numPartitions; i++) {
248 for (int j = 0; j < numSamples; j++) {
249 if (m->control_pressed) { return 0.0; }
250 if (zMatrix[i][j] != 0) { clusterMap[j] = i; }
254 //make countMatrix a relabund
255 vector<vector<double> > relativeAbundance(numSamples); //[numSamples][numOTUs]
256 //get relative abundance
257 for(int i=0;i<numSamples;i++){
258 if (m->control_pressed) { return 0; }
261 relativeAbundance[i].assign(numOTUs, 0.0);
263 for(int j=0;j<numOTUs;j++){
264 groupTotal += countMatrix[i][j];
266 for(int j=0;j<numOTUs;j++){
267 relativeAbundance[i][j] = countMatrix[i][j] / (double)groupTotal;
272 vector<vector<double> > centers; centers.resize(numPartitions);
273 int countPartitions = 0;
274 for (set<int>::iterator it = medoids.begin(); it != medoids.end(); it++) {
275 for (int j = 0; j < numOTUs; j++) {
276 centers[countPartitions].push_back(relativeAbundance[*it][j]); //save the relative abundance of the medoid for this partition for this OTU
282 //centers = calcCenters(dists, clusterMap, relativeAbundance);
284 double allMeanDist = rMedoid(relativeAbundance, dists);
286 if (m->debug) { m->mothurOut("[DEBUG]: allMeandDist = " + toString(allMeanDist) + "\n"); }
288 for (int i = 0; i < relativeAbundance.size(); i++) {//numSamples
289 for (int j = 0; j < relativeAbundance[i].size(); j++) { //numOtus
290 if (m->control_pressed) { return 0; }
291 //x <- (x - centers[cl, ])^2
292 relativeAbundance[i][j] = ((relativeAbundance[i][j] - centers[clusterMap[i]][j])*(relativeAbundance[i][j] - centers[clusterMap[i]][j]));
297 for (int j = 0; j < numOTUs; j++) {
298 for(int i=0;i<numSamples;i++){
299 if (m->control_pressed) { return 0.0; }
300 wgss += relativeAbundance[i][j];
304 double bgss = allMeanDist - wgss;
306 CH = (bgss / (double)(numPartitions - 1)) / (wgss / (double) (numSamples - numPartitions));
311 m->errorOut(e, "Pam", "calcCHIndex");
316 /**************************************************************************************************/