kmeans.cpp

   1 //
   2 //  kmeans.cpp
   3 //  Mothur
   4 //
   5 //  Created by SarahsWork on 12/4/13.
   6 //  Copyright (c) 2013 Schloss Lab. All rights reserved.
   7 //
   8
   9 #include "kmeans.h"
  10
  11 /**************************************************************************************************/
  12
  13 KMeans::KMeans(vector<vector<int> > cm, int p) : CommunityTypeFinder() {
  14     try {
  15         countMatrix = cm;
  16         numSamples = (int)countMatrix.size();
  17         numOTUs = (int)countMatrix[0].size();
  18         numPartitions = p;
  19
  20         findkMeans();
  21     }
  22         catch(exception& e) {
  23                 m->errorOut(e, "KMeans", "KMeans");
  24                 exit(1);
  25         }
  26 }
  27 /**************************************************************************************************/
  28 //The silhouette width S(i)of individual data points i is calculated using the following formula:
  29 /*
  30  s(i) = b(i) - a(i)
  31  -----------
  32  max(b(i),a(i))
  33  where a(i) is the average dissimilarity (or distance) of sample i to all other samples in the same cluster, while b(i) is the average dissimilarity (or distance) to all objects in the closest other cluster.
  34
  35  The formula implies -1 =< S(i) =< 1 . A sample which is much closer to its own cluster than to any other cluster has a high S(i) value, while S(i) close to 0 implies that the given sample lies somewhere between two clusters. Large negative S(i) values indicate that the sample was assigned to the wrong cluster.
  36  */
  37
  38 vector<double> KMeans::calcSilhouettes(vector<vector<double> > dists) {
  39     try {
  40         vector<double> silhouettes; silhouettes.resize(numSamples, 0.0);
  41         if (numPartitions < 2) { return silhouettes; }
  42
  43         map<int, int> clusterMap; //map sample to partition
  44         for (int j = 0; j < numSamples; j++) {
  45             double maxValue = 0.0;
  46             for (int i = 0; i < numPartitions; i++) {
  47                 if (m->control_pressed) { return silhouettes; }
  48                 if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be
  49                     clusterMap[j] = i;
  50                     maxValue = zMatrix[i][j];
  51                 }
  52             }
  53         }
  54
  55         vector<int> nextClosestPartition;
  56         findSecondClosest(nextClosestPartition, dists, clusterMap);
  57
  58         if (m->control_pressed) { return silhouettes; }
  59
  60         vector<double> a, b; a.resize(numSamples, 0.0); b.resize(numSamples, 0.0);
  61
  62         //calc a - all a[i] are the same in the same partition
  63         for (int k = 0; k < numPartitions; k++) {
  64             if (m->control_pressed) { break; }
  65
  66             int count = 0;
  67             double totalA = 0.0;
  68             for (int i = 0; i < numSamples; i++) {
  69                 for (int j = 0; j < numSamples; j++) {
  70                     if (m->control_pressed) { break; }
  71                     if ((clusterMap[i] == k) && (clusterMap[j] == k)){ //are both samples in the partition, if so add there distance
  72                         totalA += dists[j][i]; //distance from this sample to the other samples in the partition
  73                         count++;
  74                     }
  75                 }
  76             }
  77             totalA /= (double) count;
  78
  79             //set a[i] to average for cluster
  80             for (int i = 0; i < numSamples; i++) {
  81                 if (clusterMap[i] == k) { a[i] = totalA; }
  82             }
  83         }
  84
  85         //calc b
  86         for (int i = 0; i < numSamples; i++) {
  87             if (m->control_pressed) { break; }
  88
  89             int thisPartition = nextClosestPartition[i];
  90             int count = 0;
  91             double totalB = 0.0;
  92             for (int j = 0; j < numSamples; j++) {
  93                 if (clusterMap[j] == thisPartition) { //this sample is in this partition
  94                     totalB += dists[i][j];
  95                     count++;
  96                 }
  97             }
  98             b[i] = totalB / (double) count;
  99         }
 100
 101         //calc silhouettes
 102         for (int i = 0; i < numSamples; i++) {
 103             if (m->control_pressed) { break; }
 104
 105             double denom = a[i];
 106             if (b[i] > denom) { denom = b[i]; } //max(a[i],b[i])
 107
 108             silhouettes[i] = (b[i] - a[i]) / denom;
 109
 110             //cout << "silhouettes " << i << '\t' << silhouettes[i] << endl;
 111         }
 112
 113         return silhouettes;
 114     }
 115     catch(exception& e) {
 116         m->errorOut(e, "KMeans", "calcSilhouettes");
 117         exit(1);
 118     }
 119 }
 120 /**************************************************************************************************/
 121 int KMeans::findSecondClosest(vector<int>& nextClosestPartition, vector<vector<double> >& dists, map<int, int> clusterMap) {
 122     try {
 123         vector<double> minScores; minScores.resize(numSamples, 1e6);
 124         nextClosestPartition.resize(numSamples, 0);
 125
 126
 127         for (int i = 0; i < numSamples; i++) {
 128             for (int j = 0; j < numPartitions; j++) {
 129                 if (m->control_pressed) { break; }
 130
 131                 //is this the one we are assigned to - ie the "best" cluster. We want second best.
 132                 //if numPartitions = 2, then special case??
 133                 if (clusterMap[i] != j) {
 134                     double score = 1e6;
 135                     if (numPartitions == 2) {
 136                         score = 0.0;  //choose other option, there are only 2.
 137                     }else{   score = calcScore(i, j, dists, clusterMap); }
 138
 139                     if (m->debug) { m->mothurOut("[DEBUG]: sample = " + toString(i) + " partition = " + toString(j) + " score = " + toString(score) + "\n"); }
 140
 141                     //is this better than our last find
 142                     if (score < minScores[i]) {
 143                         minScores[i] = score;
 144                         nextClosestPartition[i] = j;
 145                     }
 146                 }else {} //best case, ignore
 147             }
 148         }
 149
 150         return 0;
 151     }
 152     catch(exception& e) {
 153         m->errorOut(e, "KMeans", "findSecondClosest");
 154         exit(1);
 155     }
 156 }
 157 /**************************************************************************************************/
 158 double KMeans::calcScore(int sample, int partition, vector<vector<double> >& dists, map<int, int> clusterMap) {
 159     try {
 160         //square the distances and then for each pair of clusters, calculate the sum of the squraed distances between the clusters
 161         //then with the sum of hte squared dsitances take the square root and divide by the number of distances in the sum
 162
 163         double sum = 0.0; int count = 0;
 164         for (int i = 0; i < numSamples; i++) {
 165             if (m->control_pressed) { break; }
 166             if (clusterMap[i] == partition) { //samples in this cluster
 167                 sum += (dists[sample][i] * dists[sample][i]);
 168                 count++;
 169             }
 170         }
 171
 172         sum = sqrt(sum);
 173         sum /= (double) count;
 174
 175         return sum;
 176     }
 177     catch(exception& e) {
 178         m->errorOut(e, "KMeans", "calcScore");
 179         exit(1);
 180     }
 181 }
 182 /**************************************************************************************************/
 183 /*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as:
 184
 185  CHk=Bk/(k−1)/Wk/(n−k)
 186
 187  where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/
 188 double KMeans::calcCHIndex(vector< vector< double> > dists){
 189     try {
 190         double CH = 0.0;
 191
 192         if (numPartitions < 2) { return CH; }
 193
 194         map<int, int> clusterMap; //map sample to partition
 195         for (int j = 0; j < numSamples; j++) {
 196             double maxValue = 0.0;
 197             for (int i = 0; i < numPartitions; i++) {
 198                 if (m->control_pressed) { return 0.0; }
 199                 if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be
 200                     clusterMap[j] = i;
 201                     maxValue = zMatrix[i][j];
 202                 }
 203             }
 204         }
 205
 206         double sumBetweenCluster = 0.0;
 207         double sumWithinClusters = 0.0;
 208
 209         for (int i = 0; i < numSamples; i++) { //lt
 210             for (int j = 0; j < i; j++) {
 211                 if (m->control_pressed) { return 0.0; }
 212                 int partitionI = clusterMap[i];
 213                 int partitionJ = clusterMap[j];
 214
 215                 if (partitionI == partitionJ) { //they are from the same cluster so this distance is added to Wk
 216                     sumWithinClusters += (dists[i][j] * dists[i][j]);
 217                 }else { //they are NOT from the same cluster so this distance is added to Bk
 218                     sumBetweenCluster += (dists[i][j] * dists[i][j]);
 219                 }
 220             }
 221         }
 222         //cout << numPartitions << '\t' << sumWithinClusters << '\t' << sumBetweenCluster << '\t' <<  (numSamples - numPartitions) << endl;
 223
 224         CH = (sumBetweenCluster / (double)(numPartitions - 1)) / (sumWithinClusters / (double) (numSamples - numPartitions));
 225
 226         return CH;
 227     }
 228     catch(exception& e){
 229         m->errorOut(e, "KMeans", "calcCHIndex");
 230         exit(1);
 231     }
 232 }
 233 /**************************************************************************************************/
 234
 235
 236
 237