fixes while testing 1.33.0

[mothur.git] / pam.cpp
diff --git a/pam.cpp b/pam.cpp

index 2c70f58f5770135d93e97dbe06a7392816b37197..98b2113238f099dfb188a2235e87be0c76d230f4 100644 (file)
--- a/pam.cpp
+++ b/pam.cpp
@@ -25,7 +25,7 @@ Pam::Pam(vector<vector<int> > c, vector<vector<double> > d, int p) : CommunityTy
                  if (dists[i][j] > largestDist) { largestDist = dists[i][j]; } 
              }
          }
-        
+       
          buildPhase(); //choosing the medoids
          swapPhase(); //optimize clusters
      }
@@ -35,6 +35,7 @@ Pam::Pam(vector<vector<int> > c, vector<vector<double> > d, int p) : CommunityTy
         }
  }
  /**************************************************************************************************/
+//build and swap functions based on pam.c by maechler from R cluster package
  //sets Dp[0] does not set Dp[1]. chooses intial medoids.
  int Pam::buildPhase() {
      try {
@@ -65,12 +66,11 @@ int Pam::buildPhase() {
                      gains[i] = 0.0;
                  
                      for (int j = 0; j < numSamples; j++) {
-                        //cout << i << '\t' << j << '\t' <<   Dp[i][0] << '\t' << dists[i][j] << '\t' << totalGain << endl;
-                        totalGain = Dp[i][0] - dists[i][j];
+                        totalGain = Dp[j][0] - dists[i][j];
                          if (totalGain > 0.0) { gains[i] += totalGain; }
                      }
                      if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) +  " totalGain = " + toString(totalGain) + "\n"); }
-                    
+                   
                      if (clusterGain <= gains[i]) {
                          clusterGain = gains[i];
                          medoid = i;
@@ -104,7 +104,7 @@ int Pam::swapPhase() {
          //calculate cost of initial choice - average distance of samples to their closest medoid
          double sky = 0.0;
          double dzsky = 1.0;
-        for (int i = 0; i < numSamples; i++) { sky += Dp[i][0]; }  sky /= (double) numSamples;
+        for (int i = 0; i < numSamples; i++) { sky += Dp[i][0]; }  //sky /= (double) numSamples;
          
          bool done = false;
          int hbest, nbest; hbest = -1; nbest = -1;
@@ -128,9 +128,9 @@ int Pam::swapPhase() {
                                      double small = 0.0;
                                      if (Dp[j][1] > dists[h][j]) {   small = dists[h][j];    }
                                      else                        {   small = Dp[j][1];       }
-                                    dz += (small - Dp[j][0]);
+                                    dz += (- Dp[j][0]+ small);
                                  }else if (dists[h][j] < Dp[j][0]) {
-                                    dz += (dists[h][j] - Dp[j][0]);
+                                    dz += (- Dp[j][0] + dists[h][j]);
                                  }
                              }
                              if (dzsky > dz) {
@@ -222,6 +222,7 @@ int Pam::updateDp() {
                  }
              }
          }
+    
          return 0;
      }
      catch(exception& e) {
@@ -229,102 +230,14 @@ int Pam::updateDp() {
          exit(1);
      }
  }
-/**************************************************************************************************/
-//The silhouette width S(i)of individual data points i is calculated using the following formula:
-/*
- s(i) = b(i) - a(i)
- -----------
- max(b(i),a(i))
- where a(i) is the average dissimilarity (or distance) of sample i to all other samples in the same cluster, while b(i) is the average dissimilarity (or distance) to all objects in the closest other cluster.
- 
- The formula implies -1 =< S(i) =< 1 . A sample which is much closer to its own cluster than to any other cluster has a high S(i) value, while S(i) close to 0 implies that the given sample lies somewhere between two clusters. Large negative S(i) values indicate that the sample was assigned to the wrong cluster.
- */
  
-vector<double> Pam::calcSilhouettes(vector<vector<double> > dists) {
-    try {
-        vector<double> silhouettes; silhouettes.resize(numSamples, 0.0);
-        if (numPartitions < 2) { return silhouettes; }
-        
-        vector<int> whoDp;
-        for (int i = 0; i < numSamples; i++) { // assumes at least 2 partitions
-            vector<seqDist> tempMeds;
-            for (set<int>::iterator it = medoids.begin(); it != medoids.end(); it++) {
-                if (m->control_pressed) { break; }
-                seqDist temp(*it, *it, dists[i][*it]); //medoid, medoid, distance from medoid to sample
-                tempMeds.push_back(temp);
-            }
-            sort(tempMeds.begin(), tempMeds.end(), compareSequenceDistance); //sort by distance
-            whoDp.push_back(tempMeds[1].seq1); //second closest medoid
-        }
-        
-        
-        vector<double> a, b; a.resize(numSamples, 0.0); b.resize(numSamples, 0.0);
-        
-        //calc a - all a[i] are the same in the same partition
-        for (int k = 0; k < numPartitions; k++) {
-            if (m->control_pressed) { break; }
-            
-            int count = 0;
-            double totalA = 0.0;
-            for (int i = 0; i < numSamples; i++) {
-                for (int j = 0; j < numSamples; j++) {
-                    if (m->control_pressed) { break; }
-                    if ((zMatrix[k][i] != 0) && (zMatrix[k][j] != 0)){ //are both samples in the partition, if so add there distance
-                        totalA += dists[j][i]; //distance from this sample to the other samples in the partition
-                        count++;
-                    }
-                }
-            }
-            totalA /= (double) count;
-            
-            //set a[i] to average for cluster
-            for (int i = 0; i < numSamples; i++) {
-                if (zMatrix[k][i] != 0) { a[i] = totalA; }
-            }
-        }
-        
-        //calc b
-        for (int i = 0; i < numSamples; i++) {
-            if (m->control_pressed) { break; }
-            
-            int nextClosestMedoid = whoDp[i];
-            int thisPartition = medoid2Partition[nextClosestMedoid];
-            int count = 0;
-            double totalB = 0.0;
-            for (int j = 0; j < numSamples; j++) {
-                if (zMatrix[thisPartition][j] != 0) { //this sample is in this partition
-                    totalB += dists[i][j];
-                    count++;
-                }
-            }
-            b[i] = totalB / (double) count;
-        }
-        
-        //calc silhouettes
-        for (int i = 0; i < numSamples; i++) {
-            if (m->control_pressed) { break; }
-            
-            double denom = a[i];
-            if (b[i] > denom) { denom = b[i]; } //max(a[i],b[i])
-            
-            silhouettes[i] = (b[i] - a[i]) / denom;
-            
-            //cout << "silhouettes " << i << '\t' << silhouettes[i] << endl;
-        }
-               
-        return silhouettes;
-    }
-    catch(exception& e) {
-        m->errorOut(e, "Pam", "calcSilhouettes");
-        exit(1);
-    }
-}
  /**************************************************************************************************/
  /*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as:
   
   CHk=Bk/(k−1)/Wk/(n−k)
   
   where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/
+//based on R index.G1.r function
  double Pam::calcCHIndex(vector< vector<double> > dists){ //countMatrix = [numSamples][numOtus]
      try {
          double CH = 0.0;
@@ -339,25 +252,59 @@ double Pam::calcCHIndex(vector< vector<double> > dists){ //countMatrix = [numSam
              }
          }
          
-        double sumBetweenCluster = 0.0;
-        double sumWithinClusters = 0.0;
+        //make countMatrix a relabund
+        vector<vector<double> > relativeAbundance(numSamples); //[numSamples][numOTUs]
+        //get relative abundance
+        for(int i=0;i<numSamples;i++){
+            if (m->control_pressed) {  return 0; }
+            int groupTotal = 0;
+            
+            relativeAbundance[i].assign(numOTUs, 0.0);
+            
+            for(int j=0;j<numOTUs;j++){
+                groupTotal += countMatrix[i][j];
+            }
+            for(int j=0;j<numOTUs;j++){
+                relativeAbundance[i][j] = countMatrix[i][j] / (double)groupTotal;
+            }
+        }
+        
+        //find centers
+        vector<vector<double> > centers; centers.resize(numPartitions);
+        int countPartitions = 0;
+        for (set<int>::iterator it = medoids.begin(); it != medoids.end(); it++) {
+            for (int j = 0; j < numOTUs; j++) {
+                centers[countPartitions].push_back(relativeAbundance[*it][j]); //save the relative abundance of the medoid for this partition for this OTU
+            }
+            countPartitions++;
+        }
+        
+        //centers.clear();
+        //centers = calcCenters(dists, clusterMap, relativeAbundance);
+        
+        double allMeanDist = rMedoid(relativeAbundance, dists);
+        
+        if (m->debug) { m->mothurOut("[DEBUG]: allMeandDist = " + toString(allMeanDist) + "\n"); }
          
-        for (int i = 0; i < numSamples; i++) { //lt
-            for (int j = 0; j < i; j++) {
+        for (int i = 0; i < relativeAbundance.size(); i++) {//numSamples
+            for (int j = 0; j < relativeAbundance[i].size(); j++) { //numOtus
+                if (m->control_pressed) {  return 0; }
+                //x <- (x - centers[cl, ])^2
+                relativeAbundance[i][j] = ((relativeAbundance[i][j] - centers[clusterMap[i]][j])*(relativeAbundance[i][j] - centers[clusterMap[i]][j]));
+            }
+        }
+        
+        double wgss = 0.0;
+        for (int j = 0; j < numOTUs; j++) {
+            for(int i=0;i<numSamples;i++){
                  if (m->control_pressed) { return 0.0; }
-                int partitionI = clusterMap[i];
-                int partitionJ = clusterMap[j];
-                
-                if (partitionI == partitionJ) { //they are from the same cluster so this distance is added to Wk
-                    sumWithinClusters += (dists[i][j] * dists[i][j]);
-                }else { //they are NOT from the same cluster so this distance is added to Bk
-                    sumBetweenCluster += (dists[i][j] * dists[i][j]);
-                }
+                wgss += relativeAbundance[i][j];
              }
          }
-        //cout << numPartitions << '\t' << sumWithinClusters << '\t' << sumBetweenCluster << '\t' <<  (numSamples - numPartitions) << endl;
          
-        CH = (sumBetweenCluster / (double)(numPartitions - 1)) / (sumWithinClusters / (double) (numSamples - numPartitions));
+        double bgss = allMeanDist - wgss;
+        
+        CH = (bgss / (double)(numPartitions - 1)) / (wgss / (double) (numSamples - numPartitions));
          
          return CH;
      }
@@ -370,4 +317,3 @@ double Pam::calcCHIndex(vector< vector<double> > dists){ //countMatrix = [numSam
  /**************************************************************************************************/
  
  
-