working on get.communitytype command

author Sarah Westcott <mothur.westcott@gmail.com>

Wed, 29 Jan 2014 21:03:17 +0000 (16:03 -0500)

committer Sarah Westcott <mothur.westcott@gmail.com>

Wed, 29 Jan 2014 21:03:17 +0000 (16:03 -0500)
author Sarah Westcott <mothur.westcott@gmail.com>
Wed, 29 Jan 2014 21:03:17 +0000 (16:03 -0500)
committer Sarah Westcott <mothur.westcott@gmail.com>
Wed, 29 Jan 2014 21:03:17 +0000 (16:03 -0500)
diff --git a/communitytype.cpp b/communitytype.cpp

index f88932c469db10ef4e7ead4984319c828f2210da..aada4906860ad32da9f58cfd6ef64b47da5ee164 100644 (file)
--- a/communitytype.cpp
+++ b/communitytype.cpp
@@ -194,6 +194,8 @@ void CommunityTypeFinder::printRelAbund(string fileName, vector<string> otuNames
         }
  }
  
         }
  }
  
+
+
  /**************************************************************************************************/
  
  vector<vector<double> > CommunityTypeFinder::getHessian(){
  /**************************************************************************************************/
  
  vector<vector<double> > CommunityTypeFinder::getHessian(){
@@ -402,8 +404,23 @@ int CommunityTypeFinder::findkMeans(){
              zMatrix[i].assign(numSamples, 0);
          }
          
              zMatrix[i].assign(numSamples, 0);
          }
          
-        for(int i=0;i<numSamples;i++){
-            zMatrix[rand()%numPartitions][i] = 1;
+        //randomize samples
+        vector<int> temp;
+        for (int i = 0; i < numSamples; i++) { temp.push_back(i); }
+        random_shuffle(temp.begin(), temp.end());
+        
+        //assign each partition at least one random sample
+        int numAssignedSamples = 0;
+        for (int i = 0; i < numPartitions; i++) {
+            zMatrix[i][temp[numAssignedSamples]] = 1;
+            numAssignedSamples++;
+        }
+        
+        //assign rest of samples to partitions
+        int count = 0;
+        for(int i=numAssignedSamples;i<numSamples;i++){
+            zMatrix[count%numPartitions][temp[i]] = 1;
+            count++;
          }
          
          double maxChange = 1;
          }
          
          double maxChange = 1;
@@ -436,6 +453,7 @@ int CommunityTypeFinder::findkMeans(){
                  
                  for(int j=0;j<numOTUs;j++){
                      averageRelativeAbundance[j] /= weights[i];
                  
                  for(int j=0;j<numOTUs;j++){
                      averageRelativeAbundance[j] /= weights[i];
+                    
                      double difference = averageRelativeAbundance[j] - alphaMatrix[i][j];
                      normChange += difference * difference;
                      alphaMatrix[i][j] = averageRelativeAbundance[j];
                      double difference = averageRelativeAbundance[j] - alphaMatrix[i][j];
                      normChange += difference * difference;
                      alphaMatrix[i][j] = averageRelativeAbundance[j];
@@ -498,7 +516,6 @@ int CommunityTypeFinder::findkMeans(){
                  }
              }
          }
                  }
              }
          }
-        
          return 0;
      }
      catch(exception& e){
          return 0;
      }
      catch(exception& e){
@@ -507,6 +524,289 @@ int CommunityTypeFinder::findkMeans(){
      }
  }
  
      }
  }
  
+/**************************************************************************************************/
+//based on r function .medoid
+//results is length numOTUs and holds the distances from x of the sample in d with the min sum of distances to all other samples.
+//Basically the "best" medoid.
+//returns the sum of the distances squared
+double CommunityTypeFinder::rMedoid(vector< vector<double> > x, vector< vector<double> > d){
+    try {
+        vector<double> results; results.resize(numOTUs, 0.0);
+        double minSumDist = 1e6;
+        int minGroup = -1;
+        
+        for (int i = 0; i < d.size(); i++) {
+            if (m->control_pressed) { break; }
+            
+            double thisSum = 0.0;
+            for (int j = 0; j < d[i].size(); j++) { thisSum += d[i][j];  }
+            if (thisSum < minSumDist) {
+                minSumDist = thisSum;
+                minGroup = i;
+            }
+        }
+        
+        if (minGroup != -1) {
+            for (int i = 0; i < numOTUs; i++) {  results[i] = x[minGroup][i];  } //save minGroups relativeAbundance for each OTU
+        }else { m->mothurOut("[ERROR]: unable to find rMedoid group.\n"); m->control_pressed = true; }
+        
+        
+        double allMeanDist = 0.0;
+        for (int i = 0; i < x.size(); i++) { //numSamples
+            for (int j = 0; j < x[i].size(); j++) { //numOTus
+                if (m->control_pressed) { break; }
+                allMeanDist += ((x[i][j]-results[j])*(x[i][j]-results[j])); //(otuX sampleY - otuX bestMedoid)^2
+                
+            }
+        }
+        return allMeanDist;
+    }
+    catch(exception& e){
+        m->errorOut(e, "CommunityTypeFinder", "rMedoid");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
  
  
+/*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as:
+ 
+ CHk=Bk/(k−1)/Wk/(n−k)
+ 
+ where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/
+double CommunityTypeFinder::calcCHIndex(vector< vector< double> > dists){
+    try {
+        double CH = 0.0;
+        
+        if (numPartitions < 2) { return CH; }
+        
+        map<int, int> clusterMap; //map sample to partition
+        for (int j = 0; j < numSamples; j++) {
+            double maxValue = -1e6;
+            for (int i = 0; i < numPartitions; i++) {
+                if (m->control_pressed) { return 0.0; }
+                if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be
+                    clusterMap[j] = i;
+                    maxValue = zMatrix[i][j];
+                }
+            }
+        }
+        
+        //make countMatrix a relabund
+        vector<vector<double> > relativeAbundance(numSamples); //[numSamples][numOTUs]
+        //get relative abundance
+        for(int i=0;i<numSamples;i++){
+            if (m->control_pressed) {  return 0; }
+            int groupTotal = 0;
+            
+            relativeAbundance[i].assign(numOTUs, 0.0);
+            
+            for(int j=0;j<numOTUs;j++){
+                groupTotal += countMatrix[i][j];
+            }
+            for(int j=0;j<numOTUs;j++){
+                relativeAbundance[i][j] = countMatrix[i][j] / (double)groupTotal;
+            }
+        }
+        
+        //find centers
+        vector<vector<double> > centers = calcCenters(dists, clusterMap, relativeAbundance);
+        
+        if (m->control_pressed) { return 0.0; }
+        
+        double allMeanDist = rMedoid(relativeAbundance, dists);
+        
+        if (m->debug) { m->mothurOut("[DEBUG]: allMeandDist = " + toString(allMeanDist) + "\n"); }
+        
+        for (int i = 0; i < relativeAbundance.size(); i++) {//numSamples
+            for (int j = 0; j < relativeAbundance[i].size(); j++) { //numOtus
+                if (m->control_pressed) {  return 0; }
+                //x <- (x - centers[cl, ])^2
+                relativeAbundance[i][j] = ((relativeAbundance[i][j] - centers[clusterMap[i]][j])*(relativeAbundance[i][j] - centers[clusterMap[i]][j]));
+            }
+        }
+        
+        double wgss = 0.0;
+        for (int j = 0; j < numOTUs; j++) {
+            for(int i=0;i<numSamples;i++){
+                if (m->control_pressed) { return 0.0; }
+                wgss += relativeAbundance[i][j];
+            }
+        }
+        
+        double bgss = allMeanDist - wgss;
+        
+        CH = (bgss / (double)(numPartitions - 1)) / (wgss / (double) (numSamples - numPartitions));
+        
+        return CH;
+    }
+    catch(exception& e){
+        m->errorOut(e, "CommunityTypeFinder", "calcCHIndex");
+        exit(1);
+    }
+}
+
+
+/**************************************************************************************************/
+vector<vector<double> > CommunityTypeFinder::calcCenters(vector<vector<double> >& dists, map<int, int> clusterMap, vector<vector<double> >& relativeAbundance) { //[numsamples][numsamples]
+    try {
+        //for each partition
+        //choose sample with smallest sum of squared dists
+        //       cout << "Here" << clusterMap.size() << endl;
+        //       for(map<int, int>::iterator it = clusterMap.begin(); it != clusterMap.end(); it++) { cout << it->first << '\t' << it->second <<endl; }
+        vector<vector<double> > centers; centers.resize(numPartitions);
+        vector<double>  sums;  sums.resize(numSamples, 0.0);
+        map<int, vector<int> > partition2Samples; //maps partitions to samples in the partition
+        map<int, vector<int> >::iterator it;
+        
+        for (int i = 0; i < numSamples; i++) {
+            int partitionI = clusterMap[i];
+            
+            //add this sample to list of samples in this partition for access later
+            it = partition2Samples.find(partitionI);
+            if (it == partition2Samples.end()) {
+                vector<int> temp; temp.push_back(i);
+                partition2Samples[partitionI] = temp;
+            }else {  partition2Samples[partitionI].push_back(i); }
+            
+            for (int j = 0; j < numSamples; j++) {
+                
+                int partitionJ = clusterMap[j];
+                
+                if (partitionI == partitionJ) { //if you are a distance between samples in the same cluster
+                    sums[i] += dists[i][j];
+                    sums[j] += dists[i][j];
+                }else{}//we dont' care about distance between clusters
+            }
+        }
+        
+        vector<int> medoidsVector; medoidsVector.resize(numPartitions, -1);
+        for (it = partition2Samples.begin(); it != partition2Samples.end(); it++) { //for each partition look for sample with smallest squared
+            
+            //sum dist to all other samples in cluster
+            vector<int> members = it->second;
+            double minSumDist = 1e6;
+            for (int i = 0; i < members.size(); i++) {
+                if (m->control_pressed) { return centers; }
+                if (sums[members[i]] < minSumDist) {
+                    minSumDist = sums[members[i]];
+                    medoidsVector[it->first] = members[i];
+                }
+            }
+            
+        }
+        
+        set<int> medoids;
+        for (int i = 0; i < medoidsVector.size(); i++) {
+            medoids.insert(medoidsVector[i]);
+        }
+        
+        int countPartitions = 0;
+        for (set<int>::iterator it = medoids.begin(); it != medoids.end(); it++) {
+            for (int j = 0; j < numOTUs; j++) {
+                centers[countPartitions].push_back(relativeAbundance[*it][j]); //save the relative abundance of the medoid for this partition for this OTU
+            }
+            countPartitions++;
+        }
+        
+        return centers;
+    }
+    catch(exception& e){
+        m->errorOut(e, "CommunityTypeFinder", "calcCenters");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+//The silhouette width S(i)of individual data points i is calculated using the following formula:
+/*
+ s(i) = b(i) - a(i)
+ -----------
+ max(b(i),a(i))
+ where a(i) is the average dissimilarity (or distance) of sample i to all other samples in the same cluster, while b(i) is the average dissimilarity (or distance) to all objects in the closest other cluster.
+ 
+ The formula implies -1 =< S(i) =< 1 . A sample which is much closer to its own cluster than to any other cluster has a high S(i) value, while S(i) close to 0 implies that the given sample lies somewhere between two clusters. Large negative S(i) values indicate that the sample was assigned to the wrong cluster.
+ */
+//based on silouette.r which calls sildist.c written by Francois Romain
+vector<double> CommunityTypeFinder::calcSilhouettes(vector<vector<double> > dists) {
+    try {
+        vector<double> silhouettes; silhouettes.resize(numSamples, 0.0);
+        if (numPartitions < 2) { return silhouettes; }
+        
+        
+        map<int, int> clusterMap; //map sample to partition
+        for (int j = 0; j < numSamples; j++) {
+            double maxValue = 0.0;
+            for (int i = 0; i < numPartitions; i++) {
+                if (m->control_pressed) { return silhouettes; }
+                if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be
+                    clusterMap[j] = i;
+                    maxValue = zMatrix[i][j];
+                }
+            }
+        }
+        
+        //count number of samples in each partition
+        vector<int> counts; counts.resize(numPartitions, 0);
+        vector<double> DiC; DiC.resize((numPartitions*numSamples), 0.0);
+        bool computeSi = true;
+        
+        for (int i = 0; i < numSamples; i++) {
+            int partitionI = clusterMap[i];
+            counts[partitionI]++;
+            
+            for (int j = i+1; j < numSamples; j++) {
+                if (m->control_pressed) { return silhouettes; }
+                int partitionJ = clusterMap[j];
+                
+                DiC[numPartitions*i+partitionJ] += dists[i][j];
+                DiC[numPartitions*j+partitionI] += dists[i][j];
+            }
+        }
+        
+        vector<int> neighbor; neighbor.resize(numSamples, -1);
+        for (int i = 0; i < numSamples; i++) {
+            if (m->control_pressed) { return silhouettes; }
+            int ki = numPartitions*i;
+            int partitionI = clusterMap[i];
+            computeSi = true;
+            
+            for (int j = 0; j < numPartitions; j++) {
+                if (j == partitionI) {
+                    if (counts[j] == 1) { //only one sample in cluster
+                        computeSi = false;
+                    }else { DiC[ki+j] /= (counts[j]-1); }
+                }else{
+                    DiC[ki+j] /= counts[j];
+                }
+            }
+            
+            double ai = DiC[ki+partitionI];
+            
+            double bi = 0.0;
+            if (partitionI == 0) {  bi = DiC[ki+1]; neighbor[i] = 2; }
+            else {  bi =  DiC[ki]; neighbor[i] = 1; }
+            
+            for (int j = 1; j < numPartitions; j++) {
+                if (j != partitionI) {
+                    if (bi > DiC[ki+j]) {
+                        bi = DiC[ki + j];
+                        neighbor[i] = j+1;
+                    }
+                }
+            }
+            
+            silhouettes[i] = 0.0;
+            if (computeSi && bi != ai) {
+                silhouettes[i] = (bi-ai) / (max(ai, bi));
+            }
+        }
+        
+        return silhouettes;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CommunityTypeFinder", "calcSilhouettes");
+        exit(1);
+    }
+}
  /**************************************************************************************************/
  
  /**************************************************************************************************/
  
diff --git a/communitytype.h b/communitytype.h

index 59291f5444f8e00abd4f55a9986f853b285fe8b5..606768919bd809c63d7f720dcd832826b801b849 100644 (file)
--- a/communitytype.h
+++ b/communitytype.h
@@ -35,8 +35,8 @@ public:
      virtual double getLogDet()  {    return logDeterminant; }
      virtual double getLaplace() {    return laplace;        }
      
      virtual double getLogDet()  {    return logDeterminant; }
      virtual double getLaplace() {    return laplace;        }
      
-    virtual double calcCHIndex(vector< vector< double> >) {return 0;}  //Calinski-Harabasz
-    virtual vector<double> calcSilhouettes(vector< vector< double> >) {  vector<double> s; return s; } //if none provided by child class
+    virtual double calcCHIndex(vector< vector< double> >); //Calinski-Harabasz
+    virtual vector<double> calcSilhouettes(vector< vector< double> >); 
  
  
  protected:
  
  
  protected:
@@ -46,6 +46,8 @@ protected:
      double psi1(double);
      double psi(double);
      double cheb_eval(const double[], int, double);
      double psi1(double);
      double psi(double);
      double cheb_eval(const double[], int, double);
+    double rMedoid(vector< vector<double> > x, vector< vector<double> > d);
+    vector<vector<double> > calcCenters(vector<vector<double> >&, map<int, int>, vector<vector<double> >&);
  
      
         MothurOut* m;
  
      
         MothurOut* m;
diff --git a/getmetacommunitycommand.cpp b/getmetacommunitycommand.cpp

index de59edf5204e20ab3c3541b767e50b91cbe5df64..2b944153a92e5c919a0e2269916f0ffbbc9d2395 100644 (file)
--- a/getmetacommunitycommand.cpp
+++ b/getmetacommunitycommand.cpp
@@ -611,6 +611,12 @@ int GetMetaCommunityCommand::processDriver(vector<SharedRAbundVector*>& thislook
                  finder = new qFinderDMM(sharedMatrix, numPartitions);
              }
              
                  finder = new qFinderDMM(sharedMatrix, numPartitions);
              }
              
+            string relabund = relabunds[i];
+            string matrixName = matrix[i];
+            outputNames.push_back(matrixName); outputTypes["matrix"].push_back(matrixName);
+            
+            finder->printZMatrix(matrixName, thisGroups);
+            
              double chi; vector<double> silhouettes;
              if (method == "dmm") {
                  double laplace = finder->getLaplace();
              double chi; vector<double> silhouettes;
              if (method == "dmm") {
                  double laplace = finder->getLaplace();
@@ -627,11 +633,6 @@ int GetMetaCommunityCommand::processDriver(vector<SharedRAbundVector*>& thislook
                      minSilhouettes = silhouettes;
                  }
              }
                      minSilhouettes = silhouettes;
                  }
              }
-            string relabund = relabunds[i];
-            string matrixName = matrix[i];
-            outputNames.push_back(matrixName); outputTypes["matrix"].push_back(matrixName);
-            
-            finder->printZMatrix(matrixName, thisGroups);
              
              if (method == "dmm") {
                  finder->printFitData(cout, minLaplace);
              
              if (method == "dmm") {
                  finder->printFitData(cout, minLaplace);
diff --git a/kmeans.cpp b/kmeans.cpp

index dab5c7d6b76c8c95c97defbd4656a554ef33507a..ed4561ca2e3e116c0615be88275a7212598cf692 100644 (file)
--- a/kmeans.cpp
+++ b/kmeans.cpp
@@ -25,212 +25,6 @@ KMeans::KMeans(vector<vector<int> > cm, int p) : CommunityTypeFinder() {
         }
  }
  /**************************************************************************************************/
         }
  }
  /**************************************************************************************************/
-//The silhouette width S(i)of individual data points i is calculated using the following formula:
-/*
- s(i) = b(i) - a(i)
- -----------
- max(b(i),a(i))
- where a(i) is the average dissimilarity (or distance) of sample i to all other samples in the same cluster, while b(i) is the average dissimilarity (or distance) to all objects in the closest other cluster.
- 
- The formula implies -1 =< S(i) =< 1 . A sample which is much closer to its own cluster than to any other cluster has a high S(i) value, while S(i) close to 0 implies that the given sample lies somewhere between two clusters. Large negative S(i) values indicate that the sample was assigned to the wrong cluster.
- */
-
-vector<double> KMeans::calcSilhouettes(vector<vector<double> > dists) {
-    try {
-        vector<double> silhouettes; silhouettes.resize(numSamples, 0.0);
-        if (numPartitions < 2) { return silhouettes; }
-        
-        map<int, int> clusterMap; //map sample to partition
-        for (int j = 0; j < numSamples; j++) {
-            double maxValue = 0.0;
-            for (int i = 0; i < numPartitions; i++) {
-                if (m->control_pressed) { return silhouettes; }
-                if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be
-                    clusterMap[j] = i;
-                    maxValue = zMatrix[i][j];
-                }
-            }
-        }
-        
-        vector<int> nextClosestPartition;
-        findSecondClosest(nextClosestPartition, dists, clusterMap);
-        
-        if (m->control_pressed) { return silhouettes; }
-        
-        vector<double> a, b; a.resize(numSamples, 0.0); b.resize(numSamples, 0.0);
-        
-        //calc a - all a[i] are the same in the same partition
-        for (int k = 0; k < numPartitions; k++) {
-            if (m->control_pressed) { break; }
-            
-            int count = 0;
-            double totalA = 0.0;
-            for (int i = 0; i < numSamples; i++) {
-                for (int j = 0; j < numSamples; j++) {
-                    if (m->control_pressed) { break; }
-                    if ((clusterMap[i] == k) && (clusterMap[j] == k)){ //are both samples in the partition, if so add there distance
-                        totalA += dists[j][i]; //distance from this sample to the other samples in the partition
-                        count++;
-                    }
-                }
-            }
-            totalA /= (double) count;
-            
-            //set a[i] to average for cluster
-            for (int i = 0; i < numSamples; i++) {
-                if (clusterMap[i] == k) { a[i] = totalA; }
-            }
-        }
-        
-        //calc b
-        for (int i = 0; i < numSamples; i++) {
-            if (m->control_pressed) { break; }
-            
-            int thisPartition = nextClosestPartition[i];
-            int count = 0;
-            double totalB = 0.0;
-            for (int j = 0; j < numSamples; j++) {
-                if (clusterMap[j] == thisPartition) { //this sample is in this partition
-                    totalB += dists[i][j];
-                    count++;
-                }
-            }
-            b[i] = totalB / (double) count;
-        }
-        
-        //calc silhouettes
-        for (int i = 0; i < numSamples; i++) {
-            if (m->control_pressed) { break; }
-            
-            double denom = a[i];
-            if (b[i] > denom) { denom = b[i]; } //max(a[i],b[i])
-            
-            silhouettes[i] = (b[i] - a[i]) / denom;
-            
-            //cout << "silhouettes " << i << '\t' << silhouettes[i] << endl;
-        }
-        
-        return silhouettes;
-    }
-    catch(exception& e) {
-        m->errorOut(e, "KMeans", "calcSilhouettes");
-        exit(1);
-    }
-}
-/**************************************************************************************************/
-int KMeans::findSecondClosest(vector<int>& nextClosestPartition, vector<vector<double> >& dists, map<int, int> clusterMap) {
-    try {
-        vector<double> minScores; minScores.resize(numSamples, 1e6);
-        nextClosestPartition.resize(numSamples, 0);
-        
-        
-        for (int i = 0; i < numSamples; i++) {
-            for (int j = 0; j < numPartitions; j++) {
-                if (m->control_pressed) { break; }
-                
-                //is this the one we are assigned to - ie the "best" cluster. We want second best.
-                //if numPartitions = 2, then special case??
-                if (clusterMap[i] != j) {
-                    double score = 1e6;
-                    if (numPartitions == 2) {
-                        score = 0.0;  //choose other option, there are only 2.
-                    }else{   score = calcScore(i, j, dists, clusterMap); }
-                
-                    if (m->debug) { m->mothurOut("[DEBUG]: sample = " + toString(i) + " partition = " + toString(j) + " score = " + toString(score) + "\n"); }
-                    
-                    //is this better than our last find
-                    if (score < minScores[i]) {
-                        minScores[i] = score;
-                        nextClosestPartition[i] = j;
-                    }
-                }else {} //best case, ignore
-            }
-        }
-        
-        return 0;
-    }
-    catch(exception& e) {
-        m->errorOut(e, "KMeans", "findSecondClosest");
-        exit(1);
-    }
-}
-/**************************************************************************************************/
-double KMeans::calcScore(int sample, int partition, vector<vector<double> >& dists, map<int, int> clusterMap) {
-    try {
-        //square the distances and then for each pair of clusters, calculate the sum of the squraed distances between the clusters
-        //then with the sum of hte squared dsitances take the square root and divide by the number of distances in the sum
-        
-        double sum = 0.0; int count = 0;
-        for (int i = 0; i < numSamples; i++) {
-            if (m->control_pressed) { break; }
-            if (clusterMap[i] == partition) { //samples in this cluster
-                sum += (dists[sample][i] * dists[sample][i]);
-                count++;
-            }
-        }
-        
-        sum = sqrt(sum);
-        sum /= (double) count;
-                
-        return sum;
-    }
-    catch(exception& e) {
-        m->errorOut(e, "KMeans", "calcScore");
-        exit(1);
-    }
-}
-/**************************************************************************************************/
-/*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as:
- 
- CHk=Bk/(k−1)/Wk/(n−k)
- 
- where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/
-double KMeans::calcCHIndex(vector< vector< double> > dists){
-    try {
-        double CH = 0.0;
-        
-        if (numPartitions < 2) { return CH; }
-        
-        map<int, int> clusterMap; //map sample to partition
-        for (int j = 0; j < numSamples; j++) {
-            double maxValue = 0.0;
-            for (int i = 0; i < numPartitions; i++) {
-                if (m->control_pressed) { return 0.0; }
-                if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be
-                    clusterMap[j] = i;
-                    maxValue = zMatrix[i][j];
-                }
-            }
-        }
-        
-        double sumBetweenCluster = 0.0;
-        double sumWithinClusters = 0.0;
-        
-        for (int i = 0; i < numSamples; i++) { //lt
-            for (int j = 0; j < i; j++) {
-                if (m->control_pressed) { return 0.0; }
-                int partitionI = clusterMap[i];
-                int partitionJ = clusterMap[j];
-                
-                if (partitionI == partitionJ) { //they are from the same cluster so this distance is added to Wk
-                    sumWithinClusters += (dists[i][j] * dists[i][j]);
-                }else { //they are NOT from the same cluster so this distance is added to Bk
-                    sumBetweenCluster += (dists[i][j] * dists[i][j]);
-                }
-            }
-        }
-        //cout << numPartitions << '\t' << sumWithinClusters << '\t' << sumBetweenCluster << '\t' <<  (numSamples - numPartitions) << endl;
-        
-        CH = (sumBetweenCluster / (double)(numPartitions - 1)) / (sumWithinClusters / (double) (numSamples - numPartitions));
-        
-        return CH;
-    }
-    catch(exception& e){
-        m->errorOut(e, "KMeans", "calcCHIndex");
-        exit(1);
-    }
-}
-/**************************************************************************************************/
  
  
  
  
  
  
diff --git a/kmeans.h b/kmeans.h

index 97dfb681f49d865708a044a06a5a46e91bae8f5c..eba72691423ae17139aede984600afd31ed8054e 100644 (file)
--- a/kmeans.h
+++ b/kmeans.h
@@ -17,13 +17,9 @@ class KMeans : public CommunityTypeFinder {
      
  public:
      KMeans(vector<vector<int> >, int);
      
  public:
      KMeans(vector<vector<int> >, int);
-    vector<double> calcSilhouettes(vector< vector< double> >);
-    double calcCHIndex(vector< vector< double> >);
      
  private:
  
      
  private:
  
-    int findSecondClosest(vector<int>&, vector<vector<double> >&, map<int, int>);
-    double calcScore(int sample, int partition, vector<vector<double> >&, map<int, int>);
  
  };
  
  
  };
  
diff --git a/pam.cpp b/pam.cpp

index bcc51fc7e87e93c6ed67b741b4e0335ea380327f..b09c2a09d1e4a5bcf867ceef7013f5eb898b6f53 100644 (file)
--- a/pam.cpp
+++ b/pam.cpp
@@ -229,102 +229,14 @@ int Pam::updateDp() {
          exit(1);
      }
  }
          exit(1);
      }
  }
-/**************************************************************************************************/
-//The silhouette width S(i)of individual data points i is calculated using the following formula:
-/*
- s(i) = b(i) - a(i)
- -----------
- max(b(i),a(i))
- where a(i) is the average dissimilarity (or distance) of sample i to all other samples in the same cluster, while b(i) is the average dissimilarity (or distance) to all objects in the closest other cluster.
- 
- The formula implies -1 =< S(i) =< 1 . A sample which is much closer to its own cluster than to any other cluster has a high S(i) value, while S(i) close to 0 implies that the given sample lies somewhere between two clusters. Large negative S(i) values indicate that the sample was assigned to the wrong cluster.
- */
  
  
-vector<double> Pam::calcSilhouettes(vector<vector<double> > dists) {
-    try {
-        vector<double> silhouettes; silhouettes.resize(numSamples, 0.0);
-        if (numPartitions < 2) { return silhouettes; }
-        
-        vector<int> whoDp;
-        for (int i = 0; i < numSamples; i++) { // assumes at least 2 partitions
-            vector<seqDist> tempMeds;
-            for (set<int>::iterator it = medoids.begin(); it != medoids.end(); it++) {
-                if (m->control_pressed) { break; }
-                seqDist temp(*it, *it, dists[i][*it]); //medoid, medoid, distance from medoid to sample
-                tempMeds.push_back(temp);
-            }
-            sort(tempMeds.begin(), tempMeds.end(), compareSequenceDistance); //sort by distance
-            whoDp.push_back(tempMeds[1].seq1); //second closest medoid
-        }
-        
-        
-        vector<double> a, b; a.resize(numSamples, 0.0); b.resize(numSamples, 0.0);
-        
-        //calc a - all a[i] are the same in the same partition
-        for (int k = 0; k < numPartitions; k++) {
-            if (m->control_pressed) { break; }
-            
-            int count = 0;
-            double totalA = 0.0;
-            for (int i = 0; i < numSamples; i++) {
-                for (int j = 0; j < numSamples; j++) {
-                    if (m->control_pressed) { break; }
-                    if ((zMatrix[k][i] != 0) && (zMatrix[k][j] != 0)){ //are both samples in the partition, if so add there distance
-                        totalA += dists[j][i]; //distance from this sample to the other samples in the partition
-                        count++;
-                    }
-                }
-            }
-            totalA /= (double) count;
-            
-            //set a[i] to average for cluster
-            for (int i = 0; i < numSamples; i++) {
-                if (zMatrix[k][i] != 0) { a[i] = totalA; }
-            }
-        }
-        
-        //calc b
-        for (int i = 0; i < numSamples; i++) {
-            if (m->control_pressed) { break; }
-            
-            int nextClosestMedoid = whoDp[i];
-            int thisPartition = medoid2Partition[nextClosestMedoid];
-            int count = 0;
-            double totalB = 0.0;
-            for (int j = 0; j < numSamples; j++) {
-                if (zMatrix[thisPartition][j] != 0) { //this sample is in this partition
-                    totalB += dists[i][j];
-                    count++;
-                }
-            }
-            b[i] = totalB / (double) count;
-        }
-        
-        //calc silhouettes
-        for (int i = 0; i < numSamples; i++) {
-            if (m->control_pressed) { break; }
-            
-            double denom = a[i];
-            if (b[i] > denom) { denom = b[i]; } //max(a[i],b[i])
-            
-            silhouettes[i] = (b[i] - a[i]) / denom;
-            
-            //cout << "silhouettes " << i << '\t' << silhouettes[i] << endl;
-        }
-               
-        return silhouettes;
-    }
-    catch(exception& e) {
-        m->errorOut(e, "Pam", "calcSilhouettes");
-        exit(1);
-    }
-}
  /**************************************************************************************************/
  /*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as:
   
   CHk=Bk/(k−1)/Wk/(n−k)
   
   where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/
  /**************************************************************************************************/
  /*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as:
   
   CHk=Bk/(k−1)/Wk/(n−k)
   
   where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/
+//based on R index.G1.r function
  double Pam::calcCHIndex(vector< vector<double> > dists){ //countMatrix = [numSamples][numOtus]
      try {
          double CH = 0.0;
  double Pam::calcCHIndex(vector< vector<double> > dists){ //countMatrix = [numSamples][numOtus]
      try {
          double CH = 0.0;
@@ -340,7 +252,7 @@ double Pam::calcCHIndex(vector< vector<double> > dists){ //countMatrix = [numSam
          }
          
          //make countMatrix a relabund
          }
          
          //make countMatrix a relabund
-        vector<vector<double> > relativeAbundance(numSamples);
+        vector<vector<double> > relativeAbundance(numSamples); //[numSamples][numOTUs]
          //get relative abundance
          for(int i=0;i<numSamples;i++){
              if (m->control_pressed) {  return 0; }
          //get relative abundance
          for(int i=0;i<numSamples;i++){
              if (m->control_pressed) {  return 0; }
@@ -366,25 +278,32 @@ double Pam::calcCHIndex(vector< vector<double> > dists){ //countMatrix = [numSam
              countPartitions++;
          }
          
              countPartitions++;
          }
          
-        double sumBetweenCluster = 0.0;
-        double sumWithinClusters = 0.0;
+        //centers.clear();
+        //centers = calcCenters(dists, clusterMap, relativeAbundance);
+        
+        double allMeanDist = rMedoid(relativeAbundance, dists);
+        
+        if (m->debug) { m->mothurOut("[DEBUG]: allMeandDist = " + toString(allMeanDist) + "\n"); }
+        
+        for (int i = 0; i < relativeAbundance.size(); i++) {//numSamples
+            for (int j = 0; j < relativeAbundance[i].size(); j++) { //numOtus
+                if (m->control_pressed) {  return 0; }
+                //x <- (x - centers[cl, ])^2
+                relativeAbundance[i][j] = ((relativeAbundance[i][j] - centers[clusterMap[i]][j])*(relativeAbundance[i][j] - centers[clusterMap[i]][j]));
+            }
+        }
          
          
-        for (int i = 0; i < numSamples; i++) { //lt
-            for (int j = 0; j < i; j++) {
+        double wgss = 0.0;
+        for (int j = 0; j < numOTUs; j++) {
+            for(int i=0;i<numSamples;i++){
                  if (m->control_pressed) { return 0.0; }
                  if (m->control_pressed) { return 0.0; }
-                int partitionI = clusterMap[i];
-                int partitionJ = clusterMap[j];
-                
-                if (partitionI == partitionJ) { //they are from the same cluster so this distance is added to Wk
-                    sumWithinClusters += (dists[i][j] * dists[i][j]);
-                }else { //they are NOT from the same cluster so this distance is added to Bk
-                    sumBetweenCluster += (dists[i][j] * dists[i][j]);
-                }
+                wgss += relativeAbundance[i][j];
              }
          }
              }
          }
-        //cout << numPartitions << '\t' << sumWithinClusters << '\t' << sumBetweenCluster << '\t' <<  (numSamples - numPartitions) << endl;
          
          
-        CH = (sumBetweenCluster / (double)(numPartitions - 1)) / (sumWithinClusters / (double) (numSamples - numPartitions));
+        double bgss = allMeanDist - wgss;
+        
+        CH = (bgss / (double)(numPartitions - 1)) / (wgss / (double) (numSamples - numPartitions));
          
          return CH;
      }
          
          return CH;
      }
@@ -397,4 +316,3 @@ double Pam::calcCHIndex(vector< vector<double> > dists){ //countMatrix = [numSam
  /**************************************************************************************************/
  
  
  /**************************************************************************************************/
  
  
-
diff --git a/pam.h b/pam.h

index 993017663a490702280d0d4e42854cc6b608ce30..ab5c68f922ccdbb1c8b4e30d19ab3d4efd2ddb3f 100644 (file)
--- a/pam.h
+++ b/pam.h
@@ -17,7 +17,6 @@ class Pam : public CommunityTypeFinder {
      
  public:
      Pam(vector<vector<int> >, vector<vector<double> >, int);
      
  public:
      Pam(vector<vector<int> >, vector<vector<double> >, int);
-    vector<double> calcSilhouettes(vector< vector< double> >);
      double calcCHIndex(vector< vector< double> >);
      
  private:
      double calcCHIndex(vector< vector< double> >);
      
  private:
author	Sarah Westcott <mothur.westcott@gmail.com>
	Wed, 29 Jan 2014 21:03:17 +0000 (16:03 -0500)
committer	Sarah Westcott <mothur.westcott@gmail.com>
	Wed, 29 Jan 2014 21:03:17 +0000 (16:03 -0500)
communitytype.cpp		patch \| blob \| history
communitytype.h		patch \| blob \| history
getmetacommunitycommand.cpp		patch \| blob \| history
kmeans.cpp		patch \| blob \| history
kmeans.h		patch \| blob \| history
pam.cpp		patch \| blob \| history
pam.h		patch \| blob \| history