From: Sarah Westcott Date: Wed, 29 Jan 2014 21:03:17 +0000 (-0500) Subject: working on get.communitytype command X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=250e3b11b1c9c1e1ad458ab6c7e71ac2e67e11d9 working on get.communitytype command --- diff --git a/communitytype.cpp b/communitytype.cpp index f88932c..aada490 100644 --- a/communitytype.cpp +++ b/communitytype.cpp @@ -194,6 +194,8 @@ void CommunityTypeFinder::printRelAbund(string fileName, vector otuNames } } + + /**************************************************************************************************/ vector > CommunityTypeFinder::getHessian(){ @@ -402,8 +404,23 @@ int CommunityTypeFinder::findkMeans(){ zMatrix[i].assign(numSamples, 0); } - for(int i=0;i temp; + for (int i = 0; i < numSamples; i++) { temp.push_back(i); } + random_shuffle(temp.begin(), temp.end()); + + //assign each partition at least one random sample + int numAssignedSamples = 0; + for (int i = 0; i < numPartitions; i++) { + zMatrix[i][temp[numAssignedSamples]] = 1; + numAssignedSamples++; + } + + //assign rest of samples to partitions + int count = 0; + for(int i=numAssignedSamples;i > x, vector< vector > d){ + try { + vector results; results.resize(numOTUs, 0.0); + double minSumDist = 1e6; + int minGroup = -1; + + for (int i = 0; i < d.size(); i++) { + if (m->control_pressed) { break; } + + double thisSum = 0.0; + for (int j = 0; j < d[i].size(); j++) { thisSum += d[i][j]; } + if (thisSum < minSumDist) { + minSumDist = thisSum; + minGroup = i; + } + } + + if (minGroup != -1) { + for (int i = 0; i < numOTUs; i++) { results[i] = x[minGroup][i]; } //save minGroups relativeAbundance for each OTU + }else { m->mothurOut("[ERROR]: unable to find rMedoid group.\n"); m->control_pressed = true; } + + + double allMeanDist = 0.0; + for (int i = 0; i < x.size(); i++) { //numSamples + for (int j = 0; j < x[i].size(); j++) { //numOTus + if (m->control_pressed) { break; } + allMeanDist += ((x[i][j]-results[j])*(x[i][j]-results[j])); //(otuX sampleY - otuX bestMedoid)^2 + + } + } + return allMeanDist; + } + catch(exception& e){ + m->errorOut(e, "CommunityTypeFinder", "rMedoid"); + exit(1); + } +} +/**************************************************************************************************/ +/*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as: + + CHk=Bk/(k−1)/Wk/(n−k) + + where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/ +double CommunityTypeFinder::calcCHIndex(vector< vector< double> > dists){ + try { + double CH = 0.0; + + if (numPartitions < 2) { return CH; } + + map clusterMap; //map sample to partition + for (int j = 0; j < numSamples; j++) { + double maxValue = -1e6; + for (int i = 0; i < numPartitions; i++) { + if (m->control_pressed) { return 0.0; } + if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. 
partition with highest value for that sample is the partition where the sample should be + clusterMap[j] = i; + maxValue = zMatrix[i][j]; + } + } + } + + //make countMatrix a relabund + vector > relativeAbundance(numSamples); //[numSamples][numOTUs] + //get relative abundance + for(int i=0;icontrol_pressed) { return 0; } + int groupTotal = 0; + + relativeAbundance[i].assign(numOTUs, 0.0); + + for(int j=0;j > centers = calcCenters(dists, clusterMap, relativeAbundance); + + if (m->control_pressed) { return 0.0; } + + double allMeanDist = rMedoid(relativeAbundance, dists); + + if (m->debug) { m->mothurOut("[DEBUG]: allMeandDist = " + toString(allMeanDist) + "\n"); } + + for (int i = 0; i < relativeAbundance.size(); i++) {//numSamples + for (int j = 0; j < relativeAbundance[i].size(); j++) { //numOtus + if (m->control_pressed) { return 0; } + //x <- (x - centers[cl, ])^2 + relativeAbundance[i][j] = ((relativeAbundance[i][j] - centers[clusterMap[i]][j])*(relativeAbundance[i][j] - centers[clusterMap[i]][j])); + } + } + + double wgss = 0.0; + for (int j = 0; j < numOTUs; j++) { + for(int i=0;icontrol_pressed) { return 0.0; } + wgss += relativeAbundance[i][j]; + } + } + + double bgss = allMeanDist - wgss; + + CH = (bgss / (double)(numPartitions - 1)) / (wgss / (double) (numSamples - numPartitions)); + + return CH; + } + catch(exception& e){ + m->errorOut(e, "CommunityTypeFinder", "calcCHIndex"); + exit(1); + } +} + + +/**************************************************************************************************/ +vector > CommunityTypeFinder::calcCenters(vector >& dists, map clusterMap, vector >& relativeAbundance) { //[numsamples][numsamples] + try { + //for each partition + //choose sample with smallest sum of squared dists + // cout << "Here" << clusterMap.size() << endl; + // for(map::iterator it = clusterMap.begin(); it != clusterMap.end(); it++) { cout << it->first << '\t' << it->second < > centers; centers.resize(numPartitions); + vector sums; sums.resize(numSamples, 0.0); + map > partition2Samples; //maps partitions to samples in the partition + map >::iterator it; + + for (int i = 0; i < numSamples; i++) { + int partitionI = clusterMap[i]; + + //add this sample to list of samples in this partition for access later + it = partition2Samples.find(partitionI); + if (it == partition2Samples.end()) { + vector temp; temp.push_back(i); + partition2Samples[partitionI] = temp; + }else { partition2Samples[partitionI].push_back(i); } + + for (int j = 0; j < numSamples; j++) { + + int partitionJ = clusterMap[j]; + + if (partitionI == partitionJ) { //if you are a distance between samples in the same cluster + sums[i] += dists[i][j]; + sums[j] += dists[i][j]; + }else{}//we dont' care about distance between clusters + } + } + + vector medoidsVector; medoidsVector.resize(numPartitions, -1); + for (it = partition2Samples.begin(); it != partition2Samples.end(); it++) { //for each partition look for sample with smallest squared + + //sum dist to all other samples in cluster + vector members = it->second; + double minSumDist = 1e6; + for (int i = 0; i < members.size(); i++) { + if (m->control_pressed) { return centers; } + if (sums[members[i]] < minSumDist) { + minSumDist = sums[members[i]]; + medoidsVector[it->first] = members[i]; + } + } + + } + + set medoids; + for (int i = 0; i < medoidsVector.size(); i++) { + medoids.insert(medoidsVector[i]); + } + + int countPartitions = 0; + for (set::iterator it = medoids.begin(); it != medoids.end(); it++) { + for (int j = 0; j < numOTUs; j++) { + 
centers[countPartitions].push_back(relativeAbundance[*it][j]); //save the relative abundance of the medoid for this partition for this OTU + } + countPartitions++; + } + + return centers; + } + catch(exception& e){ + m->errorOut(e, "CommunityTypeFinder", "calcCenters"); + exit(1); + } +} + +/**************************************************************************************************/ +//The silhouette width S(i)of individual data points i is calculated using the following formula: +/* + s(i) = b(i) - a(i) + ----------- + max(b(i),a(i)) + where a(i) is the average dissimilarity (or distance) of sample i to all other samples in the same cluster, while b(i) is the average dissimilarity (or distance) to all objects in the closest other cluster. + + The formula implies -1 =< S(i) =< 1 . A sample which is much closer to its own cluster than to any other cluster has a high S(i) value, while S(i) close to 0 implies that the given sample lies somewhere between two clusters. Large negative S(i) values indicate that the sample was assigned to the wrong cluster. + */ +//based on silouette.r which calls sildist.c written by Francois Romain +vector CommunityTypeFinder::calcSilhouettes(vector > dists) { + try { + vector silhouettes; silhouettes.resize(numSamples, 0.0); + if (numPartitions < 2) { return silhouettes; } + + + map clusterMap; //map sample to partition + for (int j = 0; j < numSamples; j++) { + double maxValue = 0.0; + for (int i = 0; i < numPartitions; i++) { + if (m->control_pressed) { return silhouettes; } + if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be + clusterMap[j] = i; + maxValue = zMatrix[i][j]; + } + } + } + + //count number of samples in each partition + vector counts; counts.resize(numPartitions, 0); + vector DiC; DiC.resize((numPartitions*numSamples), 0.0); + bool computeSi = true; + + for (int i = 0; i < numSamples; i++) { + int partitionI = clusterMap[i]; + counts[partitionI]++; + + for (int j = i+1; j < numSamples; j++) { + if (m->control_pressed) { return silhouettes; } + int partitionJ = clusterMap[j]; + + DiC[numPartitions*i+partitionJ] += dists[i][j]; + DiC[numPartitions*j+partitionI] += dists[i][j]; + } + } + + vector neighbor; neighbor.resize(numSamples, -1); + for (int i = 0; i < numSamples; i++) { + if (m->control_pressed) { return silhouettes; } + int ki = numPartitions*i; + int partitionI = clusterMap[i]; + computeSi = true; + + for (int j = 0; j < numPartitions; j++) { + if (j == partitionI) { + if (counts[j] == 1) { //only one sample in cluster + computeSi = false; + }else { DiC[ki+j] /= (counts[j]-1); } + }else{ + DiC[ki+j] /= counts[j]; + } + } + + double ai = DiC[ki+partitionI]; + + double bi = 0.0; + if (partitionI == 0) { bi = DiC[ki+1]; neighbor[i] = 2; } + else { bi = DiC[ki]; neighbor[i] = 1; } + + for (int j = 1; j < numPartitions; j++) { + if (j != partitionI) { + if (bi > DiC[ki+j]) { + bi = DiC[ki + j]; + neighbor[i] = j+1; + } + } + } + + silhouettes[i] = 0.0; + if (computeSi && bi != ai) { + silhouettes[i] = (bi-ai) / (max(ai, bi)); + } + } + + return silhouettes; + } + catch(exception& e) { + m->errorOut(e, "CommunityTypeFinder", "calcSilhouettes"); + exit(1); + } +} /**************************************************************************************************/ diff --git a/communitytype.h b/communitytype.h index 59291f5..6067689 100644 --- a/communitytype.h +++ b/communitytype.h @@ -35,8 +35,8 
@@ public: virtual double getLogDet() { return logDeterminant; } virtual double getLaplace() { return laplace; } - virtual double calcCHIndex(vector< vector< double> >) {return 0;} //Calinski-Harabasz - virtual vector calcSilhouettes(vector< vector< double> >) { vector s; return s; } //if none provided by child class + virtual double calcCHIndex(vector< vector< double> >); //Calinski-Harabasz + virtual vector calcSilhouettes(vector< vector< double> >); protected: @@ -46,6 +46,8 @@ protected: double psi1(double); double psi(double); double cheb_eval(const double[], int, double); + double rMedoid(vector< vector > x, vector< vector > d); + vector > calcCenters(vector >&, map, vector >&); MothurOut* m; diff --git a/getmetacommunitycommand.cpp b/getmetacommunitycommand.cpp index de59edf..2b94415 100644 --- a/getmetacommunitycommand.cpp +++ b/getmetacommunitycommand.cpp @@ -611,6 +611,12 @@ int GetMetaCommunityCommand::processDriver(vector& thislook finder = new qFinderDMM(sharedMatrix, numPartitions); } + string relabund = relabunds[i]; + string matrixName = matrix[i]; + outputNames.push_back(matrixName); outputTypes["matrix"].push_back(matrixName); + + finder->printZMatrix(matrixName, thisGroups); + double chi; vector silhouettes; if (method == "dmm") { double laplace = finder->getLaplace(); @@ -627,11 +633,6 @@ int GetMetaCommunityCommand::processDriver(vector& thislook minSilhouettes = silhouettes; } } - string relabund = relabunds[i]; - string matrixName = matrix[i]; - outputNames.push_back(matrixName); outputTypes["matrix"].push_back(matrixName); - - finder->printZMatrix(matrixName, thisGroups); if (method == "dmm") { finder->printFitData(cout, minLaplace); diff --git a/kmeans.cpp b/kmeans.cpp index dab5c7d..ed4561c 100644 --- a/kmeans.cpp +++ b/kmeans.cpp @@ -25,212 +25,6 @@ KMeans::KMeans(vector > cm, int p) : CommunityTypeFinder() { } } /**************************************************************************************************/ -//The silhouette width S(i)of individual data points i is calculated using the following formula: -/* - s(i) = b(i) - a(i) - ----------- - max(b(i),a(i)) - where a(i) is the average dissimilarity (or distance) of sample i to all other samples in the same cluster, while b(i) is the average dissimilarity (or distance) to all objects in the closest other cluster. - - The formula implies -1 =< S(i) =< 1 . A sample which is much closer to its own cluster than to any other cluster has a high S(i) value, while S(i) close to 0 implies that the given sample lies somewhere between two clusters. Large negative S(i) values indicate that the sample was assigned to the wrong cluster. - */ - -vector KMeans::calcSilhouettes(vector > dists) { - try { - vector silhouettes; silhouettes.resize(numSamples, 0.0); - if (numPartitions < 2) { return silhouettes; } - - map clusterMap; //map sample to partition - for (int j = 0; j < numSamples; j++) { - double maxValue = 0.0; - for (int i = 0; i < numPartitions; i++) { - if (m->control_pressed) { return silhouettes; } - if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. 
partition with highest value for that sample is the partition where the sample should be - clusterMap[j] = i; - maxValue = zMatrix[i][j]; - } - } - } - - vector nextClosestPartition; - findSecondClosest(nextClosestPartition, dists, clusterMap); - - if (m->control_pressed) { return silhouettes; } - - vector a, b; a.resize(numSamples, 0.0); b.resize(numSamples, 0.0); - - //calc a - all a[i] are the same in the same partition - for (int k = 0; k < numPartitions; k++) { - if (m->control_pressed) { break; } - - int count = 0; - double totalA = 0.0; - for (int i = 0; i < numSamples; i++) { - for (int j = 0; j < numSamples; j++) { - if (m->control_pressed) { break; } - if ((clusterMap[i] == k) && (clusterMap[j] == k)){ //are both samples in the partition, if so add there distance - totalA += dists[j][i]; //distance from this sample to the other samples in the partition - count++; - } - } - } - totalA /= (double) count; - - //set a[i] to average for cluster - for (int i = 0; i < numSamples; i++) { - if (clusterMap[i] == k) { a[i] = totalA; } - } - } - - //calc b - for (int i = 0; i < numSamples; i++) { - if (m->control_pressed) { break; } - - int thisPartition = nextClosestPartition[i]; - int count = 0; - double totalB = 0.0; - for (int j = 0; j < numSamples; j++) { - if (clusterMap[j] == thisPartition) { //this sample is in this partition - totalB += dists[i][j]; - count++; - } - } - b[i] = totalB / (double) count; - } - - //calc silhouettes - for (int i = 0; i < numSamples; i++) { - if (m->control_pressed) { break; } - - double denom = a[i]; - if (b[i] > denom) { denom = b[i]; } //max(a[i],b[i]) - - silhouettes[i] = (b[i] - a[i]) / denom; - - //cout << "silhouettes " << i << '\t' << silhouettes[i] << endl; - } - - return silhouettes; - } - catch(exception& e) { - m->errorOut(e, "KMeans", "calcSilhouettes"); - exit(1); - } -} -/**************************************************************************************************/ -int KMeans::findSecondClosest(vector& nextClosestPartition, vector >& dists, map clusterMap) { - try { - vector minScores; minScores.resize(numSamples, 1e6); - nextClosestPartition.resize(numSamples, 0); - - - for (int i = 0; i < numSamples; i++) { - for (int j = 0; j < numPartitions; j++) { - if (m->control_pressed) { break; } - - //is this the one we are assigned to - ie the "best" cluster. We want second best. - //if numPartitions = 2, then special case?? - if (clusterMap[i] != j) { - double score = 1e6; - if (numPartitions == 2) { - score = 0.0; //choose other option, there are only 2. 
- }else{ score = calcScore(i, j, dists, clusterMap); } - - if (m->debug) { m->mothurOut("[DEBUG]: sample = " + toString(i) + " partition = " + toString(j) + " score = " + toString(score) + "\n"); } - - //is this better than our last find - if (score < minScores[i]) { - minScores[i] = score; - nextClosestPartition[i] = j; - } - }else {} //best case, ignore - } - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "KMeans", "findSecondClosest"); - exit(1); - } -} -/**************************************************************************************************/ -double KMeans::calcScore(int sample, int partition, vector >& dists, map clusterMap) { - try { - //square the distances and then for each pair of clusters, calculate the sum of the squraed distances between the clusters - //then with the sum of hte squared dsitances take the square root and divide by the number of distances in the sum - - double sum = 0.0; int count = 0; - for (int i = 0; i < numSamples; i++) { - if (m->control_pressed) { break; } - if (clusterMap[i] == partition) { //samples in this cluster - sum += (dists[sample][i] * dists[sample][i]); - count++; - } - } - - sum = sqrt(sum); - sum /= (double) count; - - return sum; - } - catch(exception& e) { - m->errorOut(e, "KMeans", "calcScore"); - exit(1); - } -} -/**************************************************************************************************/ -/*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as: - - CHk=Bk/(k−1)/Wk/(n−k) - - where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/ -double KMeans::calcCHIndex(vector< vector< double> > dists){ - try { - double CH = 0.0; - - if (numPartitions < 2) { return CH; } - - map clusterMap; //map sample to partition - for (int j = 0; j < numSamples; j++) { - double maxValue = 0.0; - for (int i = 0; i < numPartitions; i++) { - if (m->control_pressed) { return 0.0; } - if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. 
partition with highest value for that sample is the partition where the sample should be - clusterMap[j] = i; - maxValue = zMatrix[i][j]; - } - } - } - - double sumBetweenCluster = 0.0; - double sumWithinClusters = 0.0; - - for (int i = 0; i < numSamples; i++) { //lt - for (int j = 0; j < i; j++) { - if (m->control_pressed) { return 0.0; } - int partitionI = clusterMap[i]; - int partitionJ = clusterMap[j]; - - if (partitionI == partitionJ) { //they are from the same cluster so this distance is added to Wk - sumWithinClusters += (dists[i][j] * dists[i][j]); - }else { //they are NOT from the same cluster so this distance is added to Bk - sumBetweenCluster += (dists[i][j] * dists[i][j]); - } - } - } - //cout << numPartitions << '\t' << sumWithinClusters << '\t' << sumBetweenCluster << '\t' << (numSamples - numPartitions) << endl; - - CH = (sumBetweenCluster / (double)(numPartitions - 1)) / (sumWithinClusters / (double) (numSamples - numPartitions)); - - return CH; - } - catch(exception& e){ - m->errorOut(e, "KMeans", "calcCHIndex"); - exit(1); - } -} -/**************************************************************************************************/ diff --git a/kmeans.h b/kmeans.h index 97dfb68..eba7269 100644 --- a/kmeans.h +++ b/kmeans.h @@ -17,13 +17,9 @@ class KMeans : public CommunityTypeFinder { public: KMeans(vector >, int); - vector calcSilhouettes(vector< vector< double> >); - double calcCHIndex(vector< vector< double> >); private: - int findSecondClosest(vector&, vector >&, map); - double calcScore(int sample, int partition, vector >&, map); }; diff --git a/pam.cpp b/pam.cpp index bcc51fc..b09c2a0 100644 --- a/pam.cpp +++ b/pam.cpp @@ -229,102 +229,14 @@ int Pam::updateDp() { exit(1); } } -/**************************************************************************************************/ -//The silhouette width S(i)of individual data points i is calculated using the following formula: -/* - s(i) = b(i) - a(i) - ----------- - max(b(i),a(i)) - where a(i) is the average dissimilarity (or distance) of sample i to all other samples in the same cluster, while b(i) is the average dissimilarity (or distance) to all objects in the closest other cluster. - - The formula implies -1 =< S(i) =< 1 . A sample which is much closer to its own cluster than to any other cluster has a high S(i) value, while S(i) close to 0 implies that the given sample lies somewhere between two clusters. Large negative S(i) values indicate that the sample was assigned to the wrong cluster. 
- */ -vector Pam::calcSilhouettes(vector > dists) { - try { - vector silhouettes; silhouettes.resize(numSamples, 0.0); - if (numPartitions < 2) { return silhouettes; } - - vector whoDp; - for (int i = 0; i < numSamples; i++) { // assumes at least 2 partitions - vector tempMeds; - for (set::iterator it = medoids.begin(); it != medoids.end(); it++) { - if (m->control_pressed) { break; } - seqDist temp(*it, *it, dists[i][*it]); //medoid, medoid, distance from medoid to sample - tempMeds.push_back(temp); - } - sort(tempMeds.begin(), tempMeds.end(), compareSequenceDistance); //sort by distance - whoDp.push_back(tempMeds[1].seq1); //second closest medoid - } - - - vector a, b; a.resize(numSamples, 0.0); b.resize(numSamples, 0.0); - - //calc a - all a[i] are the same in the same partition - for (int k = 0; k < numPartitions; k++) { - if (m->control_pressed) { break; } - - int count = 0; - double totalA = 0.0; - for (int i = 0; i < numSamples; i++) { - for (int j = 0; j < numSamples; j++) { - if (m->control_pressed) { break; } - if ((zMatrix[k][i] != 0) && (zMatrix[k][j] != 0)){ //are both samples in the partition, if so add there distance - totalA += dists[j][i]; //distance from this sample to the other samples in the partition - count++; - } - } - } - totalA /= (double) count; - - //set a[i] to average for cluster - for (int i = 0; i < numSamples; i++) { - if (zMatrix[k][i] != 0) { a[i] = totalA; } - } - } - - //calc b - for (int i = 0; i < numSamples; i++) { - if (m->control_pressed) { break; } - - int nextClosestMedoid = whoDp[i]; - int thisPartition = medoid2Partition[nextClosestMedoid]; - int count = 0; - double totalB = 0.0; - for (int j = 0; j < numSamples; j++) { - if (zMatrix[thisPartition][j] != 0) { //this sample is in this partition - totalB += dists[i][j]; - count++; - } - } - b[i] = totalB / (double) count; - } - - //calc silhouettes - for (int i = 0; i < numSamples; i++) { - if (m->control_pressed) { break; } - - double denom = a[i]; - if (b[i] > denom) { denom = b[i]; } //max(a[i],b[i]) - - silhouettes[i] = (b[i] - a[i]) / denom; - - //cout << "silhouettes " << i << '\t' << silhouettes[i] << endl; - } - - return silhouettes; - } - catch(exception& e) { - m->errorOut(e, "Pam", "calcSilhouettes"); - exit(1); - } -} /**************************************************************************************************/ /*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as: CHk=Bk/(k−1)/Wk/(n−k) where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. 
Consequently, we chose the number of clusters k such that CHk was maximal.*/ +//based on R index.G1.r function double Pam::calcCHIndex(vector< vector > dists){ //countMatrix = [numSamples][numOtus] try { double CH = 0.0; @@ -340,7 +252,7 @@ double Pam::calcCHIndex(vector< vector > dists){ //countMatrix = [numSam } //make countMatrix a relabund - vector > relativeAbundance(numSamples); + vector > relativeAbundance(numSamples); //[numSamples][numOTUs] //get relative abundance for(int i=0;icontrol_pressed) { return 0; } @@ -366,25 +278,32 @@ double Pam::calcCHIndex(vector< vector > dists){ //countMatrix = [numSam countPartitions++; } - double sumBetweenCluster = 0.0; - double sumWithinClusters = 0.0; + //centers.clear(); + //centers = calcCenters(dists, clusterMap, relativeAbundance); + + double allMeanDist = rMedoid(relativeAbundance, dists); + + if (m->debug) { m->mothurOut("[DEBUG]: allMeandDist = " + toString(allMeanDist) + "\n"); } + + for (int i = 0; i < relativeAbundance.size(); i++) {//numSamples + for (int j = 0; j < relativeAbundance[i].size(); j++) { //numOtus + if (m->control_pressed) { return 0; } + //x <- (x - centers[cl, ])^2 + relativeAbundance[i][j] = ((relativeAbundance[i][j] - centers[clusterMap[i]][j])*(relativeAbundance[i][j] - centers[clusterMap[i]][j])); + } + } - for (int i = 0; i < numSamples; i++) { //lt - for (int j = 0; j < i; j++) { + double wgss = 0.0; + for (int j = 0; j < numOTUs; j++) { + for(int i=0;icontrol_pressed) { return 0.0; } - int partitionI = clusterMap[i]; - int partitionJ = clusterMap[j]; - - if (partitionI == partitionJ) { //they are from the same cluster so this distance is added to Wk - sumWithinClusters += (dists[i][j] * dists[i][j]); - }else { //they are NOT from the same cluster so this distance is added to Bk - sumBetweenCluster += (dists[i][j] * dists[i][j]); - } + wgss += relativeAbundance[i][j]; } } - //cout << numPartitions << '\t' << sumWithinClusters << '\t' << sumBetweenCluster << '\t' << (numSamples - numPartitions) << endl; - CH = (sumBetweenCluster / (double)(numPartitions - 1)) / (sumWithinClusters / (double) (numSamples - numPartitions)); + double bgss = allMeanDist - wgss; + + CH = (bgss / (double)(numPartitions - 1)) / (wgss / (double) (numSamples - numPartitions)); return CH; } @@ -397,4 +316,3 @@ double Pam::calcCHIndex(vector< vector > dists){ //countMatrix = [numSam /**************************************************************************************************/ - diff --git a/pam.h b/pam.h index 9930176..ab5c68f 100644 --- a/pam.h +++ b/pam.h @@ -17,7 +17,6 @@ class Pam : public CommunityTypeFinder { public: Pam(vector >, vector >, int); - vector calcSilhouettes(vector< vector< double> >); double calcCHIndex(vector< vector< double> >); private:
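Note on the two quality metrics used above: once the sums of squares and per-sample mean distances have been accumulated, the CH index and silhouette formulas quoted in the comments reduce to short scalar computations. The sketch below is illustrative only and is not part of the commit; the helper names chIndex and silhouette are hypothetical, and the inputs are assumed to have been computed as in CommunityTypeFinder::calcCHIndex and CommunityTypeFinder::calcSilhouettes (in this commit, bgss is obtained as allMeanDist - wgss, where allMeanDist comes from rMedoid).

#include <algorithm>  // std::max

// Calinski-Harabasz index: CH_k = (B_k / (k - 1)) / (W_k / (n - k)),
// where B_k (bgss) is the between-cluster sum of squares, W_k (wgss) the
// within-cluster sum of squares, k the number of partitions, and n the
// number of samples.
double chIndex(double bgss, double wgss, int numPartitions, int numSamples) {
    // guards are part of this sketch, not the commit
    if (numPartitions < 2 || numSamples <= numPartitions || wgss == 0.0) { return 0.0; }
    return (bgss / (double)(numPartitions - 1)) / (wgss / (double)(numSamples - numPartitions));
}

// Silhouette width of one sample: s(i) = (b(i) - a(i)) / max(a(i), b(i)),
// where a(i) is the mean distance to the other samples in its own cluster and
// b(i) is the mean distance to the samples in the closest other cluster.
// s(i) is near 1 for well-placed samples, near 0 on a cluster boundary,
// and negative when the sample was likely assigned to the wrong cluster.
double silhouette(double ai, double bi) {
    if (ai == bi) { return 0.0; }            // corresponds to the bi != ai guard in calcSilhouettes
    return (bi - ai) / std::max(ai, bi);
}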