5 // Created by SarahsWork on 12/3/13.
6 // Copyright (c) 2013 Schloss Lab. All rights reserved.
9 #include "communitytype.h"
11 /**************************************************************************************************/
13 //can we get these psi/psi1 calculations into their own math class?
14 //psi calcualtions swiped from gsl library...
16 static const double psi_cs[23] = {
42 static double apsi_cs[16] = {
61 /**************************************************************************************************/
62 /* coefficients for Maclaurin summation in hzeta()
65 static double hzeta_c[15] = {
66 1.00000000000000000000000000000,
67 0.083333333333333333333333333333,
68 -0.00138888888888888888888888888889,
69 0.000033068783068783068783068783069,
70 -8.2671957671957671957671957672e-07,
71 2.0876756987868098979210090321e-08,
72 -5.2841901386874931848476822022e-10,
73 1.3382536530684678832826980975e-11,
74 -3.3896802963225828668301953912e-13,
75 8.5860620562778445641359054504e-15,
76 -2.1748686985580618730415164239e-16,
77 5.5090028283602295152026526089e-18,
78 -1.3954464685812523340707686264e-19,
79 3.5347070396294674716932299778e-21,
80 -8.9535174270375468504026113181e-23
83 /**************************************************************************************************/
84 void CommunityTypeFinder::printSilData(ofstream& out, double chi, vector<double> sils){
86 out << setprecision (6) << numPartitions << '\t' << chi << '\t';
87 for (int i = 0; i < sils.size(); i++) {
88 out << sils[i] << '\t';
95 m->errorOut(e, "CommunityTypeFinder", "printSilData");
99 /**************************************************************************************************/
100 void CommunityTypeFinder::printSilData(ostream& out, double chi, vector<double> sils){
102 out << setprecision (6) << numPartitions << '\t' << chi << '\t';
103 m->mothurOutJustToLog(toString(numPartitions) + '\t' + toString(chi) + '\t');
104 for (int i = 0; i < sils.size(); i++) {
105 out << sils[i] << '\t';
106 m->mothurOutJustToLog(toString(sils[i]) + '\t');
109 m->mothurOutJustToLog("\n");
114 m->errorOut(e, "CommunityTypeFinder", "printSilData");
118 /**************************************************************************************************/
120 void CommunityTypeFinder::printZMatrix(string fileName, vector<string> sampleName){
122 ofstream printMatrix;
123 m->openOutputFile(fileName, printMatrix); //(fileName.c_str());
124 printMatrix.setf(ios::fixed, ios::floatfield);
125 printMatrix.setf(ios::showpoint);
127 for(int i=0;i<numPartitions;i++){ printMatrix << "\tPartition_" << i+1; } printMatrix << endl;
129 for(int i=0;i<numSamples;i++){
130 printMatrix << sampleName[i];
131 for(int j=0;j<numPartitions;j++){
132 printMatrix << setprecision(4) << '\t' << zMatrix[j][i];
138 catch(exception& e) {
139 m->errorOut(e, "CommunityTypeFinder", "printZMatrix");
144 /**************************************************************************************************/
146 void CommunityTypeFinder::printRelAbund(string fileName, vector<string> otuNames){
149 m->openOutputFile(fileName, printRA); //(fileName.c_str());
150 printRA.setf(ios::fixed, ios::floatfield);
151 printRA.setf(ios::showpoint);
153 vector<double> totals(numPartitions, 0.0000);
154 for(int i=0;i<numPartitions;i++){
155 for(int j=0;j<numOTUs;j++){
156 totals[i] += exp(lambdaMatrix[i][j]);
161 for(int i=0;i<numPartitions;i++){
162 printRA << "\tPartition_" << i+1 << '_' << setprecision(4) << totals[i];
163 printRA << "\tPartition_" << i+1 <<"_LCI" << "\tPartition_" << i+1 << "_UCI";
167 for(int i=0;i<numOTUs;i++){
169 if (m->control_pressed) { break; }
171 printRA << otuNames[i];
172 for(int j=0;j<numPartitions;j++){
174 if(error[j][i] >= 0.0000){
175 double std = sqrt(error[j][i]);
176 printRA << '\t' << 100 * exp(lambdaMatrix[j][i]) / totals[j];
177 printRA << '\t' << 100 * exp(lambdaMatrix[j][i] - 2.0 * std) / totals[j];
178 printRA << '\t' << 100 * exp(lambdaMatrix[j][i] + 2.0 * std) / totals[j];
181 printRA << '\t' << 100 * exp(lambdaMatrix[j][i]) / totals[j];
182 printRA << '\t' << "NA";
183 printRA << '\t' << "NA";
191 catch(exception& e) {
192 m->errorOut(e, "CommunityTypeFinder", "printRelAbund");
199 /**************************************************************************************************/
201 vector<vector<double> > CommunityTypeFinder::getHessian(){
203 vector<double> alpha(numOTUs, 0.0000);
204 double alphaSum = 0.0000;
206 vector<double> pi = zMatrix[currentPartition];
207 vector<double> psi_ajk(numOTUs, 0.0000);
208 vector<double> psi_cjk(numOTUs, 0.0000);
209 vector<double> psi1_ajk(numOTUs, 0.0000);
210 vector<double> psi1_cjk(numOTUs, 0.0000);
212 for(int j=0;j<numOTUs;j++){
214 if (m->control_pressed) { break; }
216 alpha[j] = exp(lambdaMatrix[currentPartition][j]);
217 alphaSum += alpha[j];
219 for(int i=0;i<numSamples;i++){
220 double X = (double) countMatrix[i][j];
222 psi_ajk[j] += pi[i] * psi(alpha[j]);
223 psi1_ajk[j] += pi[i] * psi1(alpha[j]);
225 psi_cjk[j] += pi[i] * psi(alpha[j] + X);
226 psi1_cjk[j] += pi[i] * psi1(alpha[j] + X);
231 double psi_Ck = 0.0000;
232 double psi1_Ck = 0.0000;
234 double weight = 0.0000;
236 for(int i=0;i<numSamples;i++){
237 if (m->control_pressed) { break; }
240 for(int j=0;j<numOTUs;j++){ sum += alpha[j] + countMatrix[i][j]; }
242 psi_Ck += pi[i] * psi(sum);
243 psi1_Ck += pi[i] * psi1(sum);
246 double psi_Ak = weight * psi(alphaSum);
247 double psi1_Ak = weight * psi1(alphaSum);
249 vector<vector<double> > hessian(numOTUs);
250 for(int i=0;i<numOTUs;i++){ hessian[i].assign(numOTUs, 0.0000); }
252 for(int i=0;i<numOTUs;i++){
253 if (m->control_pressed) { break; }
254 double term1 = -alpha[i] * (- psi_ajk[i] + psi_Ak + psi_cjk[i] - psi_Ck);
255 double term2 = -alpha[i] * alpha[i] * (-psi1_ajk[i] + psi1_Ak + psi1_cjk[i] - psi1_Ck);
256 double term3 = 0.1 * alpha[i];
258 hessian[i][i] = term1 + term2 + term3;
260 for(int j=0;j<i;j++){
261 hessian[i][j] = - alpha[i] * alpha[j] * (psi1_Ak - psi1_Ck);
262 hessian[j][i] = hessian[i][j];
269 m->errorOut(e, "CommunityTypeFinder", "getHessian");
273 /**************************************************************************************************/
275 double CommunityTypeFinder::psi1(double xx){
278 /* Euler-Maclaurin summation formula
279 * [Moshier, p. 400, with several typo corrections]
286 const double pmax = pow(kmax + xx, -s);
288 double pcp = pmax / (kmax + xx);
289 double value = pmax*((kmax+xx)/(s-1.0) + 0.5);
291 for(k=0; k<kmax; k++) {
292 if (m->control_pressed) { return 0; }
293 value += pow(k + xx, -s);
296 for(j=0; j<=jmax; j++) {
297 if (m->control_pressed) { return 0; }
298 double delta = hzeta_c[j+1] * scp * pcp;
301 if(fabs(delta/value) < 0.5*EPSILON) break;
303 scp *= (s+2*j+1)*(s+2*j+2);
304 pcp /= (kmax + xx)*(kmax + xx);
310 m->errorOut(e, "CommunityTypeFinder", "psi1");
315 /**************************************************************************************************/
317 double CommunityTypeFinder::psi(double xx){
319 double psiX = 0.0000;
323 double t1 = 1.0 / xx;
324 psiX = cheb_eval(psi_cs, 22, 2.0*xx-1.0);
328 else if(xx < 2.0000){
330 const double v = xx - 1.0;
331 psiX = cheb_eval(psi_cs, 22, 2.0*v-1.0);
335 const double t = 8.0/(xx*xx)-1.0;
336 psiX = cheb_eval(apsi_cs, 15, t);
337 psiX += log(xx) - 0.5/xx;
343 m->errorOut(e, "CommunityTypeFinder", "psi");
347 /**************************************************************************************************/
349 double CommunityTypeFinder::cheb_eval(const double seriesData[], int order, double xx){
354 double x2 = xx * 2.0000;
356 for(int j=order;j>=1;j--){
357 if (m->control_pressed) { return 0; }
359 d = x2 * d - dd + seriesData[j];
363 d = xx * d - dd + 0.5 * seriesData[0];
367 m->errorOut(e, "CommunityTypeFinder", "cheb_eval");
371 /**************************************************************************************************/
373 int CommunityTypeFinder::findkMeans(){
375 error.resize(numPartitions); for (int i = 0; i < numPartitions; i++) { error[i].resize(numOTUs, 0.0); }
376 vector<vector<double> > relativeAbundance(numSamples);
377 vector<vector<double> > alphaMatrix;
379 alphaMatrix.resize(numPartitions);
380 lambdaMatrix.resize(numPartitions);
381 for(int i=0;i<numPartitions;i++){
382 alphaMatrix[i].assign(numOTUs, 0);
383 lambdaMatrix[i].assign(numOTUs, 0);
386 //get relative abundance
387 for(int i=0;i<numSamples;i++){
388 if (m->control_pressed) { return 0; }
391 relativeAbundance[i].assign(numOTUs, 0.0);
393 for(int j=0;j<numOTUs;j++){
394 groupTotal += countMatrix[i][j];
396 for(int j=0;j<numOTUs;j++){
397 relativeAbundance[i][j] = countMatrix[i][j] / (double)groupTotal;
401 //randomly assign samples into partitions
402 zMatrix.resize(numPartitions);
403 for(int i=0;i<numPartitions;i++){
404 zMatrix[i].assign(numSamples, 0);
409 for (int i = 0; i < numSamples; i++) { temp.push_back(i); }
410 random_shuffle(temp.begin(), temp.end());
412 //assign each partition at least one random sample
413 int numAssignedSamples = 0;
414 for (int i = 0; i < numPartitions; i++) {
415 zMatrix[i][temp[numAssignedSamples]] = 1;
416 numAssignedSamples++;
419 //assign rest of samples to partitions
421 for(int i=numAssignedSamples;i<numSamples;i++){
422 zMatrix[count%numPartitions][temp[i]] = 1;
426 double maxChange = 1;
430 weights.assign(numPartitions, 0);
432 while(maxChange > 1e-6 && iteration < maxIters){
434 if (m->control_pressed) { return 0; }
435 //calcualte average relative abundance
437 for(int i=0;i<numPartitions;i++){
439 double normChange = 0.0;
443 for(int j=0;j<numSamples;j++){
444 weights[i] += (double)zMatrix[i][j];
447 vector<double> averageRelativeAbundance(numOTUs, 0);
448 for(int j=0;j<numOTUs;j++){
449 for(int k=0;k<numSamples;k++){
450 averageRelativeAbundance[j] += zMatrix[i][k] * relativeAbundance[k][j];
454 for(int j=0;j<numOTUs;j++){
455 averageRelativeAbundance[j] /= weights[i];
457 double difference = averageRelativeAbundance[j] - alphaMatrix[i][j];
458 normChange += difference * difference;
459 alphaMatrix[i][j] = averageRelativeAbundance[j];
462 normChange = sqrt(normChange);
464 if(normChange > maxChange){ maxChange = normChange; }
468 //calcualte distance between each sample in partition and the average relative abundance
469 for(int i=0;i<numSamples;i++){
470 if (m->control_pressed) { return 0; }
472 double normalizationFactor = 0;
473 vector<double> totalDistToPartition(numPartitions, 0);
475 for(int j=0;j<numPartitions;j++){
476 for(int k=0;k<numOTUs;k++){
477 double difference = alphaMatrix[j][k] - relativeAbundance[i][k];
478 totalDistToPartition[j] += difference * difference;
480 totalDistToPartition[j] = sqrt(totalDistToPartition[j]);
481 normalizationFactor += exp(-50.0 * totalDistToPartition[j]);
485 for(int j=0;j<numPartitions;j++){
486 zMatrix[j][i] = exp(-50.0 * totalDistToPartition[j]) / normalizationFactor;
492 // cout << "K means: " << iteration << '\t' << maxChange << endl;
496 // cout << "Iter:-1";
497 for(int i=0;i<numPartitions;i++){
500 for(int j=0;j<numSamples;j++){
501 weights[i] += zMatrix[i][j];
503 // printf("\tw_%d=%.3f", i, weights[i]);
508 for(int i=0;i<numOTUs;i++){
509 if (m->control_pressed) { return 0; }
510 for(int j=0;j<numPartitions;j++){
511 if(alphaMatrix[j][i] > 0){
512 lambdaMatrix[j][i] = log(alphaMatrix[j][i]);
515 lambdaMatrix[j][i] = -10.0;
522 m->errorOut(e, "CommunityTypeFinder", "kMeans");
527 /**************************************************************************************************/
528 //based on r function .medoid
529 //results is length numOTUs and holds the distances from x of the sample in d with the min sum of distances to all other samples.
530 //Basically the "best" medoid.
531 //returns the sum of the distances squared
532 double CommunityTypeFinder::rMedoid(vector< vector<double> > x, vector< vector<double> > d){
534 vector<double> results; results.resize(numOTUs, 0.0);
535 double minSumDist = 1e6;
538 for (int i = 0; i < d.size(); i++) {
539 if (m->control_pressed) { break; }
541 double thisSum = 0.0;
542 for (int j = 0; j < d[i].size(); j++) { thisSum += d[i][j]; }
543 if (thisSum < minSumDist) {
544 minSumDist = thisSum;
549 if (minGroup != -1) {
550 for (int i = 0; i < numOTUs; i++) { results[i] = x[minGroup][i]; } //save minGroups relativeAbundance for each OTU
551 }else { m->mothurOut("[ERROR]: unable to find rMedoid group.\n"); m->control_pressed = true; }
554 double allMeanDist = 0.0;
555 for (int i = 0; i < x.size(); i++) { //numSamples
556 for (int j = 0; j < x[i].size(); j++) { //numOTus
557 if (m->control_pressed) { break; }
558 allMeanDist += ((x[i][j]-results[j])*(x[i][j]-results[j])); //(otuX sampleY - otuX bestMedoid)^2
565 m->errorOut(e, "CommunityTypeFinder", "rMedoid");
569 /**************************************************************************************************/
571 /*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as:
573 CHk=Bk/(k−1)/Wk/(n−k)
575 where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/
576 double CommunityTypeFinder::calcCHIndex(vector< vector< double> > dists){
580 if (numPartitions < 2) { return CH; }
582 map<int, int> clusterMap; //map sample to partition
583 for (int j = 0; j < numSamples; j++) {
584 double maxValue = -1e6;
585 for (int i = 0; i < numPartitions; i++) {
586 if (m->control_pressed) { return 0.0; }
587 if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be
589 maxValue = zMatrix[i][j];
594 //make countMatrix a relabund
595 vector<vector<double> > relativeAbundance(numSamples); //[numSamples][numOTUs]
596 //get relative abundance
597 for(int i=0;i<numSamples;i++){
598 if (m->control_pressed) { return 0; }
601 relativeAbundance[i].assign(numOTUs, 0.0);
603 for(int j=0;j<numOTUs;j++){
604 groupTotal += countMatrix[i][j];
606 for(int j=0;j<numOTUs;j++){
607 relativeAbundance[i][j] = countMatrix[i][j] / (double)groupTotal;
612 vector<vector<double> > centers = calcCenters(dists, clusterMap, relativeAbundance);
614 if (m->control_pressed) { return 0.0; }
616 double allMeanDist = rMedoid(relativeAbundance, dists);
618 if (m->debug) { m->mothurOut("[DEBUG]: allMeandDist = " + toString(allMeanDist) + "\n"); }
620 for (int i = 0; i < relativeAbundance.size(); i++) {//numSamples
621 for (int j = 0; j < relativeAbundance[i].size(); j++) { //numOtus
622 if (m->control_pressed) { return 0; }
623 //x <- (x - centers[cl, ])^2
624 relativeAbundance[i][j] = ((relativeAbundance[i][j] - centers[clusterMap[i]][j])*(relativeAbundance[i][j] - centers[clusterMap[i]][j]));
629 for (int j = 0; j < numOTUs; j++) {
630 for(int i=0;i<numSamples;i++){
631 if (m->control_pressed) { return 0.0; }
632 wgss += relativeAbundance[i][j];
636 double bgss = allMeanDist - wgss;
638 CH = (bgss / (double)(numPartitions - 1)) / (wgss / (double) (numSamples - numPartitions));
643 m->errorOut(e, "CommunityTypeFinder", "calcCHIndex");
649 /**************************************************************************************************/
650 vector<vector<double> > CommunityTypeFinder::calcCenters(vector<vector<double> >& dists, map<int, int> clusterMap, vector<vector<double> >& relativeAbundance) { //[numsamples][numsamples]
653 //choose sample with smallest sum of squared dists
654 // cout << "Here" << clusterMap.size() << endl;
655 // for(map<int, int>::iterator it = clusterMap.begin(); it != clusterMap.end(); it++) { cout << it->first << '\t' << it->second <<endl; }
656 vector<vector<double> > centers; centers.resize(numPartitions);
657 vector<double> sums; sums.resize(numSamples, 0.0);
658 map<int, vector<int> > partition2Samples; //maps partitions to samples in the partition
659 map<int, vector<int> >::iterator it;
661 for (int i = 0; i < numSamples; i++) {
662 int partitionI = clusterMap[i];
664 //add this sample to list of samples in this partition for access later
665 it = partition2Samples.find(partitionI);
666 if (it == partition2Samples.end()) {
667 vector<int> temp; temp.push_back(i);
668 partition2Samples[partitionI] = temp;
669 }else { partition2Samples[partitionI].push_back(i); }
671 for (int j = 0; j < numSamples; j++) {
673 int partitionJ = clusterMap[j];
675 if (partitionI == partitionJ) { //if you are a distance between samples in the same cluster
676 sums[i] += dists[i][j];
677 sums[j] += dists[i][j];
678 }else{}//we dont' care about distance between clusters
682 vector<int> medoidsVector; medoidsVector.resize(numPartitions, -1);
683 for (it = partition2Samples.begin(); it != partition2Samples.end(); it++) { //for each partition look for sample with smallest squared
685 //sum dist to all other samples in cluster
686 vector<int> members = it->second;
687 double minSumDist = 1e6;
688 for (int i = 0; i < members.size(); i++) {
689 if (m->control_pressed) { return centers; }
690 if (sums[members[i]] < minSumDist) {
691 minSumDist = sums[members[i]];
692 medoidsVector[it->first] = members[i];
699 for (int i = 0; i < medoidsVector.size(); i++) {
700 medoids.insert(medoidsVector[i]);
703 int countPartitions = 0;
704 for (set<int>::iterator it = medoids.begin(); it != medoids.end(); it++) {
705 for (int j = 0; j < numOTUs; j++) {
706 centers[countPartitions].push_back(relativeAbundance[*it][j]); //save the relative abundance of the medoid for this partition for this OTU
714 m->errorOut(e, "CommunityTypeFinder", "calcCenters");
719 /**************************************************************************************************/
720 //The silhouette width S(i)of individual data points i is calculated using the following formula:
725 where a(i) is the average dissimilarity (or distance) of sample i to all other samples in the same cluster, while b(i) is the average dissimilarity (or distance) to all objects in the closest other cluster.
727 The formula implies -1 =< S(i) =< 1 . A sample which is much closer to its own cluster than to any other cluster has a high S(i) value, while S(i) close to 0 implies that the given sample lies somewhere between two clusters. Large negative S(i) values indicate that the sample was assigned to the wrong cluster.
729 //based on silouette.r which calls sildist.c written by Francois Romain
730 vector<double> CommunityTypeFinder::calcSilhouettes(vector<vector<double> > dists) {
732 vector<double> silhouettes; silhouettes.resize(numSamples, 0.0);
733 if (numPartitions < 2) { return silhouettes; }
736 map<int, int> clusterMap; //map sample to partition
737 for (int j = 0; j < numSamples; j++) {
738 double maxValue = 0.0;
739 for (int i = 0; i < numPartitions; i++) {
740 if (m->control_pressed) { return silhouettes; }
741 if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be
743 maxValue = zMatrix[i][j];
748 //count number of samples in each partition
749 vector<int> counts; counts.resize(numPartitions, 0);
750 vector<double> DiC; DiC.resize((numPartitions*numSamples), 0.0);
751 bool computeSi = true;
753 for (int i = 0; i < numSamples; i++) {
754 int partitionI = clusterMap[i];
755 counts[partitionI]++;
757 for (int j = i+1; j < numSamples; j++) {
758 if (m->control_pressed) { return silhouettes; }
759 int partitionJ = clusterMap[j];
761 DiC[numPartitions*i+partitionJ] += dists[i][j];
762 DiC[numPartitions*j+partitionI] += dists[i][j];
766 vector<int> neighbor; neighbor.resize(numSamples, -1);
767 for (int i = 0; i < numSamples; i++) {
768 if (m->control_pressed) { return silhouettes; }
769 int ki = numPartitions*i;
770 int partitionI = clusterMap[i];
773 for (int j = 0; j < numPartitions; j++) {
774 if (j == partitionI) {
775 if (counts[j] == 1) { //only one sample in cluster
777 }else { DiC[ki+j] /= (counts[j]-1); }
779 DiC[ki+j] /= counts[j];
783 double ai = DiC[ki+partitionI];
786 if (partitionI == 0) { bi = DiC[ki+1]; neighbor[i] = 2; }
787 else { bi = DiC[ki]; neighbor[i] = 1; }
789 for (int j = 1; j < numPartitions; j++) {
790 if (j != partitionI) {
791 if (bi > DiC[ki+j]) {
798 silhouettes[i] = 0.0;
799 if (computeSi && bi != ai) {
800 silhouettes[i] = (bi-ai) / (max(ai, bi));
806 catch(exception& e) {
807 m->errorOut(e, "CommunityTypeFinder", "calcSilhouettes");
811 /**************************************************************************************************/