]> git.donarmstrong.com Git - mothur.git/blob - communitytype.cpp
fixes while testing 1.33.0
[mothur.git] / communitytype.cpp
1 //
2 //  communitytype.cpp
3 //  Mothur
4 //
5 //  Created by SarahsWork on 12/3/13.
6 //  Copyright (c) 2013 Schloss Lab. All rights reserved.
7 //
8
9 #include "communitytype.h"
10
11 /**************************************************************************************************/
12
13 //can we get these psi/psi1 calculations into their own math class?
14 //psi calcualtions swiped from gsl library...
15
16 static const double psi_cs[23] = {
17     -.038057080835217922,
18     .491415393029387130,
19     -.056815747821244730,
20     .008357821225914313,
21     -.001333232857994342,
22     .000220313287069308,
23     -.000037040238178456,
24     .000006283793654854,
25     -.000001071263908506,
26     .000000183128394654,
27     -.000000031353509361,
28     .000000005372808776,
29     -.000000000921168141,
30     .000000000157981265,
31     -.000000000027098646,
32     .000000000004648722,
33     -.000000000000797527,
34     .000000000000136827,
35     -.000000000000023475,
36     .000000000000004027,
37     -.000000000000000691,
38     .000000000000000118,
39     -.000000000000000020
40 };
41
42 static double apsi_cs[16] = {
43     -.0204749044678185,
44     -.0101801271534859,
45     .0000559718725387,
46     -.0000012917176570,
47     .0000000572858606,
48     -.0000000038213539,
49     .0000000003397434,
50     -.0000000000374838,
51     .0000000000048990,
52     -.0000000000007344,
53     .0000000000001233,
54     -.0000000000000228,
55     .0000000000000045,
56     -.0000000000000009,
57     .0000000000000002,
58     -.0000000000000000
59 };
60
61 /**************************************************************************************************/
62 /* coefficients for Maclaurin summation in hzeta()
63  * B_{2j}/(2j)!
64  */
65 static double hzeta_c[15] = {
66     1.00000000000000000000000000000,
67     0.083333333333333333333333333333,
68     -0.00138888888888888888888888888889,
69     0.000033068783068783068783068783069,
70     -8.2671957671957671957671957672e-07,
71     2.0876756987868098979210090321e-08,
72     -5.2841901386874931848476822022e-10,
73     1.3382536530684678832826980975e-11,
74     -3.3896802963225828668301953912e-13,
75     8.5860620562778445641359054504e-15,
76     -2.1748686985580618730415164239e-16,
77     5.5090028283602295152026526089e-18,
78     -1.3954464685812523340707686264e-19,
79     3.5347070396294674716932299778e-21,
80     -8.9535174270375468504026113181e-23
81 };
82
83 /**************************************************************************************************/
84 void CommunityTypeFinder::printSilData(ofstream& out, double chi, vector<double> sils){
85     try {
86         out << setprecision (6) << numPartitions << '\t'  << chi << '\t';
87         for (int i = 0; i < sils.size(); i++) {
88             out << sils[i] << '\t';
89         }
90         out << endl;
91         
92         return;
93     }
94     catch(exception& e){
95         m->errorOut(e, "CommunityTypeFinder", "printSilData");
96         exit(1);
97     }
98 }
99 /**************************************************************************************************/
100 void CommunityTypeFinder::printSilData(ostream& out, double chi, vector<double> sils){
101     try {
102         out << setprecision (6) << numPartitions << '\t'  << chi << '\t';
103         m->mothurOutJustToLog(toString(numPartitions) + '\t' + toString(chi) + '\t');
104         for (int i = 0; i < sils.size(); i++) {
105             out << sils[i] << '\t';
106             m->mothurOutJustToLog(toString(sils[i]) + '\t');
107         }
108         out << endl;
109         m->mothurOutJustToLog("\n");
110         
111         return;
112     }
113     catch(exception& e){
114         m->errorOut(e, "CommunityTypeFinder", "printSilData");
115         exit(1);
116     }
117 }
118 /**************************************************************************************************/
119
120 void CommunityTypeFinder::printZMatrix(string fileName, vector<string> sampleName){
121     try {
122         ofstream printMatrix;
123         m->openOutputFile(fileName, printMatrix); //(fileName.c_str());
124         printMatrix.setf(ios::fixed, ios::floatfield);
125         printMatrix.setf(ios::showpoint);
126         
127         for(int i=0;i<numPartitions;i++){   printMatrix << "\tPartition_" << i+1;   }   printMatrix << endl;
128         
129         for(int i=0;i<numSamples;i++){
130             printMatrix << sampleName[i];
131             for(int j=0;j<numPartitions;j++){
132                 printMatrix << setprecision(4) << '\t' << zMatrix[j][i];
133             }
134             printMatrix << endl;
135         }
136         printMatrix.close();
137     }
138         catch(exception& e) {
139                 m->errorOut(e, "CommunityTypeFinder", "printZMatrix");
140                 exit(1);
141         }
142 }
143
144 /**************************************************************************************************/
145
146 void CommunityTypeFinder::printRelAbund(string fileName, vector<string> otuNames){
147     try {
148         ofstream printRA;
149         m->openOutputFile(fileName, printRA); //(fileName.c_str());
150         printRA.setf(ios::fixed, ios::floatfield);
151         printRA.setf(ios::showpoint);
152         
153         vector<double> totals(numPartitions, 0.0000);
154         for(int i=0;i<numPartitions;i++){
155             for(int j=0;j<numOTUs;j++){
156                 totals[i] += exp(lambdaMatrix[i][j]);
157             }
158         }
159         
160         printRA << "Taxon";
161         for(int i=0;i<numPartitions;i++){
162             printRA << "\tPartition_" << i+1 << '_' << setprecision(4) << totals[i];
163             printRA << "\tPartition_" << i+1 <<"_LCI" << "\tPartition_" << i+1 << "_UCI";
164         }
165         printRA << endl;
166         
167         for(int i=0;i<numOTUs;i++){
168             
169             if (m->control_pressed) { break; }
170             
171             printRA << otuNames[i];
172             for(int j=0;j<numPartitions;j++){
173                 
174                 if(error[j][i] >= 0.0000){
175                     double std = sqrt(error[j][i]);
176                     printRA << '\t' << 100 * exp(lambdaMatrix[j][i]) / totals[j];
177                     printRA << '\t' << 100 * exp(lambdaMatrix[j][i] - 2.0 * std) / totals[j];
178                     printRA << '\t' << 100 * exp(lambdaMatrix[j][i] + 2.0 * std) / totals[j];
179                 }
180                 else{
181                     printRA << '\t' << 100 * exp(lambdaMatrix[j][i]) / totals[j];
182                     printRA << '\t' << "NA";
183                     printRA << '\t' << "NA";
184                 }
185             }
186             printRA << endl;
187         }
188         
189         printRA.close();
190     }
191         catch(exception& e) {
192                 m->errorOut(e, "CommunityTypeFinder", "printRelAbund");
193                 exit(1);
194         }
195 }
196
197
198
199 /**************************************************************************************************/
200
201 vector<vector<double> > CommunityTypeFinder::getHessian(){
202     try {
203         vector<double> alpha(numOTUs, 0.0000);
204         double alphaSum = 0.0000;
205         
206         vector<double> pi = zMatrix[currentPartition];
207         vector<double> psi_ajk(numOTUs, 0.0000);
208         vector<double> psi_cjk(numOTUs, 0.0000);
209         vector<double> psi1_ajk(numOTUs, 0.0000);
210         vector<double> psi1_cjk(numOTUs, 0.0000);
211         
212         for(int j=0;j<numOTUs;j++){
213             
214             if (m->control_pressed) {  break; }
215             
216             alpha[j] = exp(lambdaMatrix[currentPartition][j]);
217             alphaSum += alpha[j];
218             
219             for(int i=0;i<numSamples;i++){
220                 double X = (double) countMatrix[i][j];
221                 
222                 psi_ajk[j] += pi[i] * psi(alpha[j]);
223                 psi1_ajk[j] += pi[i] * psi1(alpha[j]);
224                 
225                 psi_cjk[j] += pi[i] * psi(alpha[j] + X);
226                 psi1_cjk[j] += pi[i] * psi1(alpha[j] + X);
227             }
228         }
229         
230         
231         double psi_Ck = 0.0000;
232         double psi1_Ck = 0.0000;
233         
234         double weight = 0.0000;
235         
236         for(int i=0;i<numSamples;i++){
237             if (m->control_pressed) {  break; }
238             weight += pi[i];
239             double sum = 0.0000;
240             for(int j=0;j<numOTUs;j++){     sum += alpha[j] + countMatrix[i][j];    }
241             
242             psi_Ck += pi[i] * psi(sum);
243             psi1_Ck += pi[i] * psi1(sum);
244         }
245         
246         double psi_Ak = weight * psi(alphaSum);
247         double psi1_Ak = weight * psi1(alphaSum);
248         
249         vector<vector<double> > hessian(numOTUs);
250         for(int i=0;i<numOTUs;i++){ hessian[i].assign(numOTUs, 0.0000); }
251         
252         for(int i=0;i<numOTUs;i++){
253             if (m->control_pressed) {  break; }
254             double term1 = -alpha[i] * (- psi_ajk[i] + psi_Ak + psi_cjk[i] - psi_Ck);
255             double term2 = -alpha[i] * alpha[i] * (-psi1_ajk[i] + psi1_Ak + psi1_cjk[i] - psi1_Ck);
256             double term3 = 0.1 * alpha[i];
257             
258             hessian[i][i] = term1 + term2 + term3;
259             
260             for(int j=0;j<i;j++){
261                 hessian[i][j] = - alpha[i] * alpha[j] * (psi1_Ak - psi1_Ck);
262                 hessian[j][i] = hessian[i][j];
263             }
264         }
265         
266         return hessian;
267     }
268     catch(exception& e){
269         m->errorOut(e, "CommunityTypeFinder", "getHessian");
270         exit(1);
271     }
272 }
273 /**************************************************************************************************/
274
275 double CommunityTypeFinder::psi1(double xx){
276     try {
277         
278         /* Euler-Maclaurin summation formula
279          * [Moshier, p. 400, with several typo corrections]
280          */
281         
282         double s = 2.0000;
283         const int jmax = 12;
284         const int kmax = 10;
285         int j, k;
286         const double pmax  = pow(kmax + xx, -s);
287         double scp = s;
288         double pcp = pmax / (kmax + xx);
289         double value = pmax*((kmax+xx)/(s-1.0) + 0.5);
290         
291         for(k=0; k<kmax; k++) {
292             if (m->control_pressed) {  return 0; }
293             value += pow(k + xx, -s);
294         }
295         
296         for(j=0; j<=jmax; j++) {
297             if (m->control_pressed) {  return 0; }
298             double delta = hzeta_c[j+1] * scp * pcp;
299             value += delta;
300             
301             if(fabs(delta/value) < 0.5*EPSILON) break;
302             
303             scp *= (s+2*j+1)*(s+2*j+2);
304             pcp /= (kmax + xx)*(kmax + xx);
305         }
306         
307         return value;
308     }
309     catch(exception& e){
310         m->errorOut(e, "CommunityTypeFinder", "psi1");
311         exit(1);
312     }
313 }
314
315 /**************************************************************************************************/
316
317 double CommunityTypeFinder::psi(double xx){
318     try {
319         double psiX = 0.0000;
320         
321         if(xx < 1.0000){
322             
323             double t1 = 1.0 / xx;
324             psiX = cheb_eval(psi_cs, 22, 2.0*xx-1.0);
325             psiX = -t1 + psiX;
326             
327         }
328         else if(xx < 2.0000){
329             
330             const double v = xx - 1.0;
331             psiX = cheb_eval(psi_cs, 22, 2.0*v-1.0);
332             
333         }
334         else{
335             const double t = 8.0/(xx*xx)-1.0;
336             psiX = cheb_eval(apsi_cs, 15, t);
337             psiX += log(xx) - 0.5/xx;
338         }
339         
340         return psiX;
341     }
342     catch(exception& e){
343         m->errorOut(e, "CommunityTypeFinder", "psi");
344         exit(1);
345     }
346 }
347 /**************************************************************************************************/
348
349 double CommunityTypeFinder::cheb_eval(const double seriesData[], int order, double xx){
350     try {
351         double d = 0.0000;
352         double dd = 0.0000;
353         
354         double x2 = xx * 2.0000;
355         
356         for(int j=order;j>=1;j--){
357             if (m->control_pressed) {  return 0; }
358             double temp = d;
359             d = x2 * d - dd + seriesData[j];
360             dd = temp;
361         }
362         
363         d = xx * d - dd + 0.5 * seriesData[0];
364         return d;
365     }
366     catch(exception& e){
367         m->errorOut(e, "CommunityTypeFinder", "cheb_eval");
368         exit(1);
369     }
370 }
371 /**************************************************************************************************/
372
373 int CommunityTypeFinder::findkMeans(){
374     try {
375         error.resize(numPartitions); for (int i = 0; i < numPartitions; i++) { error[i].resize(numOTUs, 0.0); }
376         vector<vector<double> > relativeAbundance(numSamples);
377         vector<vector<double> > alphaMatrix;
378         
379         alphaMatrix.resize(numPartitions);
380         lambdaMatrix.resize(numPartitions);
381         for(int i=0;i<numPartitions;i++){
382             alphaMatrix[i].assign(numOTUs, 0);
383             lambdaMatrix[i].assign(numOTUs, 0);
384         }
385         
386         //get relative abundance
387         for(int i=0;i<numSamples;i++){
388             if (m->control_pressed) {  return 0; }
389             int groupTotal = 0;
390             
391             relativeAbundance[i].assign(numOTUs, 0.0);
392             
393             for(int j=0;j<numOTUs;j++){
394                 groupTotal += countMatrix[i][j];
395             }
396             for(int j=0;j<numOTUs;j++){
397                 relativeAbundance[i][j] = countMatrix[i][j] / (double)groupTotal;
398             }
399         }
400         
401         //randomly assign samples into partitions
402         zMatrix.resize(numPartitions);
403         for(int i=0;i<numPartitions;i++){
404             zMatrix[i].assign(numSamples, 0);
405         }
406         
407         //randomize samples
408         vector<int> temp;
409         for (int i = 0; i < numSamples; i++) { temp.push_back(i); }
410         random_shuffle(temp.begin(), temp.end());
411         
412         //assign each partition at least one random sample
413         int numAssignedSamples = 0;
414         for (int i = 0; i < numPartitions; i++) {
415             zMatrix[i][temp[numAssignedSamples]] = 1;
416             numAssignedSamples++;
417         }
418         
419         //assign rest of samples to partitions
420         int count = 0;
421         for(int i=numAssignedSamples;i<numSamples;i++){
422             zMatrix[count%numPartitions][temp[i]] = 1;
423             count++;
424         }
425         
426         double maxChange = 1;
427         int maxIters = 1000;
428         int iteration = 0;
429         
430         weights.assign(numPartitions, 0);
431         
432         while(maxChange > 1e-6 && iteration < maxIters){
433             
434             if (m->control_pressed) {  return 0; }
435             //calcualte average relative abundance
436             maxChange = 0.0000;
437             for(int i=0;i<numPartitions;i++){
438                 
439                 double normChange = 0.0;
440                 
441                 weights[i] = 0;
442                 
443                 for(int j=0;j<numSamples;j++){
444                     weights[i] += (double)zMatrix[i][j];
445                 }
446                 
447                 vector<double> averageRelativeAbundance(numOTUs, 0);
448                 for(int j=0;j<numOTUs;j++){
449                     for(int k=0;k<numSamples;k++){
450                         averageRelativeAbundance[j] += zMatrix[i][k] * relativeAbundance[k][j];
451                     }
452                 }
453                 
454                 for(int j=0;j<numOTUs;j++){
455                     averageRelativeAbundance[j] /= weights[i];
456                     
457                     double difference = averageRelativeAbundance[j] - alphaMatrix[i][j];
458                     normChange += difference * difference;
459                     alphaMatrix[i][j] = averageRelativeAbundance[j];
460                 }
461                 
462                 normChange = sqrt(normChange);
463                 
464                 if(normChange > maxChange){ maxChange = normChange; }
465             }
466             
467             
468             //calcualte distance between each sample in partition and the average relative abundance
469             for(int i=0;i<numSamples;i++){
470                 if (m->control_pressed) {  return 0; }
471                 
472                 double normalizationFactor = 0;
473                 vector<double> totalDistToPartition(numPartitions, 0);
474                 
475                 for(int j=0;j<numPartitions;j++){
476                     for(int k=0;k<numOTUs;k++){
477                         double difference = alphaMatrix[j][k] - relativeAbundance[i][k];
478                         totalDistToPartition[j] += difference * difference;
479                     }
480                     totalDistToPartition[j] = sqrt(totalDistToPartition[j]);
481                     normalizationFactor += exp(-50.0 * totalDistToPartition[j]);
482                 }
483                 
484                 
485                 for(int j=0;j<numPartitions;j++){
486                     zMatrix[j][i] = exp(-50.0 * totalDistToPartition[j]) / normalizationFactor;
487                 }
488                 
489             }
490             
491             iteration++;
492             //        cout << "K means: " << iteration << '\t' << maxChange << endl;
493             
494         }
495         
496         //    cout << "Iter:-1";
497         for(int i=0;i<numPartitions;i++){
498             weights[i] = 0.0000;
499             
500             for(int j=0;j<numSamples;j++){
501                 weights[i] += zMatrix[i][j];
502             }
503             //        printf("\tw_%d=%.3f", i, weights[i]);
504         }
505         //    cout << endl;
506         
507         
508         for(int i=0;i<numOTUs;i++){
509             if (m->control_pressed) {  return 0; }
510             for(int j=0;j<numPartitions;j++){
511                 if(alphaMatrix[j][i] > 0){
512                     lambdaMatrix[j][i] = log(alphaMatrix[j][i]);
513                 }
514                 else{
515                     lambdaMatrix[j][i] = -10.0;
516                 }
517             }
518         }
519         return 0;
520     }
521     catch(exception& e){
522         m->errorOut(e, "CommunityTypeFinder", "kMeans");
523         exit(1);
524     }
525 }
526
527 /**************************************************************************************************/
528 //based on r function .medoid
529 //results is length numOTUs and holds the distances from x of the sample in d with the min sum of distances to all other samples.
530 //Basically the "best" medoid.
531 //returns the sum of the distances squared
532 double CommunityTypeFinder::rMedoid(vector< vector<double> > x, vector< vector<double> > d){
533     try {
534         vector<double> results; results.resize(numOTUs, 0.0);
535         double minSumDist = 1e6;
536         int minGroup = -1;
537         
538         for (int i = 0; i < d.size(); i++) {
539             if (m->control_pressed) { break; }
540             
541             double thisSum = 0.0;
542             for (int j = 0; j < d[i].size(); j++) { thisSum += d[i][j];  }
543             if (thisSum < minSumDist) {
544                 minSumDist = thisSum;
545                 minGroup = i;
546             }
547         }
548         
549         if (minGroup != -1) {
550             for (int i = 0; i < numOTUs; i++) {  results[i] = x[minGroup][i];  } //save minGroups relativeAbundance for each OTU
551         }else { m->mothurOut("[ERROR]: unable to find rMedoid group.\n"); m->control_pressed = true; }
552         
553         
554         double allMeanDist = 0.0;
555         for (int i = 0; i < x.size(); i++) { //numSamples
556             for (int j = 0; j < x[i].size(); j++) { //numOTus
557                 if (m->control_pressed) { break; }
558                 allMeanDist += ((x[i][j]-results[j])*(x[i][j]-results[j])); //(otuX sampleY - otuX bestMedoid)^2
559                 
560             }
561         }
562         return allMeanDist;
563     }
564     catch(exception& e){
565         m->errorOut(e, "CommunityTypeFinder", "rMedoid");
566         exit(1);
567     }
568 }
569 /**************************************************************************************************/
570
571 /*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as:
572  
573  CHk=Bk/(k−1)/Wk/(n−k)
574  
575  where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/
576 double CommunityTypeFinder::calcCHIndex(vector< vector< double> > dists){
577     try {
578         double CH = 0.0;
579         
580         if (numPartitions < 2) { return CH; }
581         
582         map<int, int> clusterMap; //map sample to partition
583         for (int j = 0; j < numSamples; j++) {
584             double maxValue = -1e6;
585             for (int i = 0; i < numPartitions; i++) {
586                 if (m->control_pressed) { return 0.0; }
587                 if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be
588                     clusterMap[j] = i;
589                     maxValue = zMatrix[i][j];
590                 }
591             }
592         }
593         
594         //make countMatrix a relabund
595         vector<vector<double> > relativeAbundance(numSamples); //[numSamples][numOTUs]
596         //get relative abundance
597         for(int i=0;i<numSamples;i++){
598             if (m->control_pressed) {  return 0; }
599             int groupTotal = 0;
600             
601             relativeAbundance[i].assign(numOTUs, 0.0);
602             
603             for(int j=0;j<numOTUs;j++){
604                 groupTotal += countMatrix[i][j];
605             }
606             for(int j=0;j<numOTUs;j++){
607                 relativeAbundance[i][j] = countMatrix[i][j] / (double)groupTotal;
608             }
609         }
610         
611         //find centers
612         vector<vector<double> > centers = calcCenters(dists, clusterMap, relativeAbundance);
613         
614         if (m->control_pressed) { return 0.0; }
615         
616         double allMeanDist = rMedoid(relativeAbundance, dists);
617         
618         if (m->debug) { m->mothurOut("[DEBUG]: allMeandDist = " + toString(allMeanDist) + "\n"); }
619         
620         for (int i = 0; i < relativeAbundance.size(); i++) {//numSamples
621             for (int j = 0; j < relativeAbundance[i].size(); j++) { //numOtus
622                 if (m->control_pressed) {  return 0; }
623                 //x <- (x - centers[cl, ])^2
624                 relativeAbundance[i][j] = ((relativeAbundance[i][j] - centers[clusterMap[i]][j])*(relativeAbundance[i][j] - centers[clusterMap[i]][j]));
625             }
626         }
627         
628         double wgss = 0.0;
629         for (int j = 0; j < numOTUs; j++) {
630             for(int i=0;i<numSamples;i++){
631                 if (m->control_pressed) { return 0.0; }
632                 wgss += relativeAbundance[i][j];
633             }
634         }
635         
636         double bgss = allMeanDist - wgss;
637         
638         CH = (bgss / (double)(numPartitions - 1)) / (wgss / (double) (numSamples - numPartitions));
639         
640         return CH;
641     }
642     catch(exception& e){
643         m->errorOut(e, "CommunityTypeFinder", "calcCHIndex");
644         exit(1);
645     }
646 }
647
648
649 /**************************************************************************************************/
650 vector<vector<double> > CommunityTypeFinder::calcCenters(vector<vector<double> >& dists, map<int, int> clusterMap, vector<vector<double> >& relativeAbundance) { //[numsamples][numsamples]
651     try {
652         //for each partition
653         //choose sample with smallest sum of squared dists
654         //       cout << "Here" << clusterMap.size() << endl;
655         //       for(map<int, int>::iterator it = clusterMap.begin(); it != clusterMap.end(); it++) { cout << it->first << '\t' << it->second <<endl; }
656         vector<vector<double> > centers; centers.resize(numPartitions);
657         vector<double>  sums;  sums.resize(numSamples, 0.0);
658         map<int, vector<int> > partition2Samples; //maps partitions to samples in the partition
659         map<int, vector<int> >::iterator it;
660         
661         for (int i = 0; i < numSamples; i++) {
662             int partitionI = clusterMap[i];
663             
664             //add this sample to list of samples in this partition for access later
665             it = partition2Samples.find(partitionI);
666             if (it == partition2Samples.end()) {
667                 vector<int> temp; temp.push_back(i);
668                 partition2Samples[partitionI] = temp;
669             }else {  partition2Samples[partitionI].push_back(i); }
670             
671             for (int j = 0; j < numSamples; j++) {
672                 
673                 int partitionJ = clusterMap[j];
674                 
675                 if (partitionI == partitionJ) { //if you are a distance between samples in the same cluster
676                     sums[i] += dists[i][j];
677                     sums[j] += dists[i][j];
678                 }else{}//we dont' care about distance between clusters
679             }
680         }
681         
682         vector<int> medoidsVector; medoidsVector.resize(numPartitions, -1);
683         for (it = partition2Samples.begin(); it != partition2Samples.end(); it++) { //for each partition look for sample with smallest squared
684             
685             //sum dist to all other samples in cluster
686             vector<int> members = it->second;
687             double minSumDist = 1e6;
688             for (int i = 0; i < members.size(); i++) {
689                 if (m->control_pressed) { return centers; }
690                 if (sums[members[i]] < minSumDist) {
691                     minSumDist = sums[members[i]];
692                     medoidsVector[it->first] = members[i];
693                 }
694             }
695             
696         }
697         
698         set<int> medoids;
699         for (int i = 0; i < medoidsVector.size(); i++) {
700             medoids.insert(medoidsVector[i]);
701         }
702         
703         int countPartitions = 0;
704         for (set<int>::iterator it = medoids.begin(); it != medoids.end(); it++) {
705             for (int j = 0; j < numOTUs; j++) {
706                 centers[countPartitions].push_back(relativeAbundance[*it][j]); //save the relative abundance of the medoid for this partition for this OTU
707             }
708             countPartitions++;
709         }
710         
711         return centers;
712     }
713     catch(exception& e){
714         m->errorOut(e, "CommunityTypeFinder", "calcCenters");
715         exit(1);
716     }
717 }
718
719 /**************************************************************************************************/
720 //The silhouette width S(i)of individual data points i is calculated using the following formula:
721 /*
722  s(i) = b(i) - a(i)
723  -----------
724  max(b(i),a(i))
725  where a(i) is the average dissimilarity (or distance) of sample i to all other samples in the same cluster, while b(i) is the average dissimilarity (or distance) to all objects in the closest other cluster.
726  
727  The formula implies -1 =< S(i) =< 1 . A sample which is much closer to its own cluster than to any other cluster has a high S(i) value, while S(i) close to 0 implies that the given sample lies somewhere between two clusters. Large negative S(i) values indicate that the sample was assigned to the wrong cluster.
728  */
729 //based on silouette.r which calls sildist.c written by Francois Romain
730 vector<double> CommunityTypeFinder::calcSilhouettes(vector<vector<double> > dists) {
731     try {
732         vector<double> silhouettes; silhouettes.resize(numSamples, 0.0);
733         if (numPartitions < 2) { return silhouettes; }
734         
735         
736         map<int, int> clusterMap; //map sample to partition
737         for (int j = 0; j < numSamples; j++) {
738             double maxValue = 0.0;
739             for (int i = 0; i < numPartitions; i++) {
740                 if (m->control_pressed) { return silhouettes; }
741                 if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be
742                     clusterMap[j] = i;
743                     maxValue = zMatrix[i][j];
744                 }
745             }
746         }
747         
748         //count number of samples in each partition
749         vector<int> counts; counts.resize(numPartitions, 0);
750         vector<double> DiC; DiC.resize((numPartitions*numSamples), 0.0);
751         bool computeSi = true;
752         
753         for (int i = 0; i < numSamples; i++) {
754             int partitionI = clusterMap[i];
755             counts[partitionI]++;
756             
757             for (int j = i+1; j < numSamples; j++) {
758                 if (m->control_pressed) { return silhouettes; }
759                 int partitionJ = clusterMap[j];
760                 
761                 DiC[numPartitions*i+partitionJ] += dists[i][j];
762                 DiC[numPartitions*j+partitionI] += dists[i][j];
763             }
764         }
765         
766         vector<int> neighbor; neighbor.resize(numSamples, -1);
767         for (int i = 0; i < numSamples; i++) {
768             if (m->control_pressed) { return silhouettes; }
769             int ki = numPartitions*i;
770             int partitionI = clusterMap[i];
771             computeSi = true;
772             
773             for (int j = 0; j < numPartitions; j++) {
774                 if (j == partitionI) {
775                     if (counts[j] == 1) { //only one sample in cluster
776                         computeSi = false;
777                     }else { DiC[ki+j] /= (counts[j]-1); }
778                 }else{
779                     DiC[ki+j] /= counts[j];
780                 }
781             }
782             
783             double ai = DiC[ki+partitionI];
784             
785             double bi = 0.0;
786             if (partitionI == 0) {  bi = DiC[ki+1]; neighbor[i] = 2; }
787             else {  bi =  DiC[ki]; neighbor[i] = 1; }
788             
789             for (int j = 1; j < numPartitions; j++) {
790                 if (j != partitionI) {
791                     if (bi > DiC[ki+j]) {
792                         bi = DiC[ki + j];
793                         neighbor[i] = j+1;
794                     }
795                 }
796             }
797             
798             silhouettes[i] = 0.0;
799             if (computeSi && bi != ai) {
800                 silhouettes[i] = (bi-ai) / (max(ai, bi));
801             }
802         }
803         
804         return silhouettes;
805     }
806     catch(exception& e) {
807         m->errorOut(e, "CommunityTypeFinder", "calcSilhouettes");
808         exit(1);
809     }
810 }
811 /**************************************************************************************************/
812