5 * Created by westcott on 7/6/11.
6 * Copyright 2011 Schloss Lab. All rights reserved.
10 #include "mothurmetastats.h"
11 #include "mothurfisher.h"
14 /***********************************************************/
15 MothurMetastats::MothurMetastats(double t, int n) {
17 m = MothurOut::getInstance();
21 }catch(exception& e) {
22 m->errorOut(e, "MothurMetastats", "MothurMetastats");
26 /***********************************************************/
27 MothurMetastats::~MothurMetastats() {}
28 /***********************************************************/
29 //main metastats function
30 int MothurMetastats::runMetastats(string outputFileName, vector< vector<double> >& data, int secGroupingStart) {
32 row = data.size(); //numBins
33 column = data[0].size(); //numGroups in subset
34 secondGroupingStart = secGroupingStart; //g number of samples in group 1
36 vector< vector<double> > Pmatrix; Pmatrix.resize(row);
37 for (int i = 0; i < row; i++) { Pmatrix[i].resize(column, 0.0); } // the relative proportion matrix
38 vector< vector<double> > C1; C1.resize(row);
39 for (int i = 0; i < row; i++) { C1[i].resize(3, 0.0); } // statistic profiles for class1 and class 2
40 vector< vector<double> > C2; C2.resize(row); // mean[1], variance[2], standard error[3]
41 for (int i = 0; i < row; i++) { C2[i].resize(3, 0.0); }
42 vector<double> T_statistics; T_statistics.resize(row, 1); // a place to store the true t-statistics
43 vector<double> pvalues; pvalues.resize(row, 1); // place to store pvalues
44 vector<double> qvalues; qvalues.resize(row, 1); // stores qvalues
46 //*************************************
47 // convert to proportions
49 //*************************************
50 vector<double> totals; totals.resize(column, 0); // sum of columns
51 //total[i] = total abundance for group[i]
52 for (int i = 0; i < column; i++) {
53 for (int j = 0; j < row; j++) {
54 totals[i] += data[j][i];
58 for (int i = 0; i < column; i++) {
59 for (int j = 0; j < row; j++) {
60 Pmatrix[j][i] = data[j][i]/totals[i];
64 //#********************************************************************************
65 //# ************************** STATISTICAL TESTING ********************************
66 //#********************************************************************************
68 if (column == 2){ //# then we have a two sample comparison
69 //#************************************************************
70 //# generate p values fisher's exact test
71 //#************************************************************
72 double total1, total2; total1 = 0; total2 = 0;
73 //total for first grouping
74 for (int i = 0; i < secondGroupingStart; i++) { total1 += totals[i]; }
76 //total for second grouping
77 for (int i = secondGroupingStart; i < column; i++) { total2 += totals[i]; }
79 vector<double> fish; fish.resize(row, 0.0);
80 vector<double> fish2; fish2.resize(row, 0.0);
82 for(int i = 0; i < row; i++){
84 for(int j = 0; j < secondGroupingStart; j++) { fish[i] += data[i][j]; }
85 for(int j = secondGroupingStart; j < column; j++) { fish2[i] += data[i][j]; }
87 double f11, f12, f21, f22;
90 f21 = total1 - fish[i];
91 f22 = total2 - fish2[i];
94 double pre = fisher.fexact(f11, f12, f21, f22);
95 if (pre > 0.999999999) { pre = 1.0; }
97 if (m->control_pressed) { return 1; }
102 //#*************************************
103 //# calculate q values from p values
104 //#*************************************
105 qvalues = calc_qvalues(pvalues);
107 }else { //we have multiple subjects per population
109 //#*************************************
110 //# generate statistics mean, var, stderr
111 //#*************************************
112 for(int i = 0; i < row; i++){ // for each taxa
113 //# find the mean of each group
114 double g1Total = 0.0; double g2Total = 0.0;
115 for (int j = 0; j < secondGroupingStart; j++) { g1Total += Pmatrix[i][j]; }
116 C1[i][0] = g1Total/(double)(secondGroupingStart);
117 for (int j = secondGroupingStart; j < column; j++) { g2Total += Pmatrix[i][j]; }
118 C2[i][0] = g2Total/(double)(column-secondGroupingStart);
120 //# find the variance of each group
121 double g1Var = 0.0; double g2Var = 0.0;
122 for (int j = 0; j < secondGroupingStart; j++) { g1Var += pow((Pmatrix[i][j]-C1[i][0]), 2); }
123 C1[i][1] = g1Var/(double)(secondGroupingStart-1);
124 for (int j = secondGroupingStart; j < column; j++) { g2Var += pow((Pmatrix[i][j]-C2[i][0]), 2); }
125 C2[i][1] = g2Var/(double)(column-secondGroupingStart-1);
127 //# find the std error of each group -std err^2 (will change to std err at end)
128 C1[i][2] = C1[i][1]/(double)(secondGroupingStart);
129 C2[i][2] = C2[i][1]/(double)(column-secondGroupingStart);
132 //#*************************************
133 //# two sample t-statistics
134 //#*************************************
135 for(int i = 0; i < row; i++){ // # for each taxa
136 double xbar_diff = C1[i][0] - C2[i][0];
137 double denom = sqrt(C1[i][2] + C2[i][2]);
138 T_statistics[i] = xbar_diff/denom; // calculate two sample t-statistic
141 /*for (int i = 0; i < row; i++) {
142 for (int j = 0; j < 3; j++) {
143 cout << "C1[" << i+1 << "," << j+1 << "]=" << C1[i][j] << ";" << endl;
144 cout << "C2[" << i+1 << "," << j+1 << "]=" << C2[i][j] << ";" << endl;
146 cout << "T_statistics[" << i+1 << "]=" << T_statistics[i] << ";" << endl;
149 for (int i = 0; i < row; i++) {
150 for (int j = 0; j < column; j++) {
151 cout << "Fmatrix[" << i+1 << "," << j+1 << "]=" << data[i][j] << ";" << endl;
155 //#*************************************
156 //# generate initial permuted p-values
157 //#*************************************
158 pvalues = permuted_pvalues(Pmatrix, T_statistics, data);
160 //#*************************************
161 //# generate p values for sparse data
162 //# using fisher's exact test
163 //#*************************************
164 double total1, total2; total1 = 0; total2 = 0;
165 //total for first grouping
166 for (int i = 0; i < secondGroupingStart; i++) { total1 += totals[i]; }
168 //total for second grouping
169 for (int i = secondGroupingStart; i < column; i++) { total2 += totals[i]; }
171 vector<double> fish; fish.resize(row, 0.0);
172 vector<double> fish2; fish2.resize(row, 0.0);
174 for(int i = 0; i < row; i++){
176 for(int j = 0; j < secondGroupingStart; j++) { fish[i] += data[i][j]; }
177 for(int j = secondGroupingStart; j < column; j++) { fish2[i] += data[i][j]; }
179 if ((fish[i] < secondGroupingStart) && (fish2[i] < (column-secondGroupingStart))) {
181 double f11, f12, f21, f22;
184 f21 = total1 - fish[i];
185 f22 = total2 - fish2[i];
188 double pre = fisher.fexact(f11, f12, f21, f22);
189 if (pre > 0.999999999) { pre = 1.0; }
191 if (m->control_pressed) { return 1; }
197 //#*************************************
198 //# calculate q values from p values
199 //#*************************************
200 qvalues = calc_qvalues(pvalues);
202 //#*************************************
203 //# convert stderr^2 to std error
204 //#*************************************
205 for(int i = 0; i < row; i++){
206 C1[i][2] = sqrt(C1[i][2]);
207 C2[i][2] = sqrt(C2[i][2]);
211 // And now we write the files to a text file.
213 time_t t; t = time(NULL);
214 local = localtime(&t);
217 m->openOutputFile(outputFileName, out);
218 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
220 out << "Local time and date of test: " << asctime(local) << endl;
221 out << "# rows = " << row << ", # col = " << column << ", g = " << secondGroupingStart << endl << endl;
222 out << numPermutations << " permutations" << endl << endl;
224 //output column headings - not really sure... documentation labels 9 columns, there are 10 in the output file
225 //storage 0 = meanGroup1 - line 529, 1 = varGroup1 - line 532, 2 = err rate1 - line 534, 3 = mean of counts group1?? - line 291, 4 = meanGroup2 - line 536, 5 = varGroup2 - line 539, 6 = err rate2 - line 541, 7 = mean of counts group2?? - line 292, 8 = pvalues - line 293
226 out << "OTU\tmean(group1)\tvariance(group1)\tstderr(group1)\tmean(group2)\tvariance(group2)\tstderr(group2)\tp-value\tq-value\n";
228 for(int i = 0; i < row; i++){
229 if (m->control_pressed) { out.close(); return 0; }
231 //if there are binlabels use them otherwise count.
232 if (i < m->currentSharedBinLabels.size()) { out << m->currentSharedBinLabels[i] << '\t'; }
233 else { out << (i+1) << '\t'; }
235 out << C1[i][0] << '\t' << C1[i][1] << '\t' << C1[i][2] << '\t' << C2[i][0] << '\t' << C2[i][1] << '\t' << C2[i][2] << '\t' << pvalues[i] << '\t' << qvalues[i] << endl;
245 }catch(exception& e) {
246 m->errorOut(e, "MothurMetastats", "runMetastats");
250 /***********************************************************/
251 vector<double> MothurMetastats::permuted_pvalues(vector< vector<double> >& Imatrix, vector<double>& tstats, vector< vector<double> >& Fmatrix) {
253 //# matrix stores tstats for each taxa(row) for each permuted trial(column)
254 vector<double> ps; ps.resize(row, 0.0); //# to store the pvalues
255 vector< vector<double> > permuted_ttests; permuted_ttests.resize(numPermutations);
256 for (int i = 0; i < numPermutations; i++) { permuted_ttests[i].resize(row, 0.0); }
258 //# calculate null version of tstats using B permutations.
259 for (int i = 0; i < numPermutations; i++) {
260 permuted_ttests[i] = permute_and_calc_ts(Imatrix);
263 //# calculate each pvalue using the null ts
264 if ((secondGroupingStart) < 8 || (column-secondGroupingStart) < 8){
265 vector< vector<double> > cleanedpermuted_ttests; cleanedpermuted_ttests.resize(numPermutations); //# the array pooling just the frequently observed ts
266 //# then pool the t's together!
267 //# count how many high freq taxa there are
269 for (int i = 0; i < row; i++) { // # for each taxa
270 double group1Total = 0.0; double group2Total = 0.0;
271 for(int j = 0; j < secondGroupingStart; j++) { group1Total += Fmatrix[i][j]; }
272 for(int j = secondGroupingStart; j < column; j++) { group2Total += Fmatrix[i][j]; }
274 if (group1Total >= secondGroupingStart || group2Total >= (column-secondGroupingStart)){
276 for (int j = 0; j < numPermutations; j++) { cleanedpermuted_ttests[j].push_back(permuted_ttests[j][i]); }
281 for (int i = 0; i < row; i++) {
282 //number of cleanedpermuted_ttests greater than tstat[i]
284 for (int j = 0; j < numPermutations; j++) {
285 for (int k = 0; k < hfc; k++) {
286 if (cleanedpermuted_ttests[j][k] > abs(tstats[i])) { numGreater++; }
290 ps[i] = (1/(double)(numPermutations*hfc))*numGreater;
293 for (int i = 0; i < row; i++) {
294 //number of permuted_ttests[i] greater than tstat[i] //(sum(permuted_ttests[i,] > abs(tstats[i]))+1)
296 for (int j = 0; j < numPermutations; j++) { if (permuted_ttests[j][i] > abs(tstats[i])) { numGreater++; } }
297 ps[i] = (1/(double)(numPermutations+1))*numGreater;
303 }catch(exception& e) {
304 m->errorOut(e, "MothurMetastats", "permuted_pvalues");
308 /***********************************************************/
309 vector<double> MothurMetastats::permute_and_calc_ts(vector< vector<double> >& Imatrix) {
311 vector< vector<double> > permutedMatrix = Imatrix;
313 //randomize columns, ie group abundances.
314 map<int, int> randomMap;
316 for (int i = 0; i < column; i++) { randoms.push_back(i); }
317 random_shuffle(randoms.begin(), randoms.end());
318 for (int i = 0; i < randoms.size(); i++) { randomMap[i] = randoms[i]; }
321 vector< vector<double> > C1; C1.resize(row);
322 for (int i = 0; i < row; i++) { C1[i].resize(3, 0.0); } // statistic profiles for class1 and class 2
323 vector< vector<double> > C2; C2.resize(row); // mean[1], variance[2], standard error[3]
324 for (int i = 0; i < row; i++) { C2[i].resize(3, 0.0); }
325 vector<double> Ts; Ts.resize(row, 0.0); // a place to store the true t-statistics
327 //#*************************************
328 //# generate statistics mean, var, stderr
329 //#*************************************
330 for(int i = 0; i < row; i++){ // for each taxa
331 //# find the mean of each group
332 double g1Total = 0.0; double g2Total = 0.0;
333 for (int j = 0; j < secondGroupingStart; j++) { g1Total += permutedMatrix[i][randomMap[j]]; }
334 C1[i][0] = g1Total/(double)(secondGroupingStart);
335 for (int j = secondGroupingStart; j < column; j++) { g2Total += permutedMatrix[i][randomMap[j]]; }
336 C2[i][0] = g2Total/(double)(column-secondGroupingStart);
338 //# find the variance of each group
339 double g1Var = 0.0; double g2Var = 0.0;
340 for (int j = 0; j < secondGroupingStart; j++) { g1Var += pow((permutedMatrix[i][randomMap[j]]-C1[i][0]), 2); }
341 C1[i][1] = g1Var/(double)(secondGroupingStart-1);
342 for (int j = secondGroupingStart; j < column; j++) { g2Var += pow((permutedMatrix[i][randomMap[j]]-C2[i][0]), 2); }
343 C2[i][1] = g2Var/(double)(column-secondGroupingStart-1);
345 //# find the std error of each group -std err^2 (will change to std err at end)
346 C1[i][2] = C1[i][1]/(double)(secondGroupingStart);
347 C2[i][2] = C2[i][1]/(double)(column-secondGroupingStart);
352 //#*************************************
353 //# two sample t-statistics
354 //#*************************************
355 for(int i = 0; i < row; i++){ // # for each taxa
356 double xbar_diff = C1[i][0] - C2[i][0];
357 double denom = sqrt(C1[i][2] + C2[i][2]);
358 Ts[i] = abs(xbar_diff/denom); // calculate two sample t-statistic
364 }catch(exception& e) {
365 m->errorOut(e, "MothurMetastats", "permuted_ttests");
369 /***********************************************************/
370 vector<double> MothurMetastats::calc_qvalues(vector<double>& pValues) {
372 int numRows = pValues.size();
373 vector<double> qvalues(numRows, 0.0);
375 //fill lambdas with 0.00, 0.01, 0.02... 0.95
376 vector<double> lambdas(96, 0);
377 for (int i = 1; i < lambdas.size(); i++) { lambdas[i] = lambdas[i-1] + 0.01; }
379 vector<double> pi0_hat(lambdas.size(), 0);
382 for (int l = 0; l < lambdas.size(); l++){ // for each lambda value
384 for (int i = 0; i < numRows; i++){ // for each p-value in order
385 if (pValues[i] > lambdas[l]){ count++; }
387 pi0_hat[l] = count/(double)(numRows*(1.0-lambdas[l]));
388 //cout << lambdas[l] << '\t' << count << '\t' << numRows*(1.0-lambdas[l]) << '\t' << pi0_hat[l] << endl;
391 double pi0 = smoothSpline(lambdas, pi0_hat, 3);
394 vector<double> ordered_qs = qvalues;
395 vector<int> ordered_ps(pValues.size(), 0);
396 for (int i = 1; i < ordered_ps.size(); i++) { ordered_ps[i] = ordered_ps[i-1] + 1; }
397 vector<double> tempPvalues = pValues;
398 OrderPValues(0, numRows-1, tempPvalues, ordered_ps);
400 ordered_qs[numRows-1] = min((pValues[ordered_ps[numRows-1]]*pi0), 1.0);
401 for (int i = (numRows-2); i >= 0; i--){
402 double p = pValues[ordered_ps[i]];
403 double temp = p*numRows*pi0/(double)(i+1);
405 ordered_qs[i] = min(temp, ordered_qs[i+1]);
408 //re-distribute calculated qvalues to appropriate rows
409 for (int i = 0; i < numRows; i++){
410 qvalues[ordered_ps[i]] = ordered_qs[i];
415 }catch(exception& e) {
416 m->errorOut(e, "MothurMetastats", "calc_qvalues");
420 /***********************************************************/
421 int MothurMetastats::OrderPValues(int low, int high, vector<double>& p, vector<int>& order) {
427 int pivot = (low+high) / 2;
429 swapElements(low, pivot, p, order); //puts pivot in final spot
436 /* find member above ... */
437 while((i <= high) && (p[i] <= key)) { i++; }
439 /* find element below ... */
440 while((j >= low) && (p[j] > key)) { j--; }
443 swapElements(i, j, p, order);
447 swapElements(low, j, p, order);
450 OrderPValues(low, j-1, p, order);
451 OrderPValues(j+1, high, p, order);
456 }catch(exception& e) {
457 m->errorOut(e, "MothurMetastats", "OrderPValues");
461 /***********************************************************/
462 int MothurMetastats::swapElements(int i, int j, vector<double>& p, vector<int>& order) {
475 }catch(exception& e) {
476 m->errorOut(e, "MothurMetastats", "swapElements");
480 /***********************************************************/
481 double MothurMetastats::smoothSpline(vector<double>& x, vector<double>& y, int df) {
486 vector<double> w(n, 1);
487 double* xb = new double[n];
488 double* yb = new double[n];
489 double* wb = new double[n];
491 for (int i = 0; i < n; i++) {
494 yssw += (w[i] * y[i] * y[i]) - wb[i] * (yb[i] * yb[i]);
495 xb[i] = (x[i] - x[0]) / (x[n-1] - x[0]);
498 vector<double> knot = sknot1(xb, n);
499 int nk = knot.size() - 4;
501 double low = -1.5; double high = 1.5; double tol = 1e-04; double eps = 2e-08; int maxit = 500;
502 int ispar = 0; int icrit = 3; double dofoff = 3.0;
503 double penalty = 1.0;
504 int ld4 = 4; int isetup = 0; int ldnk = 1; int ier; double spar = 1.0; double crit;
506 double* knotb = new double[knot.size()];
507 double* coef1 = new double[nk];
508 double* lev1 = new double[n];
509 double* sz1 = new double[n];
510 for (int i = 0; i < knot.size(); i++) { knotb[i] = knot[i]; }
513 spline.sbart(&penalty, &dofoff, &xb[0], &yb[0], &wb[0], &yssw, &n, &knotb[0], &nk, &coef1[0], &sz1[0], &lev1[0], &crit,
514 &icrit, &spar, &ispar, &maxit, &low, &high, &tol, &eps, &isetup, &ld4, &ldnk, &ier);
516 result = coef1[nk-1];
529 }catch(exception& e) {
530 m->errorOut(e, "MothurMetastats", "smoothSpline");
534 /***********************************************************/
535 vector<double> MothurMetastats::sknot1(double* x, int n) {
537 vector<double> knots;
540 //R equivalent - rep(x[1L], 3L)
541 knots.push_back(x[0]); knots.push_back(x[0]); knots.push_back(x[0]);
543 //generate a sequence of nk equally spaced values from 1 to n. R equivalent = seq.int(1, n, length.out = nk)
544 vector<int> indexes = getSequence(0, n-1, nk);
545 for (int i = 0; i < indexes.size(); i++) { knots.push_back(x[indexes[i]]); }
547 //R equivalent - rep(x[n], 3L)
548 knots.push_back(x[n-1]); knots.push_back(x[n-1]); knots.push_back(x[n-1]);
552 }catch(exception& e) {
553 m->errorOut(e, "MothurMetastats", "sknot1");
557 /***********************************************************/
558 vector<int> MothurMetastats::getSequence(int start, int end, int length) {
560 vector<int> sequence;
561 double increment = (end-start) / (double) (length-1);
563 sequence.push_back(start);
564 for (int i = 1; i < length-1; i++) {
565 sequence.push_back(int(i*increment));
567 sequence.push_back(end);
571 }catch(exception& e) {
572 m->errorOut(e, "MothurMetastats", "getSequence");
576 /***********************************************************/
577 //not right, havent fully figured out the variable types yet...
578 int MothurMetastats::nkn(int n) {
581 if (n < 50) { return n; }
583 double a1 = log2(50);
584 double a2 = log2(100);
585 double a3 = log2(140);
586 double a4 = log2(200);
589 return (int)pow(2.0, (a1 + (a2 - a1) * (n - 50)/(float)150));
591 return (int)pow(2.0, (a2 + (a3 - a2) * (n - 200)/(float)600));
592 }else if (n < 3200) {
593 return (int)pow(2.0, (a3 + (a4 - a3) * (n - 800)/(float)2400));
595 return (int)pow((double)(200 + (n - 3200)), 0.2);
601 }catch(exception& e) {
602 m->errorOut(e, "MothurMetastats", "nkn");
606 /***********************************************************/