1 /* mlphylo.c 2008-06-18 */
3 /* Copyright 2006-2008 Emmanuel Paradis */
5 /* This file is part of the R-package `ape'. */
6 /* See the file ../COPYING for licensing issues. */
10 #include <R_ext/Applic.h>
11 #include <R_ext/Lapack.h>
53 void tQ_unbalBF(double *BF, double *P)
54 /* This function computes the rate matrix Q multiplied by
55 time t in the case of unbalanced base frequencies.
57 BF: the base frequencies
58 P: (input) the matrix of substitution rates
60 NOTE: P must already be multiplied by t */
62 P[1] *= BF[0]; P[2] *= BF[0]; P[3] *= BF[0];
63 P[4] *= BF[1]; P[6] *= BF[1]; P[7] *= BF[1];
64 P[8] *= BF[2]; P[9] *= BF[2]; P[11] *= BF[2];
65 P[12] *= BF[3]; P[13] *= BF[3]; P[14] *= BF[3];
67 P[0] = -P[4] - P[8] - P[12];
68 P[5] = -P[1] - P[9] - P[13];
69 P[10] = -P[2] - P[6] - P[14];
70 P[15] = -P[3] - P[7] - P[11];
73 void mat_expo4x4(double *P)
74 /* This function computes the exponential of a 4x4 matrix */
76 double U[16], vl[4], WR[4], Uinv[16], WI[4], work[32];
77 int i, info, ipiv[16], n = 4, lw = 32, ord[4];
78 char yes = 'V', no = 'N';
80 /* The matrix is not symmetric, so we use 'dgeev'. */
81 /* We take the real part of the eigenvalues -> WR */
82 /* and the right eigenvectors (vr) -> U */
83 F77_CALL(dgeev)(&no, &yes, &n, P, &n, WR, WI, vl, &n,
84 U, &n, work, &lw, &info);
86 /* It is not necessary to sort the eigenvalues... */
88 for (i = 0; i < 16; i++) P[i] = U[i];
90 /* For the inversion, we first make Uinv an identity matrix */
91 for (i = 1; i < 15; i++) Uinv[i] = 0;
92 Uinv[0] = Uinv[5] = Uinv[10] = Uinv[15] = 1;
94 /* The matrix is not symmetric, so we use 'dgesv'. */
95 /* This subroutine puts the result in Uinv (B) */
96 /* (P [= U] is erased) */
97 F77_CALL(dgesv)(&n, &n, P, &n, ipiv, Uinv, &n, &info);
99 /* The matrix product of U with the eigenvalues diagonal matrix: */
100 for (i = 0; i < 4; i++) U[i] *= exp(WR[0]);
101 for (i = 4; i < 8; i++) U[i] *= exp(WR[1]);
102 for (i = 8; i < 12; i++) U[i] *= exp(WR[2]);
103 for (i = 12; i < 16; i++) U[i] *= exp(WR[3]);
105 /* The second matrix product with U^-1 */
106 P[1] = U[1]*Uinv[0] + U[5]*Uinv[1] + U[9]*Uinv[2] + U[13]*Uinv[3];
107 P[2] = U[2]*Uinv[0] + U[6]*Uinv[1] + U[10]*Uinv[2] + U[14]*Uinv[3];
108 P[3] = U[3]*Uinv[0] + U[7]*Uinv[1] + U[11]*Uinv[2] + U[15]*Uinv[3];
109 P[4] = U[0]*Uinv[4] + U[4]*Uinv[5] + U[8]*Uinv[6] + U[12]*Uinv[7];
110 P[6] = U[2]*Uinv[4] + U[6]*Uinv[5] + U[10]*Uinv[6] + U[14]*Uinv[7];
111 P[7] = U[3]*Uinv[4] + U[7]*Uinv[5] + U[11]*Uinv[6] + U[15]*Uinv[7];
112 P[8] = U[0]*Uinv[8] + U[4]*Uinv[9] + U[8]*Uinv[10] + U[12]*Uinv[11];
113 P[9] = U[1]*Uinv[8] + U[5]*Uinv[9] + U[9]*Uinv[10] + U[13]*Uinv[11];
114 P[11] = U[3]*Uinv[8] + U[7]*Uinv[9] + U[11]*Uinv[10] + U[15]*Uinv[11];
115 P[12] = U[0]*Uinv[12] + U[4]*Uinv[13] + U[8]*Uinv[14] + U[12]*Uinv[15];
116 P[13] = U[1]*Uinv[12] + U[5]*Uinv[13] + U[9]*Uinv[14] + U[13]*Uinv[15];
117 P[14] = U[2]*Uinv[12] + U[6]*Uinv[13] + U[10]*Uinv[14] + U[14]*Uinv[15];
118 P[0] = 1 - P[4] - P[8] - P[12];
119 P[5] = 1 - P[1] - P[9] - P[13];
120 P[10] = 1 - P[2] - P[6] - P[14];
121 P[15] = 1 - P[3] - P[7] - P[11];
124 void PMAT_JC69(double t, double u, double *P)
126 P[1]=P[2]=P[3]=P[4]=P[6]=P[7]=P[8]=P[9]=P[11]=P[12]=P[13]=P[14]=(1 - exp(-4*u*t))/4;
127 P[0] = P[5] = P[10] = P[15] = 1 - 3*P[1];
130 void PMAT_K80(double t, double b, double a, double *P)
135 p = exp(-2*t/(R + 1));
137 P[1] = 0.5*(1 - p); /* A -> C */
138 P[2] = 0.25 - 0.5*exp(-t*(2*R + 1)/(R + 1)) + 0.25*p; /* A -> G */
139 P[0] = P[5] = P[10] = P[15] = 1 - 2*P[1] - P[2];
140 P[3] = P[4] = P[6] = P[11] = P[9] = P[12] = P[14] = P[1];
141 P[7] = P[8] = P[13] = P[2];
144 void PMAT_F81(double t, double u, double *BF, double *P)
149 P[0] = p + (1 - p) * BF[0]; /* A->A */
150 P[1] = P[9] = P[13] = (1 - p)*BF[1]; /* A->C, G->C, T->C */
151 P[2] = P[6] = P[14] = (1 - p)*BF[2]; /* A->G, C->G, T->G */
152 P[3] = P[7] = P[11] = (1 - p)*BF[3]; /* A->T, C->T, G->T */
153 P[4] = P[8] = P[12] = (1 - p)*BF[0]; /* C->A, G->A, T->A */
154 P[5] = p + P[1]; /* C->C */
155 P[10] = p + P[2]; /* G->G */
156 P[15] = p + P[3]; /* T->T */
159 void PMAT_F84(double t, double a, double b, double *BF, double *P)
161 double pI, pII, B, x, y;
164 pI = B * (1 - exp(-a*t)); /* probability of at least one event of type I */
165 pII = 1 - B; /* probability of at least one event of type II */
166 x = pI * (BF[0] + BF[2]);
167 y = pI * (BF[1] + BF[3]);
169 P[12] = P[4] = pII * BF[0]; /* C->A, T->A */
170 P[14] = P[6] = pII * BF[2]; /* C->G, T->G */
171 P[9] = P[1] = pII * BF[1]; /* A->C, G->C */
172 P[2] = x + P[6]; /* A->G */
173 P[11] = P[3] = pII * BF[3]; /* A->T, G->T*/
174 P[0] = 1 - P[1] - P[2] - P[3]; /* A->A */
175 P[7] = y + P[3]; /* C->T */
176 P[5] = 1 - P[4] - P[6] - P[7]; /* C->C */
177 P[8] = x + P[4]; /* G->A */
178 P[10] = 1 - P[8] - P[9] - P[11]; /* G->G */
179 P[13] = y + P[1]; /* T->C */
180 P[15] = 1 - P[12] - P[13] - P[14]; /* T->T */
183 void PMAT_HKY85(double t, double a, double b, double *BF, double *P)
185 P[2] = P[7] = P[8] = P[13] = t*a;
186 P[1] = P[3] = P[4] = P[6] = P[9] = P[11] = P[12] = P[14] = t*b;
191 void PMAT_T92(double t, double a, double b, double *BF, double *P)
193 double theta, A, B1, B2, C, x, y;
195 theta = BF[1] + BF[2];
199 C = exp(-t * ((a/b + 1)/2));
200 x = 0.5 * theta * B1;
203 P[1] = P[6] = P[9] = P[14] = 0.5 * theta * B2; /* A->C, C->G, T->G, G->C */
204 P[2] = P[13] = x - theta * C; /* A->G, T->C */
205 P[3] = P[4] = P[11] = P[12] = A * B2; /* A->T, C->A, G->T, T->A */
206 P[0] = P[15] = 1 - P[1] - P[2] - P[3]; /* A->A, T->T */
207 P[5] = P[10] = x + y; /* C->C, G->G */
208 P[7] = P[8] = A * B1 - y; /* C->T, G->A */
211 void PMAT_TN93(double t, double a1, double a2, double b,
212 double *BF, double *P)
217 A1 = (1 - exp(-a1*t)); /* transitions between purines (A <-> G) */
218 A2 = (1 - exp(-a2*t)); /* transitions between pyrimidines (C <-> T) */
221 P[1] = B * BF[1]; /* A->C */
222 P[2] = A1 * BF[2]; /* A->G */
223 P[3] = B * BF[3]; /* A->T */
224 P[0] = 1 - P[1] - P[2] - P[3]; /* A->A */
225 P[4] = B * BF[0]; /* C->A */
226 P[6] = B * BF[2]; /* C->G */
227 P[7] = A2 * BF[3] ; /* C->T */
228 P[5] = 1 - P[4] - P[6] - P[7]; /* C->C */
229 P[8] = A1 * BF[0]; /* G->A */
230 P[9] = B * BF[1]; /* G->C */
231 P[11] = B * BF[3]; /* G->T */
232 P[10] = 1 - P[8] - P[9] - P[11]; /* G->G */
233 P[12] = B * BF[0]; /* T->A */
234 P[13] = A2 * BF[1]; /* T->C */
235 P[14] = B * BF[2]; /* T->G */
236 P[15] = 1 - P[12] - P[13] - P[14]; /* T->T */
239 void PMAT_GTR(double t, double a, double b, double c, double d, double e,
240 double f, double *BF, double *P)
252 #define GET_DNA_PARAMETERS \
253 /* get the substitution parameters */ \
254 model = *(D->MOD.model); \
255 /* If the model is not JC69 or F81: */ \
256 if (model != 1 && model != 3) { \
257 for(i = 0; i < *(D->MOD.npara); i++) \
258 u[i] = D->MOD.para[i]; \
260 /* get the shape parameter and calculate the coefficients */ \
261 ncat = *(D->MOD.ncat); \
263 /* use `tmp[0]' to store the mean of the coefficients */ \
264 /* in order to rescale them */ \
266 if (*(D->MOD.nalpha) > 1) alpha = *(D->MOD.alpha); \
267 else alpha = D->MOD.alpha[k]; \
268 for (j = 0; j < ncat; j++) { \
269 coef_gamma[j] = qgamma((0.5 + j)/ncat, alpha, \
271 tmp[0] += coef_gamma[j]; \
274 for (j = 0; j < ncat; j++) \
275 coef_gamma[j] /= tmp[0]; \
276 } else coef_gamma[0] = 1.; \
277 /* get the proportion of invariants */ \
278 if (*(D->MOD.ninvar)) { \
279 if (*(D->MOD.ninvar) > 1) I = *(D->MOD.invar); \
280 else I = D->MOD.invar[k]; \
283 void getSiteLik(int n, int d, int j, int nr, DNAdata *D, double *L)
289 memset(L, 0, 4*sizeof(double));
290 if (D->X.seq[i] & 128) L[0] = 1; /* if A or else */
291 if (D->X.seq[i] & 32) L[1] = 1; /* if C or else */
292 if (D->X.seq[i] & 64) L[2] = 1; /* if G or else */
293 if (D->X.seq[i] & 16) L[3] = 1; /* if T or else */
295 i = (d - n) + j*(n - 2);
297 L[1] = D->X.anc[i + nr];
298 L[2] = D->X.anc[i + 2*nr];
299 L[3] = D->X.anc[i + 3*nr];
303 #define LOOP_THROUGH_SITES \
304 for(j = start; j <= end; j++) { \
305 memset(tmp, 0, 4*sizeof(double)); \
306 getSiteLik(n, d1, j, nr, D, L1); \
307 getSiteLik(n, d2, j, nr, D, L2); \
308 for(i = 0; i < ncat; i++) { \
310 case 1 : PMAT_JC69(l1, coef_gamma[i], P1); \
311 PMAT_JC69(l2, coef_gamma[i], P2); break; \
312 case 2 : PMAT_K80(l1, coef_gamma[i], u[0], P1); \
313 PMAT_K80(l2, coef_gamma[i], u[0], P2); break; \
314 case 3 : PMAT_F81(l1, coef_gamma[i], BF, P1); \
315 PMAT_F81(l2, coef_gamma[i], BF, P2); break; \
316 case 4 : PMAT_F84(l1, coef_gamma[i], u[0], BF, P1); \
317 PMAT_F84(l2, coef_gamma[i], u[0], BF, P2); break; \
318 case 5 : PMAT_HKY85(l1, coef_gamma[i], u[0], BF, P1); \
319 PMAT_HKY85(l2, coef_gamma[i], u[0], BF, P2); break; \
320 case 6 : PMAT_T92(l1, coef_gamma[i], u[0], BF, P1); \
321 PMAT_T92(l2, coef_gamma[i], u[0], BF, P2); break; \
322 case 7 : PMAT_TN93(l1, coef_gamma[i], u[0], u[1], BF, P1); \
323 PMAT_TN93(l2, coef_gamma[i], u[0], u[1], BF, P2); break; \
324 case 8 : PMAT_GTR(l1, coef_gamma[i], u[0], u[1], u[2], u[3], u[4], BF, P1); \
325 PMAT_GTR(l2, coef_gamma[i], u[0], u[1], u[2], u[3], u[4], BF, P2); break; \
327 tmp[0] += (L1[0]*P1[0] + L1[1]*P1[1] + L1[2]*P1[2] + L1[3]*P1[3]) * \
328 (L2[0]*P2[0] + L2[1]*P2[1] + L2[2]*P2[2] + L2[3]*P2[3]); \
329 tmp[1] += (L1[0]*P1[4] + L1[1]*P1[5] + L1[2]*P1[6] + L1[3]*P1[7]) * \
330 (L2[0]*P2[4] + L2[1]*P2[5] + L2[2]*P2[6] + L2[3]*P2[7]); \
331 tmp[2] += (L1[0]*P1[8] + L1[1]*P1[9] + L1[2]*P1[10] + L1[3]*P1[11]) * \
332 (L2[0]*P2[8] + L2[1]*P2[9] + L2[2]*P2[10] + L2[3]*P2[11]); \
333 tmp[3] += (L1[0]*P1[12] + L1[1]*P1[13] + L1[2]*P1[14] + L1[3]*P1[15]) * \
334 (L2[0]*P2[12] + L2[1]*P2[13] + L2[2]*P2[14] + L2[3]*P2[15]); \
342 if (D->MOD.ninvar) { \
344 tmp[0] = V*tmp[0] + I*L1[0]*L2[0]; \
345 tmp[1] = V*tmp[1] + I*L1[1]*L2[1]; \
346 tmp[2] = V*tmp[2] + I*L1[2]*L2[2]; \
347 tmp[3] = V*tmp[3] + I*L1[3]*L2[3]; \
349 ind = anc - n + j*(n - 2); \
350 D->X.anc[ind] = tmp[0]; \
351 D->X.anc[ind + nr] = tmp[1]; \
352 D->X.anc[ind + 2*nr] = tmp[2]; \
353 D->X.anc[ind + 3*nr] = tmp[3]; \
356 void lik_dna_node(DNAdata *D, int ie)
358 This function computes the likelihoods at a node for all
362 int d1, d2, anc, n, nr;
363 int i, j, k, start, end, ind, i1, i2, ncat, model;
364 double tmp[4], L1[4], L2[4], l1, l2, P1[16], P2[16], V, coef_gamma[10], u[6], I, alpha, *BF;
367 nr = *(D->X.s) * (n - 2);
370 d1 = D->PHY.edge2[ie];
371 d2 = D->PHY.edge2[ie + 1];
372 anc = D->PHY.edge1[ie];
374 /* change these to use them as indices */
378 l2 = D->PHY.el[ie + 1];
380 for(k = 0; k < *(D->MOD.npart); k++) {
381 start = D->MOD.partition[k*2] - 1;
382 end = D->MOD.partition[k*2 + 1] - 1;
387 l1 *= D->MOD.xi[k - 1];
388 l2 *= D->MOD.xi[k - 1];
395 void lik_dna_root(DNAdata *D)
397 This function computes the likelihoods at the root for all
401 int i, j, k, start, end, ind, ncat, model, d1, d2, n, N, nr;
402 double tmp[4], L1[4], L2[4], el, P[16], V, coef_gamma[10], u[6], I, alpha, *BF;
404 n = *(D->X.n); /* number of tips */
405 N = 2*n - 3; /* number of edges */
406 nr = *(D->X.s) * (n - 2);
409 d1 = D->PHY.edge1[N - 1]; /* it's the root */
410 d2 = D->PHY.edge2[N - 1];
412 /* change these to use them as indices */
415 el = D->PHY.el[N - 1];
417 for(k = 0; k < *(D->MOD.npart); k++) {
418 start = D->MOD.partition[k*2] - 1;
419 end = D->MOD.partition[k*2 + 1] - 1;
423 if (k > 0) el *= D->MOD.xi[k - 1];
425 for(j = start; j <= end; j++) {
426 getSiteLik(n, d1, j, nr, D, L1);
427 getSiteLik(n, d2, j, nr, D, L2);
428 memset(tmp, 0, 4*sizeof(double));
429 for(i = 0; i < ncat; i++) {
431 case 1 : PMAT_JC69(el, coef_gamma[i], P); break;
432 case 2 : PMAT_K80(el, coef_gamma[i], u[0], P); break;
433 case 3 : PMAT_F81(el, coef_gamma[i], BF, P); break;
434 case 4 : PMAT_F84(el, coef_gamma[i], u[0], BF, P); break;
435 case 5 : PMAT_HKY85(el, coef_gamma[i], u[0], BF, P); break;
436 case 6 : PMAT_T92(el, coef_gamma[i], u[0], BF, P); break;
437 case 7 : PMAT_TN93(el, coef_gamma[i], u[0], u[1], BF, P); break;
438 case 8 : PMAT_GTR(el, coef_gamma[i], u[0], u[1], u[2], u[3], u[4], BF, P); break;
440 tmp[0] += (L2[0]*P[0] + L2[1]*P[1] + L2[2]*P[2] + L2[3]*P[3]);
441 tmp[1] += (L2[0]*P[4] + L2[1]*P[5] + L2[2]*P[6] + L2[3]*P[7]);
442 tmp[2] += (L2[0]*P[8] + L2[1]*P[9] + L2[2]*P[10] + L2[3]*P[11]);
443 tmp[3] += (L2[0]*P[12] + L2[1]*P[13] + L2[2]*P[14] + L2[3]*P[15]);
451 if (*(D->MOD.ninvar)) {
453 tmp[0] = V*tmp[0] + I*L2[0]*L1[0];
454 tmp[1] = V*tmp[1] + I*L2[1]*L1[1];
455 tmp[2] = V*tmp[2] + I*L2[2]*L1[2];
456 tmp[3] = V*tmp[3] + I*L2[3]*L1[3];
459 D->X.anc[ind] = tmp[0]*L1[0];
460 D->X.anc[ind + nr] = tmp[1]*L1[1];
461 D->X.anc[ind + 2*nr] = tmp[2]*L1[2];
462 D->X.anc[ind + 3*nr] = tmp[3]*L1[3];
467 void lik_dna_tree(DNAdata *D, double *loglik)
469 int i, j, n, nnode, nsite, nr;
477 /* loop through the tree
478 We don't do the root node here, so i goes between 0 and 2n - 5 */
479 for(i = 0; i < 2*n - 4; i += 2) lik_dna_node(D, i);
481 /* We now do the root */
484 for(j = 0; j < nsite; j++) {
486 for (i = 0; i < 4; i++)
487 tmp += D->BF[i] * D->X.anc[j*nnode + i*nr];
488 *loglik += D->X.w[j]*log(tmp);
492 double fcn_mlphylo_invar(double I, info *INFO)
496 INFO->D->MOD.invar[INFO->i] = I;
497 lik_dna_tree(INFO->D, &loglik);
502 void mlphylo_invar(int N, DNAdata *D, double *loglik)
504 optimize proportion of invariants
514 for(i = 0; i < N; i++) {
516 I = Brent_fmin(0.0, 1.0,
517 (double (*)(double, void*)) fcn_mlphylo_invar,
523 double fcn_mlphylo_gamma(double a, info *INFO)
527 INFO->D->MOD.alpha[INFO->i] = a;
528 lik_dna_tree(INFO->D, &loglik);
533 void mlphylo_gamma(int N, DNAdata *D, double *loglik)
535 optimize gamma (ISV) parameters
545 for(i = 0; i < N; i++) {
547 a = Brent_fmin(0.0, 1.e4,
548 (double (*)(double, void*)) fcn_mlphylo_gamma,
554 double fcn_mlphylo_para(double p, info *INFO)
558 INFO->D->MOD.para[INFO->i] = p;
559 lik_dna_tree(INFO->D, &loglik);
564 void mlphylo_para(int N, DNAdata *D, double *loglik)
566 optimize the contrast parameter(s) xi
576 for(i = 0; i < N; i++) {
578 p = Brent_fmin(0, 1.e3,
579 (double (*)(double, void*)) fcn_mlphylo_para,
585 double fcn_mlphylo_xi(double XI, info *INFO)
589 INFO->D->MOD.xi[INFO->i] = XI;
590 lik_dna_tree(INFO->D, &loglik);
595 void mlphylo_xi(int N, DNAdata *D, double *loglik)
597 optimize the contrast parameter(s) xi
607 /* In the following, the range of the search algo was changed from */
608 /* 0-1000 to 0-100 to avoid infinite looping. (2006-04-15) */
609 /* This was changed again to 0-20. (2006-07-17) */
611 for(i = 0; i < N; i++) {
613 XI = Brent_fmin(0.0, 2.e1,
614 (double (*)(double, void*)) fcn_mlphylo_xi,
620 double fcn_mlphylo_edgelength(double l, info *INFO)
624 INFO->D->PHY.el[INFO->i] = l;
625 lik_dna_tree(INFO->D, &loglik);
630 void mlphylo_edgelength(int N, DNAdata *D, double *loglik)
632 optimize branch lengths
642 for(i = 0; i < N; i++) {
644 l = Brent_fmin(0.0, 0.1,
645 (double (*)(double, void*)) fcn_mlphylo_edgelength,
651 void mlphylo_DNAmodel(int *n, int *s, unsigned char *SEQ, double *ANC,
652 double *w, int *edge1, int *edge2,
653 double *edge_length, int *npart, int *partition,
654 int *model, double *xi, double *para, int *npara,
655 double *alpha, int *nalpha, int *ncat,
656 double *invar, int *ninvar, double *BF,
657 int *search_tree, int *fixed, double *loglik)
659 This function iterates to find the MLEs of the substitution
660 paramaters and of the branch lengths for a given tree.
673 D->PHY.edge1 = edge1;
674 D->PHY.edge2 = edge2;
675 D->PHY.el = edge_length;
677 D->MOD.npart = npart;
678 D->MOD.partition = partition;
679 D->MOD.model = model;
682 D->MOD.npara = npara;
683 D->MOD.alpha = alpha;
684 D->MOD.nalpha = nalpha;
686 D->MOD.invar = invar;
687 D->MOD.ninvar = ninvar;
691 lik_dna_tree(D, loglik);
693 if (*npart > 1) mlphylo_xi(*npart - 1, D, loglik);
694 if (*npara) mlphylo_para(*npara, D, loglik);
695 if (*nalpha) mlphylo_gamma(*nalpha, D, loglik);
696 if (*ninvar) mlphylo_invar(*ninvar, D, loglik);
697 lik_dna_tree(D, loglik);
700 } /* mlphylo_DNAmodel */
702 void jc69(double *P, double *t, double *u)
704 PMAT_JC69(*t, *u, P);
707 void k80(double *P, double *t, double *u)
709 PMAT_K80(*t, u[0], u[1], P);