1 /* dist_dna.c 2008-01-19 */
3 /* Copyright 2005-2008 Emmanuel Paradis
5 /* This file is part of the R-package `ape'. */
6 /* See the file ../COPYING for licensing issues. */
9 #include <R_ext/Lapack.h>
11 /* from R: print(log(4), d = 22) */
12 #define LN4 1.386294361119890572454
14 /* returns 8 if the base is known surely, 0 otherwise */
15 #define KnownBase(a) a & 8
17 /* returns 1 if the base is adenine surely, 0 otherwise */
18 #define IsAdenine(a) a == 136
20 /* returns 1 if the base is guanine surely, 0 otherwise */
21 #define IsGuanine(a) a == 72
23 /* returns 1 if the base is cytosine surely, 0 otherwise */
24 #define IsCytosine(a) a == 40
26 /* returns 1 if the base is thymine surely, 0 otherwise */
27 #define IsThymine(a) a == 24
29 /* returns 1 if the base is a purine surely, 0 otherwise */
30 #define IsPurine(a) a > 63
32 /* returns 1 if the base is a pyrimidine surely, 0 otherwise */
33 #define IsPyrimidine(a) a < 64
35 /* returns 1 if both bases are different surely, 0 otherwise */
36 #define DifferentBase(a, b) (a & b) < 16
38 /* returns 1 if both bases are the same surely, 0 otherwise */
39 #define SameBase(a, b) KnownBase(a) && a == b
41 /* computes directly the determinant of a 4x4 matrix */
42 double detFourByFour(double *x)
44 double det, a33a44, a34a43, a34a42, a32a44, a32a43, a33a42, a34a41, a31a44, a31a43, a33a41, a31a42, a32a41;
46 a33a44 = x[10]*x[15]; a34a43 = x[14]*x[11];
47 a34a42 = x[14]*x[7]; a32a44 = x[6]*x[15];
48 a32a43 = x[6]*x[11]; a33a42 = x[10]*x[7];
49 a34a41 = x[14]*x[3]; a31a44 = x[2]*x[15];
50 a31a43 = x[2]*x[11]; a33a41 = x[10]*x[3];
51 a31a42 = x[2]*x[7]; a32a41 = x[6]*x[3];
53 det = x[0]*x[5]*(a33a44 - a34a43) + x[0]*x[9]*(a34a42 - a32a44) +
54 x[0]*x[13]*(a32a43 - a33a42) + x[4]*x[9]*(a31a44 - a34a41) +
55 x[4]*x[13]*(a33a41 - a31a43) + x[4]*x[1]*(a34a43 - a33a44) +
56 x[8]*x[13]*(a31a42 - a32a41) + x[8]*x[1]*(a32a44 - a34a42) +
57 x[8]*x[5]*(a34a41 - a31a44) + x[12]*x[1]*(a33a42 - a32a43) +
58 x[12]*x[5]*(a31a43 - a33a41) + x[12]*x[9]*(a32a41 - a31a42);
63 #define CHECK_PAIRWISE_DELETION\
64 if (KnownBase(x[s1]) && KnownBase(x[s2])) L++;\
67 void distDNA_raw(unsigned char *x, int *n, int *s, double *d)
69 int i1, i2, s1, s2, target, Nd;
72 for (i1 = 1; i1 < *n; i1++) {
73 for (i2 = i1 + 1; i2 <= *n; i2++) {
75 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
76 if (DifferentBase(x[s1], x[s2])) Nd++;
77 d[target] = ((double) Nd / *s);
83 void distDNA_raw_pairdel(unsigned char *x, int *n, int *s, double *d)
85 int i1, i2, s1, s2, target, Nd, L;
88 for (i1 = 1; i1 < *n; i1++) {
89 for (i2 = i1 + 1; i2 <= *n; i2++) {
91 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
92 CHECK_PAIRWISE_DELETION
93 if (DifferentBase(x[s1], x[s2])) Nd++;
95 d[target] = ((double) Nd/L);
101 #define COMPUTE_DIST_JC69\
102 p = ((double) Nd/L);\
104 d[target] = 0.75 * *alpha*(pow(1 - 4*p/3, -1/ *alpha) - 1);\
105 else d[target] = -0.75*log(1 - 4*p/3);\
107 if (*gamma) var[target] = p*(1 - p)/(pow(1 - 4*p/3, -2/(*alpha + 1)) * L);\
108 else var[target] = p*(1 - p)/(pow(1 - 4*p/3, 2)*L);\
111 void distDNA_JC69(unsigned char *x, int *n, int *s, double *d,
112 int *variance, double *var, int *gamma, double *alpha)
114 int i1, i2, s1, s2, target, Nd, L;
120 for (i1 = 1; i1 < *n; i1++) {
121 for (i2 = i1 + 1; i2 <= *n; i2++) {
123 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
124 if (DifferentBase(x[s1], x[s2])) Nd++;
131 void distDNA_JC69_pairdel(unsigned char *x, int *n, int *s, double *d,
132 int *variance, double *var, int *gamma, double *alpha)
134 int i1, i2, s1, s2, target, Nd, L;
138 for (i1 = 1; i1 < *n; i1++) {
139 for (i2 = i1 + 1; i2 <= *n; i2++) {
141 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
142 CHECK_PAIRWISE_DELETION
143 if (DifferentBase(x[s1], x[s2])) Nd++;
152 if (SameBase(x[s1], x[s2])) continue;\
154 if (IsPurine(x[s1]) && IsPurine(x[s2])) {\
158 if (IsPyrimidine(x[s1]) && IsPyrimidine(x[s2])) Ns++;
160 #define COMPUTE_DIST_K80\
161 P = ((double) Ns/L);\
162 Q = ((double) (Nd - Ns)/L);\
167 d[target] = *alpha * (pow(a1, b) + 0.5*pow(a2, b) - 1.5)/2;\
169 else d[target] = -0.5 * log(a1 * sqrt(a2));\
172 b = -(1 / *alpha + 1);\
181 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
184 void distDNA_K80(unsigned char *x, int *n, int *s, double *d,
185 int *variance, double *var, int *gamma, double *alpha)
187 int i1, i2, s1, s2, target, Nd, Ns, L;
188 double P, Q, a1, a2, b, c1, c2, c3;
193 for (i1 = 1; i1 < *n; i1++) {
194 for (i2 = i1 + 1; i2 <= *n; i2++) {
196 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
205 void distDNA_K80_pairdel(unsigned char *x, int *n, int *s, double *d,
206 int *variance, double *var, int *gamma, double *alpha)
208 int i1, i2, s1, s2, target, Nd, Ns, L;
209 double P, Q, a1, a2, b, c1, c2, c3;
212 for (i1 = 1; i1 < *n; i1++) {
213 for (i2 = i1 + 1; i2 <= *n; i2++) {
215 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
216 CHECK_PAIRWISE_DELETION
225 #define COMPUTE_DIST_F81\
226 p = ((double) Nd/L);\
227 if (*gamma) d[target] = E * *alpha * (pow(1 - p/E, -1/ *alpha) - 1);\
228 else d[target] = -E*log(1 - p/E);\
230 if (*gamma) var[target] = p*(1 - p)/(pow(1 - p/E, -2/(*alpha + 1)) * L);\
231 else var[target] = p*(1 - p)/(pow(1 - p/E, 2)*L);\
234 void distDNA_F81(unsigned char *x, int *n, int *s, double *d, double *BF,
235 int *variance, double *var, int *gamma, double *alpha)
237 int i1, i2, s1, s2, target, Nd, L;
241 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
244 for (i1 = 1; i1 < *n; i1++) {
245 for (i2 = i1 + 1; i2 <= *n; i2++) {
247 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
248 if (DifferentBase(x[s1], x[s2])) Nd++;
255 void distDNA_F81_pairdel(unsigned char *x, int *n, int *s, double *d, double *BF,
256 int *variance, double *var, int *gamma, double *alpha)
258 int i1, i2, s1, s2, target, Nd, L;
261 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
264 for (i1 = 1; i1 < *n; i1++) {
265 for (i2 = i1 + 1; i2 <= *n; i2++) {
267 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
268 CHECK_PAIRWISE_DELETION
269 if (DifferentBase(x[s1], x[s2])) Nd++;
277 #define COUNT_TS_TV1_TV2\
278 if (SameBase(x[s1], x[s2])) continue;\
280 if ((x[s1] | x[s2]) == 152 || (x[s1] | x[s2]) == 104) {\
284 if ((x[s1] | x[s2]) == 168 || (x[s1] | x[s2]) == 88) Nv2++;
287 #define COMPUTE_DIST_K81\
288 P = ((double) (Nd - Nv1 - Nv2)/L);\
289 Q = ((double) Nv1/L);\
290 R = ((double) Nv2/L);\
294 d[target] = -0.25*log(a1*a2*a3);\
296 a = (1/a1 + 1/a2)/2;\
297 b = (1/a1 + 1/a3)/2;\
298 c = (1/a2 + 1/a3)/2;\
299 var[target] = (a*a*P + b*b*Q + c*c*R - pow(a*P + b*Q + c*R, 2))/2;\
302 void distDNA_K81(unsigned char *x, int *n, int *s, double *d,
303 int *variance, double *var)
305 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
306 double P, Q, R, a1, a2, a3, a, b, c;
311 for (i1 = 1; i1 < *n; i1++) {
312 for (i2 = i1 + 1; i2 <= *n; i2++) {
314 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
323 void distDNA_K81_pairdel(unsigned char *x, int *n, int *s, double *d,
324 int *variance, double *var)
326 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
327 double P, Q, R, a1, a2, a3, a, b, c;
330 for (i1 = 1; i1 < *n; i1++) {
331 for (i2 = i1 + 1; i2 <= *n; i2++) {
332 Nd = Nv1 = Nv2 = L = 0;
333 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
334 CHECK_PAIRWISE_DELETION
343 #define PREPARE_BF_F84\
344 A = (BF[0]*BF[2])/(BF[0] + BF[2]) + (BF[1]*BF[3])/(BF[1] + BF[3]);\
345 B = BF[0]*BF[2] + BF[1]*BF[3];\
346 C = (BF[0] + BF[2])*(BF[1] + BF[3]);
348 #define COMPUTE_DIST_F84\
349 P = ((double) Ns/L);\
350 Q = ((double) (Nd - Ns)/L);\
351 d[target] = -2*A*log(1 - (P/(2*A) - (A - B)*Q/(2*A*C))) + 2*(A - B - C)*log(1 - Q/(2*C));\
356 a = t1/(t1 - t2 - t3);\
357 b = A*(A - B)/(t1 - t2 - t3) - (A - B - C)/(C - Q/2);\
358 var[target] = (a*a*P + b*b*Q - pow(a*P + b*Q, 2))/2;\
361 void distDNA_F84(unsigned char *x, int *n, int *s, double *d,
362 double *BF, int *variance, double *var)
364 int i1, i2, Nd, Ns, L, target, s1, s2;
365 double P, Q, A, B, C, a, b, t1, t2, t3;
371 for (i1 = 1; i1 < *n; i1++) {
372 for (i2 = i1 + 1; i2 <= *n; i2++) {
374 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
383 void distDNA_F84_pairdel(unsigned char *x, int *n, int *s, double *d,
384 double *BF, int *variance, double *var)
386 int i1, i2, Nd, Ns, L, target, s1, s2;
387 double P, Q, A, B, C, a, b, t1, t2, t3;
392 for (i1 = 1; i1 < *n; i1++) {
393 for (i2 = i1 + 1; i2 <= *n; i2++) {
395 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
396 CHECK_PAIRWISE_DELETION
405 #define COMPUTE_DIST_T92\
406 P = ((double) Ns/L);\
407 Q = ((double) (Nd - Ns)/L);\
410 d[target] = -wg*log(a1) - 0.5*(1 - wg)*log(a2);\
414 c3 = wg*(c1 - c2) + c2;\
415 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
418 void distDNA_T92(unsigned char *x, int *n, int *s, double *d,
419 double *BF, int *variance, double *var)
421 int i1, i2, Nd, Ns, L, target, s1, s2;
422 double P, Q, wg, a1, a2, c1, c2, c3;
425 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
428 for (i1 = 1; i1 < *n; i1++) {
429 for (i2 = i1 + 1; i2 <= *n; i2++) {
431 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
440 void distDNA_T92_pairdel(unsigned char *x, int *n, int *s, double *d,
441 double *BF, int *variance, double *var)
443 int i1, i2, Nd, Ns, L, target, s1, s2;
444 double P, Q, wg, a1, a2, c1, c2, c3;
446 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
449 for (i1 = 1; i1 < *n; i1++) {
450 for (i2 = i1 + 1; i2 <= *n; i2++) {
452 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
453 CHECK_PAIRWISE_DELETION
462 /* returns 1 if one of the base is adenine and
463 the other one is guanine surely, 0 otherwise */
464 #define AdenineAndGuanine(a, b) (a | b) == 200
466 /* returns 1 if one of the base is cytosine and
467 the other one is thymine surely, 0 otherwise */
468 #define CytosineAndThymine(a, b) (a | b) == 56
470 #define PREPARE_BF_TN93\
473 k1 = 2 * BF[0] * BF[2] / gR;\
474 k2 = 2 * BF[1] * BF[3] / gY;\
475 k3 = 2 * (gR * gY - BF[0]*BF[2]*gY/gR - BF[1]*BF[3]*gR/gY);
477 #define COUNT_TS1_TS2_TV\
478 if (DifferentBase(x[s1], x[s2])) {\
480 if (AdenineAndGuanine(x[s1], x[s2])) {\
484 if (CytosineAndThymine(x[s1], x[s2])) Ns2++;\
487 #define COMPUTE_DIST_TN93\
488 P1 = ((double) Ns1/L);\
489 P2 = ((double) Ns2/L);\
490 Q = ((double) (Nd - Ns1 - Ns2)/L);\
491 w1 = 1 - P1/k1 - Q/(2*gR);\
492 w2 = 1 - P2/k2 - Q/(2*gY);\
493 w3 = 1 - Q/(2*gR*gY);\
495 k4 = 2*(BF[0]*BF[2] + BF[1]*BF[3] + gR*gY);\
500 c4 = k1*c1/(2*gR) + k2*c2/(2*gY) + k3*c3/(2*gR*gY);\
501 d[target] = *alpha * (k1*pow(w1, b) + k2*pow(w2, b) + k3*pow(w3, b) - k4);\
503 k4 = 2*((BF[0]*BF[0] + BF[2]*BF[2])/(2*gR*gR) + (BF[2]*BF[2] + BF[3]*BF[3])/(2*gY*gY));\
507 c4 = k1 * c1/(2 * gR) + k2 * c2/(2 * gY) + k4 * c3;\
508 d[target] = -k1*log(w1) - k2*log(w2) - k3*log(w3);\
511 var[target] = (c1*c1*P1 + c2*c2*P2 + c4*c4*Q - pow(c1*P1 + c2*P2 + c4*Q, 2))/L;
513 void distDNA_TN93(unsigned char *x, int *n, int *s, double *d,
514 double *BF, int *variance, double *var,
515 int *gamma, double *alpha)
517 int i1, i2, k, Nd, Ns1, Ns2, L, target, s1, s2;
518 double P1, P2, Q, A, B, C, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
525 for (i1 = 1; i1 < *n; i1++) {
526 for (i2 = i1 + 1; i2 <= *n; i2++) {
528 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
537 void distDNA_TN93_pairdel(unsigned char *x, int *n, int *s, double *d,
538 double *BF, int *variance, double *var,
539 int *gamma, double *alpha)
541 int i1, i2, k, Nd, Ns1, Ns2, L, target, s1, s2;
542 double P1, P2, Q, A, B, C, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
547 for (i1 = 1; i1 < *n; i1++) {
548 for (i2 = i1 + 1; i2 <= *n; i2++) {
549 Nd = Ns1 = Ns2 = L = 0;
550 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
551 CHECK_PAIRWISE_DELETION
560 void distDNA_GG95(unsigned char *x, int *n, int *s, double *d,
561 int *variance, double *var)
563 int i1, i2, s1, s2, target, GC, Nd, Ns, tl, npair;
564 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
571 npair = *n * (*n - 1) / 2;
573 theta = (double*)R_alloc(*n, sizeof(double));
574 P = (double*)R_alloc(npair, sizeof(double));
575 Q = (double*)R_alloc(npair, sizeof(double));
576 tstvr = (double*)R_alloc(npair, sizeof(double));
578 /* get the proportion of GC (= theta) in each sequence */
579 for (i1 = 1; i1 <= *n; i1++) {
581 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n)
582 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
583 theta[i1 - 1] = ((double) GC / *s);
586 /* get the proportions of transitions and transversions,
587 and the estimates of their ratio for each pair */
589 for (i1 = 1; i1 < *n; i1++) {
590 for (i2 = i1 + 1; i2 <= *n; i2++) {
592 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
595 P[target] = ((double) Ns / *s);
596 Q[target] = ((double) (Nd - Ns) / *s);
597 A = log(1 - 2*Q[target]);
598 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
603 /* compute the mean alpha (ma) = mean Ts/Tv */
606 for (i1 = 0; i1 < npair; i1++)
607 /* some values of tstvr are -Inf if there is no
608 transversions observed: we exclude them */
609 if (R_FINITE(tstvr[i1])) {
615 /* compute the distance for each pair */
617 for (i1 = 1; i1 < *n; i1++) {
618 for (i2 = i1 + 1; i2 <= *n; i2++) {
620 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
621 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
622 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
624 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A * *s);
630 void distDNA_GG95_pairdel(unsigned char *x, int *n, int *s, double *d,
631 int *variance, double *var)
633 int i1, i2, s1, s2, target, *L, length, GC, Nd, Ns, tl, npair;
634 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
642 npair = *n * (*n - 1) / 2;
644 theta = (double*)R_alloc(*n, sizeof(double));
645 L = (int*)R_alloc(npair, sizeof(int));
646 P = (double*)R_alloc(npair, sizeof(double));
647 Q = (double*)R_alloc(npair, sizeof(double));
648 tstvr = (double*)R_alloc(npair, sizeof(double));
650 /* get the proportion of GC (= theta) in each sequence */
651 for (i1 = 1; i1 <= *n; i1++) {
653 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n) {
654 if (KnownBase(x[s1])) tl++;
656 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
658 theta[i1 - 1] = ((double) GC / tl);
661 /* get the proportions of transitions and transversions,
662 and the estimates of their ratio for each pair; we
663 also get the sample size for each pair in L */
665 for (i1 = 1; i1 < *n; i1++) {
666 for (i2 = i1 + 1; i2 <= *n; i2++) {
667 Nd = Ns = L[target] = 0;
668 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
669 if (KnownBase(x[s1]) && KnownBase(x[s2])) L[target]++;
673 P[target] = ((double) Ns/L[target]);
674 Q[target] = ((double) (Nd - Ns)/L[target]);
675 A = log(1 - 2*Q[target]);
676 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
681 /* compute the mean alpha (ma) = mean Ts/Tv */
684 for (i1 = 0; i1 < npair; i1++)
685 /* some values of tstvr are -Inf if there is no
686 transversions observed: we exclude them */
687 if (R_FINITE(tstvr[i1])) {
693 /* compute the distance for each pair */
695 for (i1 = 1; i1 < *n; i1++) {
696 for (i2 = i1 + 1; i2 <= *n; i2++) {
698 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
699 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
700 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
702 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A*L[target]);
708 #define DO_CONTINGENCY_NUCLEOTIDES\
710 case 136 : m = 0; break;\
711 case 72 : m = 1; break;\
712 case 40 : m = 2; break;\
713 case 24 : m = 3; break;\
716 case 72 : m += 4; break;\
717 case 40 : m += 8; break;\
718 case 24 : m += 12; break;\
722 #define COMPUTE_DIST_LogDet\
723 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
724 d[target] = (-log(detFourByFour(Ftab))/4 - LN4)/4;\
726 /* For the inversion, we first make U an identity matrix */\
727 for (k = 1; k < 15; k++) U[k] = 0;\
728 U[0] = U[5] = U[10] = U[15] = 1;\
729 /* The matrix is not symmetric, so we use 'dgesv'. */\
730 /* This subroutine puts the result in U. */\
731 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
732 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
733 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
734 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
735 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
736 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
737 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
738 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
739 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] - 16)/(L*16);\
742 void distDNA_LogDet(unsigned char *x, int *n, int *s, double *d,
743 int *variance, double *var)
745 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
746 double Ftab[16], U[16];
751 for (i1 = 1; i1 < *n; i1++) {
752 for (i2 = i1 + 1; i2 <= *n; i2++) {
753 for (k = 0; k < 16; k++) Ntab[k] = 0;
754 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
755 DO_CONTINGENCY_NUCLEOTIDES
763 void distDNA_LogDet_pairdel(unsigned char *x, int *n, int *s, double *d,
764 int *variance, double *var)
766 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
767 double Ftab[16], U[16];
770 for (i1 = 1; i1 < *n; i1++) {
771 for (i2 = i1 + 1; i2 <= *n; i2++) {
772 for (k = 0; k < 16; k++) Ntab[k] = 0;
774 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
775 CHECK_PAIRWISE_DELETION
776 DO_CONTINGENCY_NUCLEOTIDES
784 void distDNA_BH87(unsigned char *x, int *n, int *s, double *d,
785 int *variance, double *var)
787 For the moment there is no need to check for pairwise deletions
788 since DO_CONTINGENCY_NUCLEOTIDES considers only the known nucleotides.
789 In effect the pairwise deletion has possibly been done before.
790 The sequence length(s) are used only to compute the variances, which is
791 currently not available.
794 int i1, i2, k, kb, s1, s2, m, Ntab[16], ROWsums[4], ndim = 4, info, ipiv[16];
795 double P12[16], P21[16], U[16];
797 for (i1 = 1; i1 < *n; i1++) {
798 for (i2 = i1 + 1; i2 <= *n; i2++) {
799 for (k = 0; k < 16; k++) Ntab[k] = 0;
800 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
801 DO_CONTINGENCY_NUCLEOTIDES
804 /* get the rowwise sums of Ntab */
805 ROWsums[0] = Ntab[0] + Ntab[4] + Ntab[8] + Ntab[12];
806 ROWsums[1] = Ntab[1] + Ntab[5] + Ntab[9] + Ntab[13];
807 ROWsums[2] = Ntab[2] + Ntab[6] + Ntab[10] + Ntab[14];
808 ROWsums[3] = Ntab[3] + Ntab[7] + Ntab[11] + Ntab[15];
810 for (k = 0; k < 16; k++)
811 P12[k] = ((double) Ntab[k]);
813 /* scale each element of P12 by its rowwise sum */
814 for (k = 0; k < 4; k++)
815 for (kb = 0; kb < 16; kb += 4)
816 P12[k + kb] = P12[k + kb]/ROWsums[k];
818 d[*n*(i2 - 1) + i1 - 1] = -log(detFourByFour(P12))/4;
820 /* compute the columnwise sums of Ntab: these
821 are the rowwise sums of its transpose */
822 ROWsums[0] = Ntab[0] + Ntab[1] + Ntab[2] + Ntab[3];
823 ROWsums[1] = Ntab[4] + Ntab[5] + Ntab[6] + Ntab[7];
824 ROWsums[2] = Ntab[8] + Ntab[9] + Ntab[10] + Ntab[11];
825 ROWsums[3] = Ntab[12] + Ntab[13] + Ntab[14] + Ntab[15];
827 /* transpose Ntab and store the result in P21 */
828 for (k = 0; k < 4; k++)
829 for (kb = 0; kb < 4; kb++)
830 P21[kb + 4*k] = Ntab[k + 4*kb];
833 for (k = 0; k < 4; k++)
834 for (kb = 0; kb < 16; kb += 4)
835 P21[k + kb] = P21[k + kb]/ROWsums[k];
837 d[*n*(i1 - 1) + i2 - 1] = -log(detFourByFour(P21))/4;
842 #define COMPUTE_DIST_ParaLin\
843 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
844 d[target] = -log(detFourByFour(Ftab)/\
845 sqrt(find[0][i1 - 1]*find[1][i1 - 1]*find[2][i1 - 1]*find[3][i1 - 1]*\
846 find[0][i2 - 1]*find[1][i2 - 1]*find[2][i2 - 1]*find[3][i2 - 1]))/4;\
848 /* For the inversion, we first make U an identity matrix */\
849 for (k = 1; k < 15; k++) U[k] = 0;\
850 U[0] = U[5] = U[10] = U[15] = 1;\
851 /* The matrix is not symmetric, so we use 'dgesv'. */\
852 /* This subroutine puts the result in U. */\
853 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
854 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
855 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
856 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
857 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
858 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
859 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
860 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
861 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] -\
862 4*(1/sqrt(find[0][i1 - 1]*find[0][i2 - 1]) +\
863 1/sqrt(find[1][i1 - 1]*find[1][i2 - 1]) +\
864 1/sqrt(find[2][i1 - 1]*find[2][i2 - 1]) +\
865 1/sqrt(find[3][i1 - 1]*find[3][i2 - 1])))/(L*16);\
868 void distDNA_ParaLin(unsigned char *x, int *n, int *s, double *d,
869 int *variance, double *var)
871 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
872 double Ftab[16], U[16], *find[4];
876 for (k = 0; k < 4; k++)
877 find[k] = (double*)R_alloc(*n, sizeof(double));
879 for (i1 = 0; i1 < *n; i1++)
880 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
882 for (i1 = 0; i1 < *n; i1++) {
883 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
885 case 136 : find[0][i1]++; break;
886 case 40 : find[1][i1]++; break;
887 case 72 : find[2][i1]++; break;
888 case 24 : find[3][i1]++; break;
891 for (k = 0; k < 4; k++) find[k][i1] /= L;
895 for (i1 = 1; i1 < *n; i1++) {
896 for (i2 = i1 + 1; i2 <= *n; i2++) {
897 for (k = 0; k < 16; k++) Ntab[k] = 0;
898 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
899 DO_CONTINGENCY_NUCLEOTIDES
907 void distDNA_ParaLin_pairdel(unsigned char *x, int *n, int *s, double *d,
908 int *variance, double *var)
910 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
911 double Ftab[16], U[16], *find[4];
915 for (k = 0; k < 4; k++)
916 find[k] = (double*)R_alloc(*n, sizeof(double));
918 for (i1 = 0; i1 < *n; i1++)
919 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
921 for (i1 = 0; i1 < *n; i1++) {
923 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
924 if (KnownBase(x[s1])) {
927 case 136 : find[0][i1]++; break;
928 case 40 : find[1][i1]++; break;
929 case 72 : find[2][i1]++; break;
930 case 24 : find[3][i1]++; break;
934 for (k = 0; k < 4; k++) find[k][i1] /= L;
938 for (i1 = 1; i1 < *n; i1++) {
939 for (i2 = i1 + 1; i2 <= *n; i2++) {
941 for (k = 0; k < 16; k++) Ntab[k] = 0;
942 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
943 CHECK_PAIRWISE_DELETION
944 DO_CONTINGENCY_NUCLEOTIDES
952 void BaseProportion(unsigned char *x, int *n, double *BF)
957 for (i = 0; i < *n; i++) {
958 if (KnownBase(x[i])) {
961 case 136 : BF[0]++; break;
962 case 40 : BF[1]++; break;
963 case 72 : BF[2]++; break;
964 case 24 : BF[3]++; break;
968 for (i = 0; i < 4; i++) BF[i] /= m;
971 void SegSites(unsigned char *x, int *n, int *s, int *seg)
976 for (j = 0; j < *s; j++) {
978 while (!KnownBase(x[i])) i++;
981 while (i < *n * (j + 1)) {
982 if (x[i] == basis) i++;
991 void GlobalDeletionDNA(unsigned char *x, int *n, int *s, int *keep)
995 for (j = 0; j < *s; j++) {
997 while (i < *n * (j + 1)) {
998 if (KnownBase(x[i])) i++;
1007 void dist_dna(unsigned char *x, int *n, int *s, int *model, double *d,
1008 double *BF, int *pairdel, int *variance, double *var,
1009 int *gamma, double *alpha)
1012 case 1 : if (pairdel) distDNA_raw_pairdel(x, n, s, d);
1013 else distDNA_raw(x, n, s, d); break;
1015 case 2 : if (pairdel) distDNA_JC69_pairdel(x, n, s, d, variance, var, gamma, alpha);
1016 else distDNA_JC69(x, n, s, d, variance, var, gamma, alpha); break;
1018 case 3 : if (pairdel) distDNA_K80_pairdel(x, n, s, d, variance, var, gamma, alpha);
1019 else distDNA_K80(x, n, s, d, variance, var, gamma, alpha); break;
1021 case 4 : if (pairdel) distDNA_F81_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1022 else distDNA_F81(x, n, s, d, BF, variance, var, gamma, alpha); break;
1024 case 5 : if (pairdel) distDNA_K81_pairdel(x, n, s, d, variance, var);
1025 else distDNA_K81(x, n, s, d, variance, var); break;
1027 case 6 : if (pairdel) distDNA_F84_pairdel(x, n, s, d, BF, variance, var);
1028 else distDNA_F84(x, n, s, d, BF, variance, var); break;
1030 case 7 : if (pairdel) distDNA_T92_pairdel(x, n, s, d, BF, variance, var);
1031 else distDNA_T92(x, n, s, d, BF, variance, var); break;
1033 case 8 : if (pairdel) distDNA_TN93_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1034 else distDNA_TN93(x, n, s, d, BF, variance, var, gamma, alpha); break;
1036 case 9 : if (pairdel) distDNA_GG95_pairdel(x, n, s, d, variance, var);
1037 else distDNA_GG95(x, n, s, d, variance, var); break;
1039 case 10 : if (pairdel) distDNA_LogDet_pairdel(x, n, s, d, variance, var);
1040 else distDNA_LogDet(x, n, s, d, variance, var); break;
1042 case 11 : distDNA_BH87(x, n, s, d, variance, var); break;
1044 case 12 : if (pairdel) distDNA_ParaLin_pairdel(x, n, s, d, variance, var);
1045 else distDNA_ParaLin(x, n, s, d, variance, var); break;