1 /* dist_dna.c 2011-02-18 */
3 /* Copyright 2005-2011 Emmanuel Paradis
5 /* This file is part of the R-package `ape'. */
6 /* See the file ../COPYING for licensing issues. */
9 #include <R_ext/Lapack.h>
11 /* from R: print(log(4), d = 22) */
12 #define LN4 1.386294361119890572454
14 /* returns 8 if the base is known surely, 0 otherwise */
15 #define KnownBase(a) (a & 8)
17 /* returns 1 if the base is adenine surely, 0 otherwise */
18 #define IsAdenine(a) a == 136
20 /* returns 1 if the base is guanine surely, 0 otherwise */
21 #define IsGuanine(a) a == 72
23 /* returns 1 if the base is cytosine surely, 0 otherwise */
24 #define IsCytosine(a) a == 40
26 /* returns 1 if the base is thymine surely, 0 otherwise */
27 #define IsThymine(a) a == 24
29 /* returns 1 if the base is a purine surely, 0 otherwise */
30 #define IsPurine(a) a > 63
32 /* returns 1 if the base is a pyrimidine surely, 0 otherwise */
33 #define IsPyrimidine(a) a < 64
35 /* returns 1 if both bases are different surely, 0 otherwise */
36 #define DifferentBase(a, b) (a & b) < 16
38 /* returns 1 if both bases are the same surely, 0 otherwise */
39 #define SameBase(a, b) KnownBase(a) && a == b
41 /* computes directly the determinant of a 4x4 matrix */
42 double detFourByFour(double *x)
44 double det, a33a44, a34a43, a34a42, a32a44, a32a43, a33a42, a34a41, a31a44, a31a43, a33a41, a31a42, a32a41;
46 a33a44 = x[10]*x[15]; a34a43 = x[14]*x[11];
47 a34a42 = x[14]*x[7]; a32a44 = x[6]*x[15];
48 a32a43 = x[6]*x[11]; a33a42 = x[10]*x[7];
49 a34a41 = x[14]*x[3]; a31a44 = x[2]*x[15];
50 a31a43 = x[2]*x[11]; a33a41 = x[10]*x[3];
51 a31a42 = x[2]*x[7]; a32a41 = x[6]*x[3];
53 det = x[0]*x[5]*(a33a44 - a34a43) + x[0]*x[9]*(a34a42 - a32a44) +
54 x[0]*x[13]*(a32a43 - a33a42) + x[4]*x[9]*(a31a44 - a34a41) +
55 x[4]*x[13]*(a33a41 - a31a43) + x[4]*x[1]*(a34a43 - a33a44) +
56 x[8]*x[13]*(a31a42 - a32a41) + x[8]*x[1]*(a32a44 - a34a42) +
57 x[8]*x[5]*(a34a41 - a31a44) + x[12]*x[1]*(a33a42 - a32a43) +
58 x[12]*x[5]*(a31a43 - a33a41) + x[12]*x[9]*(a32a41 - a31a42);
63 #define CHECK_PAIRWISE_DELETION\
64 if (KnownBase(x[s1]) && KnownBase(x[s2])) L++;\
68 if (SameBase(x[s1], x[s2])) continue;\
70 if (IsPurine(x[s1]) && IsPurine(x[s2])) {\
74 if (IsPyrimidine(x[s1]) && IsPyrimidine(x[s2])) Ns++;
76 void distDNA_TsTv(unsigned char *x, int *n, int *s, double *d, int Ts, int pairdel)
78 int i1, i2, s1, s2, target, Nd, Ns;
81 for (i1 = 1; i1 < *n; i1++) {
82 for (i2 = i1 + 1; i2 <= *n; i2++) {
84 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
85 if (pairdel && !(KnownBase(x[s1]) && KnownBase(x[s2]))) continue;
88 if (Ts) d[target] = ((double) Ns); /* output number of transitions */
89 else d[target] = ((double) Nd - Ns); /* output number of transversions */
95 void distDNA_raw(unsigned char *x, int *n, int *s, double *d, int scaled)
97 int i1, i2, s1, s2, target, Nd;
100 for (i1 = 1; i1 < *n; i1++) {
101 for (i2 = i1 + 1; i2 <= *n; i2++) {
103 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
104 if (DifferentBase(x[s1], x[s2])) Nd++;
105 if (scaled) d[target] = ((double) Nd / *s);
106 else d[target] = ((double) Nd);
112 void distDNA_raw_pairdel(unsigned char *x, int *n, int *s, double *d, int scaled)
114 int i1, i2, s1, s2, target, Nd, L;
117 for (i1 = 1; i1 < *n; i1++) {
118 for (i2 = i1 + 1; i2 <= *n; i2++) {
120 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
121 CHECK_PAIRWISE_DELETION
122 if (DifferentBase(x[s1], x[s2])) Nd++;
124 if (scaled) d[target] = ((double) Nd/L);
125 else d[target] = ((double) Nd);
131 #define COMPUTE_DIST_JC69\
132 p = ((double) Nd/L);\
134 d[target] = 0.75 * *alpha*(pow(1 - 4*p/3, -1/ *alpha) - 1);\
135 else d[target] = -0.75*log(1 - 4*p/3);\
137 if (*gamma) var[target] = p*(1 - p)/(pow(1 - 4*p/3, -2/(*alpha + 1)) * L);\
138 else var[target] = p*(1 - p)/(pow(1 - 4*p/3, 2)*L);\
141 void distDNA_JC69(unsigned char *x, int *n, int *s, double *d,
142 int *variance, double *var, int *gamma, double *alpha)
144 int i1, i2, s1, s2, target, Nd, L;
150 for (i1 = 1; i1 < *n; i1++) {
151 for (i2 = i1 + 1; i2 <= *n; i2++) {
153 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
154 if (DifferentBase(x[s1], x[s2])) Nd++;
161 void distDNA_JC69_pairdel(unsigned char *x, int *n, int *s, double *d,
162 int *variance, double *var, int *gamma, double *alpha)
164 int i1, i2, s1, s2, target, Nd, L;
168 for (i1 = 1; i1 < *n; i1++) {
169 for (i2 = i1 + 1; i2 <= *n; i2++) {
171 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
172 CHECK_PAIRWISE_DELETION
173 if (DifferentBase(x[s1], x[s2])) Nd++;
181 #define COMPUTE_DIST_K80\
182 P = ((double) Ns/L);\
183 Q = ((double) (Nd - Ns)/L);\
188 d[target] = *alpha * (pow(a1, b) + 0.5*pow(a2, b) - 1.5)/2;\
190 else d[target] = -0.5 * log(a1 * sqrt(a2));\
193 b = -(1 / *alpha + 1);\
202 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
205 void distDNA_K80(unsigned char *x, int *n, int *s, double *d,
206 int *variance, double *var, int *gamma, double *alpha)
208 int i1, i2, s1, s2, target, Nd, Ns, L;
209 double P, Q, a1, a2, b, c1, c2, c3;
214 for (i1 = 1; i1 < *n; i1++) {
215 for (i2 = i1 + 1; i2 <= *n; i2++) {
217 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
226 void distDNA_K80_pairdel(unsigned char *x, int *n, int *s, double *d,
227 int *variance, double *var, int *gamma, double *alpha)
229 int i1, i2, s1, s2, target, Nd, Ns, L;
230 double P, Q, a1, a2, b, c1, c2, c3;
233 for (i1 = 1; i1 < *n; i1++) {
234 for (i2 = i1 + 1; i2 <= *n; i2++) {
236 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
237 CHECK_PAIRWISE_DELETION
246 #define COMPUTE_DIST_F81\
247 p = ((double) Nd/L);\
248 if (*gamma) d[target] = E * *alpha * (pow(1 - p/E, -1/ *alpha) - 1);\
249 else d[target] = -E*log(1 - p/E);\
251 if (*gamma) var[target] = p*(1 - p)/(pow(1 - p/E, -2/(*alpha + 1)) * L);\
252 else var[target] = p*(1 - p)/(pow(1 - p/E, 2)*L);\
255 void distDNA_F81(unsigned char *x, int *n, int *s, double *d, double *BF,
256 int *variance, double *var, int *gamma, double *alpha)
258 int i1, i2, s1, s2, target, Nd, L;
262 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
265 for (i1 = 1; i1 < *n; i1++) {
266 for (i2 = i1 + 1; i2 <= *n; i2++) {
268 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
269 if (DifferentBase(x[s1], x[s2])) Nd++;
276 void distDNA_F81_pairdel(unsigned char *x, int *n, int *s, double *d, double *BF,
277 int *variance, double *var, int *gamma, double *alpha)
279 int i1, i2, s1, s2, target, Nd, L;
282 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
285 for (i1 = 1; i1 < *n; i1++) {
286 for (i2 = i1 + 1; i2 <= *n; i2++) {
288 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
289 CHECK_PAIRWISE_DELETION
290 if (DifferentBase(x[s1], x[s2])) Nd++;
298 #define COUNT_TS_TV1_TV2\
299 if (SameBase(x[s1], x[s2])) continue;\
301 if ((x[s1] | x[s2]) == 152 || (x[s1] | x[s2]) == 104) {\
305 if ((x[s1] | x[s2]) == 168 || (x[s1] | x[s2]) == 88) Nv2++;
308 #define COMPUTE_DIST_K81\
309 P = ((double) (Nd - Nv1 - Nv2)/L);\
310 Q = ((double) Nv1/L);\
311 R = ((double) Nv2/L);\
315 d[target] = -0.25*log(a1*a2*a3);\
317 a = (1/a1 + 1/a2)/2;\
318 b = (1/a1 + 1/a3)/2;\
319 c = (1/a2 + 1/a3)/2;\
320 var[target] = (a*a*P + b*b*Q + c*c*R - pow(a*P + b*Q + c*R, 2))/2;\
323 void distDNA_K81(unsigned char *x, int *n, int *s, double *d,
324 int *variance, double *var)
326 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
327 double P, Q, R, a1, a2, a3, a, b, c;
332 for (i1 = 1; i1 < *n; i1++) {
333 for (i2 = i1 + 1; i2 <= *n; i2++) {
335 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
344 void distDNA_K81_pairdel(unsigned char *x, int *n, int *s, double *d,
345 int *variance, double *var)
347 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
348 double P, Q, R, a1, a2, a3, a, b, c;
351 for (i1 = 1; i1 < *n; i1++) {
352 for (i2 = i1 + 1; i2 <= *n; i2++) {
353 Nd = Nv1 = Nv2 = L = 0;
354 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
355 CHECK_PAIRWISE_DELETION
364 #define PREPARE_BF_F84\
365 A = (BF[0]*BF[2])/(BF[0] + BF[2]) + (BF[1]*BF[3])/(BF[1] + BF[3]);\
366 B = BF[0]*BF[2] + BF[1]*BF[3];\
367 C = (BF[0] + BF[2])*(BF[1] + BF[3]);
369 #define COMPUTE_DIST_F84\
370 P = ((double) Ns/L);\
371 Q = ((double) (Nd - Ns)/L);\
372 d[target] = -2*A*log(1 - P/(2*A) - (A - B)*Q/(2*A*C)) + 2*(A - B - C)*log(1 - Q/(2*C));\
377 a = t1/(t1 - t2 - t3);\
378 b = A*(A - B)/(t1 - t2 - t3) - (A - B - C)/(C - Q/2);\
379 var[target] = (a*a*P + b*b*Q - pow(a*P + b*Q, 2))/L;\
382 void distDNA_F84(unsigned char *x, int *n, int *s, double *d,
383 double *BF, int *variance, double *var)
385 int i1, i2, Nd, Ns, L, target, s1, s2;
386 double P, Q, A, B, C, a, b, t1, t2, t3;
392 for (i1 = 1; i1 < *n; i1++) {
393 for (i2 = i1 + 1; i2 <= *n; i2++) {
395 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
404 void distDNA_F84_pairdel(unsigned char *x, int *n, int *s, double *d,
405 double *BF, int *variance, double *var)
407 int i1, i2, Nd, Ns, L, target, s1, s2;
408 double P, Q, A, B, C, a, b, t1, t2, t3;
413 for (i1 = 1; i1 < *n; i1++) {
414 for (i2 = i1 + 1; i2 <= *n; i2++) {
416 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
417 CHECK_PAIRWISE_DELETION
426 #define COMPUTE_DIST_T92\
427 P = ((double) Ns/L);\
428 Q = ((double) (Nd - Ns)/L);\
431 d[target] = -wg*log(a1) - 0.5*(1 - wg)*log(a2);\
435 c3 = wg*(c1 - c2) + c2;\
436 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
439 void distDNA_T92(unsigned char *x, int *n, int *s, double *d,
440 double *BF, int *variance, double *var)
442 int i1, i2, Nd, Ns, L, target, s1, s2;
443 double P, Q, wg, a1, a2, c1, c2, c3;
446 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
449 for (i1 = 1; i1 < *n; i1++) {
450 for (i2 = i1 + 1; i2 <= *n; i2++) {
452 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
461 void distDNA_T92_pairdel(unsigned char *x, int *n, int *s, double *d,
462 double *BF, int *variance, double *var)
464 int i1, i2, Nd, Ns, L, target, s1, s2;
465 double P, Q, wg, a1, a2, c1, c2, c3;
467 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
470 for (i1 = 1; i1 < *n; i1++) {
471 for (i2 = i1 + 1; i2 <= *n; i2++) {
473 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
474 CHECK_PAIRWISE_DELETION
483 /* returns 1 if one of the base is adenine and
484 the other one is guanine surely, 0 otherwise */
485 #define AdenineAndGuanine(a, b) (a | b) == 200
487 /* returns 1 if one of the base is cytosine and
488 the other one is thymine surely, 0 otherwise */
489 #define CytosineAndThymine(a, b) (a | b) == 56
491 #define PREPARE_BF_TN93\
494 k1 = 2 * BF[0] * BF[2] / gR;\
495 k2 = 2 * BF[1] * BF[3] / gY;\
496 k3 = 2 * (gR * gY - BF[0]*BF[2]*gY/gR - BF[1]*BF[3]*gR/gY);
498 #define COUNT_TS1_TS2_TV\
499 if (DifferentBase(x[s1], x[s2])) {\
501 if (AdenineAndGuanine(x[s1], x[s2])) {\
505 if (CytosineAndThymine(x[s1], x[s2])) Ns2++;\
508 #define COMPUTE_DIST_TN93\
509 P1 = ((double) Ns1/L);\
510 P2 = ((double) Ns2/L);\
511 Q = ((double) (Nd - Ns1 - Ns2)/L);\
512 w1 = 1 - P1/k1 - Q/(2*gR);\
513 w2 = 1 - P2/k2 - Q/(2*gY);\
514 w3 = 1 - Q/(2*gR*gY);\
516 k4 = 2*(BF[0]*BF[2] + BF[1]*BF[3] + gR*gY);\
521 c4 = k1*c1/(2*gR) + k2*c2/(2*gY) + k3*c3/(2*gR*gY);\
522 d[target] = *alpha * (k1*pow(w1, b) + k2*pow(w2, b) + k3*pow(w3, b) - k4);\
524 k4 = 2*((BF[0]*BF[0] + BF[2]*BF[2])/(2*gR*gR) + (BF[2]*BF[2] + BF[3]*BF[3])/(2*gY*gY));\
528 c4 = k1 * c1/(2 * gR) + k2 * c2/(2 * gY) + k4 * c3;\
529 d[target] = -k1*log(w1) - k2*log(w2) - k3*log(w3);\
532 var[target] = (c1*c1*P1 + c2*c2*P2 + c4*c4*Q - pow(c1*P1 + c2*P2 + c4*Q, 2))/L;
534 void distDNA_TN93(unsigned char *x, int *n, int *s, double *d,
535 double *BF, int *variance, double *var,
536 int *gamma, double *alpha)
538 int i1, i2, k, Nd, Ns1, Ns2, L, target, s1, s2;
539 double P1, P2, Q, A, B, C, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
546 for (i1 = 1; i1 < *n; i1++) {
547 for (i2 = i1 + 1; i2 <= *n; i2++) {
549 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
558 void distDNA_TN93_pairdel(unsigned char *x, int *n, int *s, double *d,
559 double *BF, int *variance, double *var,
560 int *gamma, double *alpha)
562 int i1, i2, k, Nd, Ns1, Ns2, L, target, s1, s2;
563 double P1, P2, Q, A, B, C, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
568 for (i1 = 1; i1 < *n; i1++) {
569 for (i2 = i1 + 1; i2 <= *n; i2++) {
570 Nd = Ns1 = Ns2 = L = 0;
571 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
572 CHECK_PAIRWISE_DELETION
581 void distDNA_GG95(unsigned char *x, int *n, int *s, double *d,
582 int *variance, double *var)
584 int i1, i2, s1, s2, target, GC, Nd, Ns, tl, npair;
585 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
592 npair = *n * (*n - 1) / 2;
594 theta = (double*)R_alloc(*n, sizeof(double));
595 P = (double*)R_alloc(npair, sizeof(double));
596 Q = (double*)R_alloc(npair, sizeof(double));
597 tstvr = (double*)R_alloc(npair, sizeof(double));
599 /* get the proportion of GC (= theta) in each sequence */
600 for (i1 = 1; i1 <= *n; i1++) {
602 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n)
603 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
604 theta[i1 - 1] = ((double) GC / *s);
607 /* get the proportions of transitions and transversions,
608 and the estimates of their ratio for each pair */
610 for (i1 = 1; i1 < *n; i1++) {
611 for (i2 = i1 + 1; i2 <= *n; i2++) {
613 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
616 P[target] = ((double) Ns / *s);
617 Q[target] = ((double) (Nd - Ns) / *s);
618 A = log(1 - 2*Q[target]);
619 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
624 /* compute the mean alpha (ma) = mean Ts/Tv */
627 for (i1 = 0; i1 < npair; i1++)
628 /* some values of tstvr are -Inf if there is no
629 transversions observed: we exclude them */
630 if (R_FINITE(tstvr[i1])) {
636 /* compute the distance for each pair */
638 for (i1 = 1; i1 < *n; i1++) {
639 for (i2 = i1 + 1; i2 <= *n; i2++) {
641 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
642 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
643 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
645 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A * *s);
651 void distDNA_GG95_pairdel(unsigned char *x, int *n, int *s, double *d,
652 int *variance, double *var)
654 int i1, i2, s1, s2, target, *L, length, GC, Nd, Ns, tl, npair;
655 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
663 npair = *n * (*n - 1) / 2;
665 theta = (double*)R_alloc(*n, sizeof(double));
666 L = (int*)R_alloc(npair, sizeof(int));
667 P = (double*)R_alloc(npair, sizeof(double));
668 Q = (double*)R_alloc(npair, sizeof(double));
669 tstvr = (double*)R_alloc(npair, sizeof(double));
671 /* get the proportion of GC (= theta) in each sequence */
672 for (i1 = 1; i1 <= *n; i1++) {
674 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n) {
675 if (KnownBase(x[s1])) tl++;
677 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
679 theta[i1 - 1] = ((double) GC / tl);
682 /* get the proportions of transitions and transversions,
683 and the estimates of their ratio for each pair; we
684 also get the sample size for each pair in L */
686 for (i1 = 1; i1 < *n; i1++) {
687 for (i2 = i1 + 1; i2 <= *n; i2++) {
688 Nd = Ns = L[target] = 0;
689 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
690 if (KnownBase(x[s1]) && KnownBase(x[s2])) L[target]++;
694 P[target] = ((double) Ns/L[target]);
695 Q[target] = ((double) (Nd - Ns)/L[target]);
696 A = log(1 - 2*Q[target]);
697 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
702 /* compute the mean alpha (ma) = mean Ts/Tv */
705 for (i1 = 0; i1 < npair; i1++)
706 /* some values of tstvr are -Inf if there is no
707 transversions observed: we exclude them */
708 if (R_FINITE(tstvr[i1])) {
714 /* compute the distance for each pair */
716 for (i1 = 1; i1 < *n; i1++) {
717 for (i2 = i1 + 1; i2 <= *n; i2++) {
719 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
720 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
721 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
723 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A*L[target]);
729 #define DO_CONTINGENCY_NUCLEOTIDES\
731 case 136 : m = 0; break;\
732 case 72 : m = 1; break;\
733 case 40 : m = 2; break;\
734 case 24 : m = 3; break;\
737 case 72 : m += 4; break;\
738 case 40 : m += 8; break;\
739 case 24 : m += 12; break;\
743 #define COMPUTE_DIST_LogDet\
744 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
745 d[target] = -log(detFourByFour(Ftab))/4 - LN4;\
747 /* For the inversion, we first make U an identity matrix */\
748 for (k = 1; k < 15; k++) U[k] = 0;\
749 U[0] = U[5] = U[10] = U[15] = 1;\
750 /* The matrix is not symmetric, so we use 'dgesv'. */\
751 /* This subroutine puts the result in U. */\
752 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
753 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
754 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
755 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
756 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
757 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
758 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
759 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
760 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] - 16)/(L*16);\
763 void distDNA_LogDet(unsigned char *x, int *n, int *s, double *d,
764 int *variance, double *var)
766 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
767 double Ftab[16], U[16];
772 for (i1 = 1; i1 < *n; i1++) {
773 for (i2 = i1 + 1; i2 <= *n; i2++) {
774 for (k = 0; k < 16; k++) Ntab[k] = 0;
775 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
776 DO_CONTINGENCY_NUCLEOTIDES
784 void distDNA_LogDet_pairdel(unsigned char *x, int *n, int *s, double *d,
785 int *variance, double *var)
787 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
788 double Ftab[16], U[16];
791 for (i1 = 1; i1 < *n; i1++) {
792 for (i2 = i1 + 1; i2 <= *n; i2++) {
793 for (k = 0; k < 16; k++) Ntab[k] = 0;
795 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
796 CHECK_PAIRWISE_DELETION
797 DO_CONTINGENCY_NUCLEOTIDES
805 void distDNA_BH87(unsigned char *x, int *n, int *s, double *d,
806 int *variance, double *var)
808 For the moment there is no need to check for pairwise deletions
809 since DO_CONTINGENCY_NUCLEOTIDES considers only the known nucleotides.
810 In effect the pairwise deletion has possibly been done before.
811 The sequence length(s) are used only to compute the variances, which is
812 currently not available.
815 int i1, i2, k, kb, s1, s2, m, Ntab[16], ROWsums[4], ndim = 4, info, ipiv[16];
816 double P12[16], P21[16], U[16];
818 for (i1 = 1; i1 < *n; i1++) {
819 for (i2 = i1 + 1; i2 <= *n; i2++) {
820 for (k = 0; k < 16; k++) Ntab[k] = 0;
821 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
822 DO_CONTINGENCY_NUCLEOTIDES
825 /* get the rowwise sums of Ntab */
826 ROWsums[0] = Ntab[0] + Ntab[4] + Ntab[8] + Ntab[12];
827 ROWsums[1] = Ntab[1] + Ntab[5] + Ntab[9] + Ntab[13];
828 ROWsums[2] = Ntab[2] + Ntab[6] + Ntab[10] + Ntab[14];
829 ROWsums[3] = Ntab[3] + Ntab[7] + Ntab[11] + Ntab[15];
831 for (k = 0; k < 16; k++)
832 P12[k] = ((double) Ntab[k]);
834 /* scale each element of P12 by its rowwise sum */
835 for (k = 0; k < 4; k++)
836 for (kb = 0; kb < 16; kb += 4)
837 P12[k + kb] = P12[k + kb]/ROWsums[k];
839 d[*n*(i2 - 1) + i1 - 1] = -log(detFourByFour(P12))/4;
841 /* compute the columnwise sums of Ntab: these
842 are the rowwise sums of its transpose */
843 ROWsums[0] = Ntab[0] + Ntab[1] + Ntab[2] + Ntab[3];
844 ROWsums[1] = Ntab[4] + Ntab[5] + Ntab[6] + Ntab[7];
845 ROWsums[2] = Ntab[8] + Ntab[9] + Ntab[10] + Ntab[11];
846 ROWsums[3] = Ntab[12] + Ntab[13] + Ntab[14] + Ntab[15];
848 /* transpose Ntab and store the result in P21 */
849 for (k = 0; k < 4; k++)
850 for (kb = 0; kb < 4; kb++)
851 P21[kb + 4*k] = Ntab[k + 4*kb];
854 for (k = 0; k < 4; k++)
855 for (kb = 0; kb < 16; kb += 4)
856 P21[k + kb] = P21[k + kb]/ROWsums[k];
858 d[*n*(i1 - 1) + i2 - 1] = -log(detFourByFour(P21))/4;
863 #define COMPUTE_DIST_ParaLin\
864 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
865 d[target] = -log(detFourByFour(Ftab)/\
866 sqrt(find[0][i1 - 1]*find[1][i1 - 1]*find[2][i1 - 1]*find[3][i1 - 1]*\
867 find[0][i2 - 1]*find[1][i2 - 1]*find[2][i2 - 1]*find[3][i2 - 1]))/4;\
869 /* For the inversion, we first make U an identity matrix */\
870 for (k = 1; k < 15; k++) U[k] = 0;\
871 U[0] = U[5] = U[10] = U[15] = 1;\
872 /* The matrix is not symmetric, so we use 'dgesv'. */\
873 /* This subroutine puts the result in U. */\
874 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
875 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
876 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
877 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
878 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
879 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
880 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
881 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
882 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] -\
883 4*(1/sqrt(find[0][i1 - 1]*find[0][i2 - 1]) +\
884 1/sqrt(find[1][i1 - 1]*find[1][i2 - 1]) +\
885 1/sqrt(find[2][i1 - 1]*find[2][i2 - 1]) +\
886 1/sqrt(find[3][i1 - 1]*find[3][i2 - 1])))/(L*16);\
889 void distDNA_ParaLin(unsigned char *x, int *n, int *s, double *d,
890 int *variance, double *var)
892 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
893 double Ftab[16], U[16], *find[4];
897 for (k = 0; k < 4; k++)
898 find[k] = (double*)R_alloc(*n, sizeof(double));
900 for (i1 = 0; i1 < *n; i1++)
901 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
903 for (i1 = 0; i1 < *n; i1++) {
904 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
906 case 136 : find[0][i1]++; break;
907 case 40 : find[1][i1]++; break;
908 case 72 : find[2][i1]++; break;
909 case 24 : find[3][i1]++; break;
912 for (k = 0; k < 4; k++) find[k][i1] /= L;
916 for (i1 = 1; i1 < *n; i1++) {
917 for (i2 = i1 + 1; i2 <= *n; i2++) {
918 for (k = 0; k < 16; k++) Ntab[k] = 0;
919 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
920 DO_CONTINGENCY_NUCLEOTIDES
928 void distDNA_ParaLin_pairdel(unsigned char *x, int *n, int *s, double *d,
929 int *variance, double *var)
931 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
932 double Ftab[16], U[16], *find[4];
936 for (k = 0; k < 4; k++)
937 find[k] = (double*)R_alloc(*n, sizeof(double));
939 for (i1 = 0; i1 < *n; i1++)
940 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
942 for (i1 = 0; i1 < *n; i1++) {
944 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
945 if (KnownBase(x[s1])) {
948 case 136 : find[0][i1]++; break;
949 case 40 : find[1][i1]++; break;
950 case 72 : find[2][i1]++; break;
951 case 24 : find[3][i1]++; break;
955 for (k = 0; k < 4; k++) find[k][i1] /= L;
959 for (i1 = 1; i1 < *n; i1++) {
960 for (i2 = i1 + 1; i2 <= *n; i2++) {
962 for (k = 0; k < 16; k++) Ntab[k] = 0;
963 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
964 CHECK_PAIRWISE_DELETION
965 DO_CONTINGENCY_NUCLEOTIDES
973 /* void BaseProportion(unsigned char *x, int *n, double *BF, int *freq) */
978 /* for (i = 0; i < *n; i++) { */
979 /* if (KnownBase(x[i])) { */
981 /* switch (x[i]) { */
982 /* case 136 : BF[0]++; break; */
983 /* case 40 : BF[1]++; break; */
984 /* case 72 : BF[2]++; break; */
985 /* case 24 : BF[3]++; break; */
989 /* if (! *freq) for (i = 0; i < 4; i++) BF[i] /= m; */
992 void BaseProportion(unsigned char *x, int *n, double *BF)
996 for (i = 0; i < *n; i++) {
998 case 136 : BF[0]++; break;
999 case 40 : BF[1]++; break;
1000 case 72 : BF[2]++; break;
1001 case 24 : BF[3]++; break;
1002 case 192 : BF[4]++; break;
1003 case 160 : BF[5]++; break;
1004 case 144 : BF[6]++; break;
1005 case 96 : BF[7]++; break;
1006 case 80 : BF[8]++; break;
1007 case 48 : BF[9]++; break;
1008 case 224 : BF[10]++; break;
1009 case 176 : BF[11]++; break;
1010 case 208 : BF[12]++; break;
1011 case 112 : BF[13]++; break;
1012 case 240 : BF[14]++; break;
1013 case 4 : BF[15]++; break;
1014 case 2 : BF[16]++; break;
1019 void SegSites(unsigned char *x, int *n, int *s, int *seg)
1022 unsigned char basis;
1024 for (j = 0; j < *s; j++) {
1026 while (!KnownBase(x[i])) i++;
1029 while (i < *n * (j + 1)) {
1030 if (!KnownBase(x[i]) || x[i] == basis) i++;
1039 void GlobalDeletionDNA(unsigned char *x, int *n, int *s, int *keep)
1043 for (j = 0; j < *s; j++) {
1045 while (i < *n * (j + 1)) {
1046 if (KnownBase(x[i])) i++;
1055 void dist_dna(unsigned char *x, int *n, int *s, int *model, double *d,
1056 double *BF, int *pairdel, int *variance, double *var,
1057 int *gamma, double *alpha)
1060 case 1 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 1);
1061 else distDNA_raw(x, n, s, d, 1); break;
1062 case 2 : if (pairdel) distDNA_JC69_pairdel(x, n, s, d, variance, var, gamma, alpha);
1063 else distDNA_JC69(x, n, s, d, variance, var, gamma, alpha); break;
1064 case 3 : if (pairdel) distDNA_K80_pairdel(x, n, s, d, variance, var, gamma, alpha);
1065 else distDNA_K80(x, n, s, d, variance, var, gamma, alpha); break;
1066 case 4 : if (pairdel) distDNA_F81_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1067 else distDNA_F81(x, n, s, d, BF, variance, var, gamma, alpha); break;
1068 case 5 : if (pairdel) distDNA_K81_pairdel(x, n, s, d, variance, var);
1069 else distDNA_K81(x, n, s, d, variance, var); break;
1070 case 6 : if (pairdel) distDNA_F84_pairdel(x, n, s, d, BF, variance, var);
1071 else distDNA_F84(x, n, s, d, BF, variance, var); break;
1072 case 7 : if (pairdel) distDNA_T92_pairdel(x, n, s, d, BF, variance, var);
1073 else distDNA_T92(x, n, s, d, BF, variance, var); break;
1074 case 8 : if (pairdel) distDNA_TN93_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1075 else distDNA_TN93(x, n, s, d, BF, variance, var, gamma, alpha); break;
1076 case 9 : if (pairdel) distDNA_GG95_pairdel(x, n, s, d, variance, var);
1077 else distDNA_GG95(x, n, s, d, variance, var); break;
1078 case 10 : if (pairdel) distDNA_LogDet_pairdel(x, n, s, d, variance, var);
1079 else distDNA_LogDet(x, n, s, d, variance, var); break;
1080 case 11 : distDNA_BH87(x, n, s, d, variance, var); break;
1081 case 12 : if (pairdel) distDNA_ParaLin_pairdel(x, n, s, d, variance, var);
1082 else distDNA_ParaLin(x, n, s, d, variance, var); break;
1083 case 13 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 0);
1084 else distDNA_raw(x, n, s, d, 0); break;
1085 case 14 : if (pairdel) distDNA_TsTv(x, n, s, d, 1, 1);
1086 else distDNA_TsTv(x, n, s, d, 1, 0); break;
1087 case 15 : if (pairdel) distDNA_TsTv(x, n, s, d, 0, 1);
1088 else distDNA_TsTv(x, n, s, d, 0, 0); break;