1 /* dist_dna.c 2012-02-14 */
3 /* Copyright 2005-2012 Emmanuel Paradis
5 /* This file is part of the R-package `ape'. */
6 /* See the file ../COPYING for licensing issues. */
8 #include <R_ext/Lapack.h>
11 /* from R: print(log(4), d = 22) */
12 #define LN4 1.386294361119890572454
14 /* returns 8 if the base is known surely, 0 otherwise */
15 #define KnownBase(a) (a & 8)
17 /* returns 1 if the base is adenine surely, 0 otherwise */
18 #define IsAdenine(a) a == 136
20 /* returns 1 if the base is guanine surely, 0 otherwise */
21 #define IsGuanine(a) a == 72
23 /* returns 1 if the base is cytosine surely, 0 otherwise */
24 #define IsCytosine(a) a == 40
26 /* returns 1 if the base is thymine surely, 0 otherwise */
27 #define IsThymine(a) a == 24
29 /* returns 1 if the base is a purine surely, 0 otherwise */
30 #define IsPurine(a) a > 63
32 /* returns 1 if the base is a pyrimidine surely, 0 otherwise */
33 #define IsPyrimidine(a) a < 64
35 /* returns 1 if both bases are different surely, 0 otherwise */
36 #define DifferentBase(a, b) (a & b) < 16
38 /* returns 1 if both bases are the same surely, 0 otherwise */
39 #define SameBase(a, b) KnownBase(a) && a == b
41 /* computes directly the determinant of a 4x4 matrix */
42 double detFourByFour(double *x)
44 double det, a33a44, a34a43, a34a42, a32a44, a32a43, a33a42, a34a41, a31a44, a31a43, a33a41, a31a42, a32a41;
46 a33a44 = x[10]*x[15]; a34a43 = x[14]*x[11];
47 a34a42 = x[14]*x[7]; a32a44 = x[6]*x[15];
48 a32a43 = x[6]*x[11]; a33a42 = x[10]*x[7];
49 a34a41 = x[14]*x[3]; a31a44 = x[2]*x[15];
50 a31a43 = x[2]*x[11]; a33a41 = x[10]*x[3];
51 a31a42 = x[2]*x[7]; a32a41 = x[6]*x[3];
53 det = x[0]*x[5]*(a33a44 - a34a43) + x[0]*x[9]*(a34a42 - a32a44) +
54 x[0]*x[13]*(a32a43 - a33a42) + x[4]*x[9]*(a31a44 - a34a41) +
55 x[4]*x[13]*(a33a41 - a31a43) + x[4]*x[1]*(a34a43 - a33a44) +
56 x[8]*x[13]*(a31a42 - a32a41) + x[8]*x[1]*(a32a44 - a34a42) +
57 x[8]*x[5]*(a34a41 - a31a44) + x[12]*x[1]*(a33a42 - a32a43) +
58 x[12]*x[5]*(a31a43 - a33a41) + x[12]*x[9]*(a32a41 - a31a42);
63 #define CHECK_PAIRWISE_DELETION\
64 if (KnownBase(x[s1]) && KnownBase(x[s2])) L++;\
68 if (SameBase(x[s1], x[s2])) continue;\
70 if (IsPurine(x[s1]) && IsPurine(x[s2])) {\
74 if (IsPyrimidine(x[s1]) && IsPyrimidine(x[s2])) Ns++;
76 void distDNA_indel(unsigned char *x, int *n, int *s, double *d)
78 int i1, i2, s1, s2, target, N;
81 for (i1 = 1; i1 < *n; i1++) {
82 for (i2 = i1 + 1; i2 <= *n; i2++) {
85 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
86 if ((x[s1] ^ x[s2]) & 4) N++;
88 d[target] = ((double) N);
94 #define X(i, j) i - 1 + *n * (j - 1)
96 #define DINDEX(i, j) n * (i - 1) - i*(i - 1)/2 + j - i - 1
98 int give_index(int i, int j, int n)
100 if (i > j) return(DINDEX(j, i));
101 else return(DINDEX(i, j));
104 void distDNA_indelblock(unsigned char *x, int *n, int *s, double *d)
106 int i1, i2, s1, s2, target, N, start_block, end_block;
108 for (i1 = 1; i1 <= *n; i1++) {
110 /* find a block of one or more '-' */
114 if (x[X(i1, s1)] == 4) {
116 while (x[X(i1, s1)] == 4) s1++;
119 /* then look if the same block is present in all the other sequences */
121 for (i2 = 1; i2 <= *n; i2++) {
122 if (i2 == i1) continue;
124 target = give_index(i1, i2, *n);
126 if (start_block > 1) {
127 if (x[X(i2, start_block - 1)] == 4) {
132 if (end_block < *s) {
133 if (x[X(i2, end_block + 1)] == 4) {
138 for (s2 = start_block; s2 <= end_block; s2++) {
139 if (x[X(i2, s2)] != 4) {
154 void distDNA_TsTv(unsigned char *x, int *n, int *s, double *d, int Ts, int pairdel)
156 int i1, i2, s1, s2, target, Nd, Ns;
159 for (i1 = 1; i1 < *n; i1++) {
160 for (i2 = i1 + 1; i2 <= *n; i2++) {
162 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
163 if (pairdel && !(KnownBase(x[s1]) && KnownBase(x[s2]))) continue;
166 if (Ts) d[target] = ((double) Ns); /* output number of transitions */
167 else d[target] = ((double) Nd - Ns); /* output number of transversions */
173 void distDNA_raw(unsigned char *x, int *n, int *s, double *d, int scaled)
175 int i1, i2, s1, s2, target, Nd;
178 for (i1 = 1; i1 < *n; i1++) {
179 for (i2 = i1 + 1; i2 <= *n; i2++) {
181 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
182 if (DifferentBase(x[s1], x[s2])) Nd++;
183 if (scaled) d[target] = ((double) Nd / *s);
184 else d[target] = ((double) Nd);
190 void distDNA_raw_pairdel(unsigned char *x, int *n, int *s, double *d, int scaled)
192 int i1, i2, s1, s2, target, Nd, L;
195 for (i1 = 1; i1 < *n; i1++) {
196 for (i2 = i1 + 1; i2 <= *n; i2++) {
198 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
199 CHECK_PAIRWISE_DELETION
200 if (DifferentBase(x[s1], x[s2])) Nd++;
202 if (scaled) d[target] = ((double) Nd/L);
203 else d[target] = ((double) Nd);
209 #define COMPUTE_DIST_JC69\
210 p = ((double) Nd/L);\
212 d[target] = 0.75 * *alpha*(pow(1 - 4*p/3, -1/ *alpha) - 1);\
213 else d[target] = -0.75*log(1 - 4*p/3);\
215 if (*gamma) var[target] = p*(1 - p)/(pow(1 - 4*p/3, -2/(*alpha + 1)) * L);\
216 else var[target] = p*(1 - p)/(pow(1 - 4*p/3, 2)*L);\
219 void distDNA_JC69(unsigned char *x, int *n, int *s, double *d,
220 int *variance, double *var, int *gamma, double *alpha)
222 int i1, i2, s1, s2, target, Nd, L;
228 for (i1 = 1; i1 < *n; i1++) {
229 for (i2 = i1 + 1; i2 <= *n; i2++) {
231 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
232 if (DifferentBase(x[s1], x[s2])) Nd++;
239 void distDNA_JC69_pairdel(unsigned char *x, int *n, int *s, double *d,
240 int *variance, double *var, int *gamma, double *alpha)
242 int i1, i2, s1, s2, target, Nd, L;
246 for (i1 = 1; i1 < *n; i1++) {
247 for (i2 = i1 + 1; i2 <= *n; i2++) {
249 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
250 CHECK_PAIRWISE_DELETION
251 if (DifferentBase(x[s1], x[s2])) Nd++;
259 #define COMPUTE_DIST_K80\
260 P = ((double) Ns/L);\
261 Q = ((double) (Nd - Ns)/L);\
266 d[target] = *alpha * (pow(a1, b) + 0.5*pow(a2, b) - 1.5)/2;\
268 else d[target] = -0.5 * log(a1 * sqrt(a2));\
271 b = -(1 / *alpha + 1);\
280 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
283 void distDNA_K80(unsigned char *x, int *n, int *s, double *d,
284 int *variance, double *var, int *gamma, double *alpha)
286 int i1, i2, s1, s2, target, Nd, Ns, L;
287 double P, Q, a1, a2, b, c1, c2, c3;
292 for (i1 = 1; i1 < *n; i1++) {
293 for (i2 = i1 + 1; i2 <= *n; i2++) {
295 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
304 void distDNA_K80_pairdel(unsigned char *x, int *n, int *s, double *d,
305 int *variance, double *var, int *gamma, double *alpha)
307 int i1, i2, s1, s2, target, Nd, Ns, L;
308 double P, Q, a1, a2, b, c1, c2, c3;
311 for (i1 = 1; i1 < *n; i1++) {
312 for (i2 = i1 + 1; i2 <= *n; i2++) {
314 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
315 CHECK_PAIRWISE_DELETION
324 #define COMPUTE_DIST_F81\
325 p = ((double) Nd/L);\
326 if (*gamma) d[target] = E * *alpha * (pow(1 - p/E, -1/ *alpha) - 1);\
327 else d[target] = -E*log(1 - p/E);\
329 if (*gamma) var[target] = p*(1 - p)/(pow(1 - p/E, -2/(*alpha + 1)) * L);\
330 else var[target] = p*(1 - p)/(pow(1 - p/E, 2)*L);\
333 void distDNA_F81(unsigned char *x, int *n, int *s, double *d, double *BF,
334 int *variance, double *var, int *gamma, double *alpha)
336 int i1, i2, s1, s2, target, Nd, L;
340 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
343 for (i1 = 1; i1 < *n; i1++) {
344 for (i2 = i1 + 1; i2 <= *n; i2++) {
346 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
347 if (DifferentBase(x[s1], x[s2])) Nd++;
354 void distDNA_F81_pairdel(unsigned char *x, int *n, int *s, double *d, double *BF,
355 int *variance, double *var, int *gamma, double *alpha)
357 int i1, i2, s1, s2, target, Nd, L;
360 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
363 for (i1 = 1; i1 < *n; i1++) {
364 for (i2 = i1 + 1; i2 <= *n; i2++) {
366 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
367 CHECK_PAIRWISE_DELETION
368 if (DifferentBase(x[s1], x[s2])) Nd++;
376 #define COUNT_TS_TV1_TV2\
377 if (SameBase(x[s1], x[s2])) continue;\
379 if ((x[s1] | x[s2]) == 152 || (x[s1] | x[s2]) == 104) {\
383 if ((x[s1] | x[s2]) == 168 || (x[s1] | x[s2]) == 88) Nv2++;
386 #define COMPUTE_DIST_K81\
387 P = ((double) (Nd - Nv1 - Nv2)/L);\
388 Q = ((double) Nv1/L);\
389 R = ((double) Nv2/L);\
393 d[target] = -0.25*log(a1*a2*a3);\
395 a = (1/a1 + 1/a2)/2;\
396 b = (1/a1 + 1/a3)/2;\
397 c = (1/a2 + 1/a3)/2;\
398 var[target] = (a*a*P + b*b*Q + c*c*R - pow(a*P + b*Q + c*R, 2))/2;\
401 void distDNA_K81(unsigned char *x, int *n, int *s, double *d,
402 int *variance, double *var)
404 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
405 double P, Q, R, a1, a2, a3, a, b, c;
410 for (i1 = 1; i1 < *n; i1++) {
411 for (i2 = i1 + 1; i2 <= *n; i2++) {
413 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
422 void distDNA_K81_pairdel(unsigned char *x, int *n, int *s, double *d,
423 int *variance, double *var)
425 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
426 double P, Q, R, a1, a2, a3, a, b, c;
429 for (i1 = 1; i1 < *n; i1++) {
430 for (i2 = i1 + 1; i2 <= *n; i2++) {
431 Nd = Nv1 = Nv2 = L = 0;
432 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
433 CHECK_PAIRWISE_DELETION
442 #define PREPARE_BF_F84\
443 A = (BF[0]*BF[2])/(BF[0] + BF[2]) + (BF[1]*BF[3])/(BF[1] + BF[3]);\
444 B = BF[0]*BF[2] + BF[1]*BF[3];\
445 C = (BF[0] + BF[2])*(BF[1] + BF[3]);
447 #define COMPUTE_DIST_F84\
448 P = ((double) Ns/L);\
449 Q = ((double) (Nd - Ns)/L);\
450 d[target] = -2*A*log(1 - P/(2*A) - (A - B)*Q/(2*A*C)) + 2*(A - B - C)*log(1 - Q/(2*C));\
455 a = t1/(t1 - t2 - t3);\
456 b = A*(A - B)/(t1 - t2 - t3) - (A - B - C)/(C - Q/2);\
457 var[target] = (a*a*P + b*b*Q - pow(a*P + b*Q, 2))/L;\
460 void distDNA_F84(unsigned char *x, int *n, int *s, double *d,
461 double *BF, int *variance, double *var)
463 int i1, i2, Nd, Ns, L, target, s1, s2;
464 double P, Q, A, B, C, a, b, t1, t2, t3;
470 for (i1 = 1; i1 < *n; i1++) {
471 for (i2 = i1 + 1; i2 <= *n; i2++) {
473 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
482 void distDNA_F84_pairdel(unsigned char *x, int *n, int *s, double *d,
483 double *BF, int *variance, double *var)
485 int i1, i2, Nd, Ns, L, target, s1, s2;
486 double P, Q, A, B, C, a, b, t1, t2, t3;
491 for (i1 = 1; i1 < *n; i1++) {
492 for (i2 = i1 + 1; i2 <= *n; i2++) {
494 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
495 CHECK_PAIRWISE_DELETION
504 #define COMPUTE_DIST_T92\
505 P = ((double) Ns/L);\
506 Q = ((double) (Nd - Ns)/L);\
509 d[target] = -wg*log(a1) - 0.5*(1 - wg)*log(a2);\
513 c3 = wg*(c1 - c2) + c2;\
514 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
517 void distDNA_T92(unsigned char *x, int *n, int *s, double *d,
518 double *BF, int *variance, double *var)
520 int i1, i2, Nd, Ns, L, target, s1, s2;
521 double P, Q, wg, a1, a2, c1, c2, c3;
524 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
527 for (i1 = 1; i1 < *n; i1++) {
528 for (i2 = i1 + 1; i2 <= *n; i2++) {
530 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
539 void distDNA_T92_pairdel(unsigned char *x, int *n, int *s, double *d,
540 double *BF, int *variance, double *var)
542 int i1, i2, Nd, Ns, L, target, s1, s2;
543 double P, Q, wg, a1, a2, c1, c2, c3;
545 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
548 for (i1 = 1; i1 < *n; i1++) {
549 for (i2 = i1 + 1; i2 <= *n; i2++) {
551 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
552 CHECK_PAIRWISE_DELETION
561 /* returns 1 if one of the base is adenine and
562 the other one is guanine surely, 0 otherwise */
563 #define AdenineAndGuanine(a, b) (a | b) == 200
565 /* returns 1 if one of the base is cytosine and
566 the other one is thymine surely, 0 otherwise */
567 #define CytosineAndThymine(a, b) (a | b) == 56
569 #define PREPARE_BF_TN93\
572 k1 = 2 * BF[0] * BF[2] / gR;\
573 k2 = 2 * BF[1] * BF[3] / gY;\
574 k3 = 2 * (gR * gY - BF[0]*BF[2]*gY/gR - BF[1]*BF[3]*gR/gY);
576 #define COUNT_TS1_TS2_TV\
577 if (DifferentBase(x[s1], x[s2])) {\
579 if (AdenineAndGuanine(x[s1], x[s2])) {\
583 if (CytosineAndThymine(x[s1], x[s2])) Ns2++;\
586 #define COMPUTE_DIST_TN93\
587 P1 = ((double) Ns1/L);\
588 P2 = ((double) Ns2/L);\
589 Q = ((double) (Nd - Ns1 - Ns2)/L);\
590 w1 = 1 - P1/k1 - Q/(2*gR);\
591 w2 = 1 - P2/k2 - Q/(2*gY);\
592 w3 = 1 - Q/(2*gR*gY);\
594 k4 = 2*(BF[0]*BF[2] + BF[1]*BF[3] + gR*gY);\
599 c4 = k1*c1/(2*gR) + k2*c2/(2*gY) + k3*c3/(2*gR*gY);\
600 d[target] = *alpha * (k1*pow(w1, b) + k2*pow(w2, b) + k3*pow(w3, b) - k4);\
602 k4 = 2*((BF[0]*BF[0] + BF[2]*BF[2])/(2*gR*gR) + (BF[2]*BF[2] + BF[3]*BF[3])/(2*gY*gY));\
606 c4 = k1 * c1/(2 * gR) + k2 * c2/(2 * gY) + k4 * c3;\
607 d[target] = -k1*log(w1) - k2*log(w2) - k3*log(w3);\
610 var[target] = (c1*c1*P1 + c2*c2*P2 + c4*c4*Q - pow(c1*P1 + c2*P2 + c4*Q, 2))/L;
612 void distDNA_TN93(unsigned char *x, int *n, int *s, double *d,
613 double *BF, int *variance, double *var,
614 int *gamma, double *alpha)
616 int i1, i2, Nd, Ns1, Ns2, L, target, s1, s2;
617 double P1, P2, Q, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
624 for (i1 = 1; i1 < *n; i1++) {
625 for (i2 = i1 + 1; i2 <= *n; i2++) {
627 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
636 void distDNA_TN93_pairdel(unsigned char *x, int *n, int *s, double *d,
637 double *BF, int *variance, double *var,
638 int *gamma, double *alpha)
640 int i1, i2, Nd, Ns1, Ns2, L, target, s1, s2;
641 double P1, P2, Q, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
646 for (i1 = 1; i1 < *n; i1++) {
647 for (i2 = i1 + 1; i2 <= *n; i2++) {
648 Nd = Ns1 = Ns2 = L = 0;
649 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
650 CHECK_PAIRWISE_DELETION
659 void distDNA_GG95(unsigned char *x, int *n, int *s, double *d,
660 int *variance, double *var)
662 int i1, i2, s1, s2, target, GC, Nd, Ns, tl, npair;
663 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
670 npair = *n * (*n - 1) / 2;
672 theta = (double*)R_alloc(*n, sizeof(double));
673 P = (double*)R_alloc(npair, sizeof(double));
674 Q = (double*)R_alloc(npair, sizeof(double));
675 tstvr = (double*)R_alloc(npair, sizeof(double));
677 /* get the proportion of GC (= theta) in each sequence */
678 for (i1 = 1; i1 <= *n; i1++) {
680 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n)
681 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
682 theta[i1 - 1] = ((double) GC / *s);
685 /* get the proportions of transitions and transversions,
686 and the estimates of their ratio for each pair */
688 for (i1 = 1; i1 < *n; i1++) {
689 for (i2 = i1 + 1; i2 <= *n; i2++) {
691 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
694 P[target] = ((double) Ns / *s);
695 Q[target] = ((double) (Nd - Ns) / *s);
696 A = log(1 - 2*Q[target]);
697 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
702 /* compute the mean alpha (ma) = mean Ts/Tv */
705 for (i1 = 0; i1 < npair; i1++)
706 /* some values of tstvr are -Inf if there is no
707 transversions observed: we exclude them */
708 if (R_FINITE(tstvr[i1])) {
714 /* compute the distance for each pair */
716 for (i1 = 1; i1 < *n; i1++) {
717 for (i2 = i1 + 1; i2 <= *n; i2++) {
719 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
720 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
721 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
723 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A * *s);
729 void distDNA_GG95_pairdel(unsigned char *x, int *n, int *s, double *d,
730 int *variance, double *var)
732 int i1, i2, s1, s2, target, *L, length, GC, Nd, Ns, tl, npair;
733 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
741 npair = *n * (*n - 1) / 2;
743 theta = (double*)R_alloc(*n, sizeof(double));
744 L = (int*)R_alloc(npair, sizeof(int));
745 P = (double*)R_alloc(npair, sizeof(double));
746 Q = (double*)R_alloc(npair, sizeof(double));
747 tstvr = (double*)R_alloc(npair, sizeof(double));
749 /* get the proportion of GC (= theta) in each sequence */
750 for (i1 = 1; i1 <= *n; i1++) {
752 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n) {
753 if (KnownBase(x[s1])) tl++;
755 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
757 theta[i1 - 1] = ((double) GC / tl);
760 /* get the proportions of transitions and transversions,
761 and the estimates of their ratio for each pair; we
762 also get the sample size for each pair in L */
764 for (i1 = 1; i1 < *n; i1++) {
765 for (i2 = i1 + 1; i2 <= *n; i2++) {
766 Nd = Ns = L[target] = 0;
767 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
768 if (KnownBase(x[s1]) && KnownBase(x[s2])) L[target]++;
772 P[target] = ((double) Ns/L[target]);
773 Q[target] = ((double) (Nd - Ns)/L[target]);
774 A = log(1 - 2*Q[target]);
775 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
780 /* compute the mean alpha (ma) = mean Ts/Tv */
783 for (i1 = 0; i1 < npair; i1++)
784 /* some values of tstvr are -Inf if there is no
785 transversions observed: we exclude them */
786 if (R_FINITE(tstvr[i1])) {
792 /* compute the distance for each pair */
794 for (i1 = 1; i1 < *n; i1++) {
795 for (i2 = i1 + 1; i2 <= *n; i2++) {
797 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
798 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
799 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
801 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A*L[target]);
807 #define DO_CONTINGENCY_NUCLEOTIDES\
809 case 136 : m = 0; break;\
810 case 72 : m = 1; break;\
811 case 40 : m = 2; break;\
812 case 24 : m = 3; break;\
815 case 72 : m += 4; break;\
816 case 40 : m += 8; break;\
817 case 24 : m += 12; break;\
821 #define COMPUTE_DIST_LogDet\
822 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
823 d[target] = -log(detFourByFour(Ftab))/4 - LN4;\
825 /* For the inversion, we first make U an identity matrix */\
826 for (k = 1; k < 15; k++) U[k] = 0;\
827 U[0] = U[5] = U[10] = U[15] = 1;\
828 /* The matrix is not symmetric, so we use 'dgesv'. */\
829 /* This subroutine puts the result in U. */\
830 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
831 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
832 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
833 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
834 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
835 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
836 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
837 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
838 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] - 16)/(L*16);\
841 void distDNA_LogDet(unsigned char *x, int *n, int *s, double *d,
842 int *variance, double *var)
844 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
845 double Ftab[16], U[16];
850 for (i1 = 1; i1 < *n; i1++) {
851 for (i2 = i1 + 1; i2 <= *n; i2++) {
852 for (k = 0; k < 16; k++) Ntab[k] = 0;
853 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
854 DO_CONTINGENCY_NUCLEOTIDES
862 void distDNA_LogDet_pairdel(unsigned char *x, int *n, int *s, double *d,
863 int *variance, double *var)
865 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
866 double Ftab[16], U[16];
869 for (i1 = 1; i1 < *n; i1++) {
870 for (i2 = i1 + 1; i2 <= *n; i2++) {
871 for (k = 0; k < 16; k++) Ntab[k] = 0;
873 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
874 CHECK_PAIRWISE_DELETION
875 DO_CONTINGENCY_NUCLEOTIDES
883 void distDNA_BH87(unsigned char *x, int *n, int *s, double *d,
884 int *variance, double *var)
886 For the moment there is no need to check for pairwise deletions
887 since DO_CONTINGENCY_NUCLEOTIDES considers only the known nucleotides.
888 In effect the pairwise deletion has possibly been done before.
889 The sequence length(s) are used only to compute the variances, which is
890 currently not available.
893 int i1, i2, k, kb, s1, s2, m, Ntab[16], ROWsums[4];
894 double P12[16], P21[16];
896 for (i1 = 1; i1 < *n; i1++) {
897 for (i2 = i1 + 1; i2 <= *n; i2++) {
898 for (k = 0; k < 16; k++) Ntab[k] = 0;
899 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
900 DO_CONTINGENCY_NUCLEOTIDES
903 /* get the rowwise sums of Ntab */
904 ROWsums[0] = Ntab[0] + Ntab[4] + Ntab[8] + Ntab[12];
905 ROWsums[1] = Ntab[1] + Ntab[5] + Ntab[9] + Ntab[13];
906 ROWsums[2] = Ntab[2] + Ntab[6] + Ntab[10] + Ntab[14];
907 ROWsums[3] = Ntab[3] + Ntab[7] + Ntab[11] + Ntab[15];
909 for (k = 0; k < 16; k++)
910 P12[k] = ((double) Ntab[k]);
912 /* scale each element of P12 by its rowwise sum */
913 for (k = 0; k < 4; k++)
914 for (kb = 0; kb < 16; kb += 4)
915 P12[k + kb] = P12[k + kb]/ROWsums[k];
917 d[*n*(i2 - 1) + i1 - 1] = -log(detFourByFour(P12))/4;
919 /* compute the columnwise sums of Ntab: these
920 are the rowwise sums of its transpose */
921 ROWsums[0] = Ntab[0] + Ntab[1] + Ntab[2] + Ntab[3];
922 ROWsums[1] = Ntab[4] + Ntab[5] + Ntab[6] + Ntab[7];
923 ROWsums[2] = Ntab[8] + Ntab[9] + Ntab[10] + Ntab[11];
924 ROWsums[3] = Ntab[12] + Ntab[13] + Ntab[14] + Ntab[15];
926 /* transpose Ntab and store the result in P21 */
927 for (k = 0; k < 4; k++)
928 for (kb = 0; kb < 4; kb++)
929 P21[kb + 4*k] = Ntab[k + 4*kb];
932 for (k = 0; k < 4; k++)
933 for (kb = 0; kb < 16; kb += 4)
934 P21[k + kb] = P21[k + kb]/ROWsums[k];
936 d[*n*(i1 - 1) + i2 - 1] = -log(detFourByFour(P21))/4;
941 #define COMPUTE_DIST_ParaLin\
942 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
943 d[target] = -log(detFourByFour(Ftab)/\
944 sqrt(find[0][i1 - 1]*find[1][i1 - 1]*find[2][i1 - 1]*find[3][i1 - 1]*\
945 find[0][i2 - 1]*find[1][i2 - 1]*find[2][i2 - 1]*find[3][i2 - 1]))/4;\
947 /* For the inversion, we first make U an identity matrix */\
948 for (k = 1; k < 15; k++) U[k] = 0;\
949 U[0] = U[5] = U[10] = U[15] = 1;\
950 /* The matrix is not symmetric, so we use 'dgesv'. */\
951 /* This subroutine puts the result in U. */\
952 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
953 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
954 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
955 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
956 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
957 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
958 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
959 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
960 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] -\
961 4*(1/sqrt(find[0][i1 - 1]*find[0][i2 - 1]) +\
962 1/sqrt(find[1][i1 - 1]*find[1][i2 - 1]) +\
963 1/sqrt(find[2][i1 - 1]*find[2][i2 - 1]) +\
964 1/sqrt(find[3][i1 - 1]*find[3][i2 - 1])))/(L*16);\
967 void distDNA_ParaLin(unsigned char *x, int *n, int *s, double *d,
968 int *variance, double *var)
970 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
971 double Ftab[16], U[16], *find[4];
975 for (k = 0; k < 4; k++)
976 find[k] = (double*)R_alloc(*n, sizeof(double));
978 for (i1 = 0; i1 < *n; i1++)
979 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
981 for (i1 = 0; i1 < *n; i1++) {
982 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
984 case 136 : find[0][i1]++; break;
985 case 40 : find[1][i1]++; break;
986 case 72 : find[2][i1]++; break;
987 case 24 : find[3][i1]++; break;
990 for (k = 0; k < 4; k++) find[k][i1] /= L;
994 for (i1 = 1; i1 < *n; i1++) {
995 for (i2 = i1 + 1; i2 <= *n; i2++) {
996 for (k = 0; k < 16; k++) Ntab[k] = 0;
997 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
998 DO_CONTINGENCY_NUCLEOTIDES
1000 COMPUTE_DIST_ParaLin
1006 void distDNA_ParaLin_pairdel(unsigned char *x, int *n, int *s, double *d,
1007 int *variance, double *var)
1009 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
1010 double Ftab[16], U[16], *find[4];
1014 for (k = 0; k < 4; k++)
1015 find[k] = (double*)R_alloc(*n, sizeof(double));
1017 for (i1 = 0; i1 < *n; i1++)
1018 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
1020 for (i1 = 0; i1 < *n; i1++) {
1022 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
1023 if (KnownBase(x[s1])) {
1026 case 136 : find[0][i1]++; break;
1027 case 40 : find[1][i1]++; break;
1028 case 72 : find[2][i1]++; break;
1029 case 24 : find[3][i1]++; break;
1033 for (k = 0; k < 4; k++) find[k][i1] /= L;
1037 for (i1 = 1; i1 < *n; i1++) {
1038 for (i2 = i1 + 1; i2 <= *n; i2++) {
1040 for (k = 0; k < 16; k++) Ntab[k] = 0;
1041 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
1042 CHECK_PAIRWISE_DELETION
1043 DO_CONTINGENCY_NUCLEOTIDES
1045 COMPUTE_DIST_ParaLin
1051 /* void BaseProportion(unsigned char *x, int *n, double *BF, int *freq) */
1056 /* for (i = 0; i < *n; i++) { */
1057 /* if (KnownBase(x[i])) { */
1059 /* switch (x[i]) { */
1060 /* case 136 : BF[0]++; break; */
1061 /* case 40 : BF[1]++; break; */
1062 /* case 72 : BF[2]++; break; */
1063 /* case 24 : BF[3]++; break; */
1067 /* if (! *freq) for (i = 0; i < 4; i++) BF[i] /= m; */
1070 /* void BaseProportion(unsigned char *x, int *n, double *BF) */
1074 /* for (i = 0; i < *n; i++) { */
1075 /* switch (x[i]) { */
1076 /* case 136 : BF[0]++; break; */
1077 /* case 40 : BF[1]++; break; */
1078 /* case 72 : BF[2]++; break; */
1079 /* case 24 : BF[3]++; break; */
1080 /* case 192 : BF[4]++; break; */
1081 /* case 160 : BF[5]++; break; */
1082 /* case 144 : BF[6]++; break; */
1083 /* case 96 : BF[7]++; break; */
1084 /* case 80 : BF[8]++; break; */
1085 /* case 48 : BF[9]++; break; */
1086 /* case 224 : BF[10]++; break; */
1087 /* case 176 : BF[11]++; break; */
1088 /* case 208 : BF[12]++; break; */
1089 /* case 112 : BF[13]++; break; */
1090 /* case 240 : BF[14]++; break; */
1091 /* case 4 : BF[15]++; break; */
1092 /* case 2 : BF[16]++; break; */
1097 /* a hash table is much faster than switch (2012-01-10) */
1098 void BaseProportion(unsigned char *x, int *n, double *BF)
1103 memset(count, 0, 256*sizeof(double));
1105 for (i = 0; i < *n; i++) count[x[i]]++;
1117 BF[10] = count[224];
1118 BF[11] = count[176];
1119 BF[12] = count[208];
1120 BF[13] = count[112];
1121 BF[14] = count[240];
1126 void SegSites(unsigned char *x, int *n, int *s, int *seg)
1129 unsigned char basis;
1131 for (j = 0; j < *s; j++) {
1133 while (!KnownBase(x[i])) i++;
1136 while (i < *n * (j + 1)) {
1137 if (!KnownBase(x[i]) || x[i] == basis) i++;
1146 void GlobalDeletionDNA(unsigned char *x, int *n, int *s, int *keep)
1150 for (j = 0; j < *s; j++) {
1152 while (i < *n * (j + 1)) {
1153 if (KnownBase(x[i])) i++;
1162 void dist_dna(unsigned char *x, int *n, int *s, int *model, double *d,
1163 double *BF, int *pairdel, int *variance, double *var,
1164 int *gamma, double *alpha)
1167 case 1 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 1);
1168 else distDNA_raw(x, n, s, d, 1); break;
1169 case 2 : if (pairdel) distDNA_JC69_pairdel(x, n, s, d, variance, var, gamma, alpha);
1170 else distDNA_JC69(x, n, s, d, variance, var, gamma, alpha); break;
1171 case 3 : if (pairdel) distDNA_K80_pairdel(x, n, s, d, variance, var, gamma, alpha);
1172 else distDNA_K80(x, n, s, d, variance, var, gamma, alpha); break;
1173 case 4 : if (pairdel) distDNA_F81_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1174 else distDNA_F81(x, n, s, d, BF, variance, var, gamma, alpha); break;
1175 case 5 : if (pairdel) distDNA_K81_pairdel(x, n, s, d, variance, var);
1176 else distDNA_K81(x, n, s, d, variance, var); break;
1177 case 6 : if (pairdel) distDNA_F84_pairdel(x, n, s, d, BF, variance, var);
1178 else distDNA_F84(x, n, s, d, BF, variance, var); break;
1179 case 7 : if (pairdel) distDNA_T92_pairdel(x, n, s, d, BF, variance, var);
1180 else distDNA_T92(x, n, s, d, BF, variance, var); break;
1181 case 8 : if (pairdel) distDNA_TN93_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1182 else distDNA_TN93(x, n, s, d, BF, variance, var, gamma, alpha); break;
1183 case 9 : if (pairdel) distDNA_GG95_pairdel(x, n, s, d, variance, var);
1184 else distDNA_GG95(x, n, s, d, variance, var); break;
1185 case 10 : if (pairdel) distDNA_LogDet_pairdel(x, n, s, d, variance, var);
1186 else distDNA_LogDet(x, n, s, d, variance, var); break;
1187 case 11 : distDNA_BH87(x, n, s, d, variance, var); break;
1188 case 12 : if (pairdel) distDNA_ParaLin_pairdel(x, n, s, d, variance, var);
1189 else distDNA_ParaLin(x, n, s, d, variance, var); break;
1190 case 13 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 0);
1191 else distDNA_raw(x, n, s, d, 0); break;
1192 case 14 : if (pairdel) distDNA_TsTv(x, n, s, d, 1, 1);
1193 else distDNA_TsTv(x, n, s, d, 1, 0); break;
1194 case 15 : if (pairdel) distDNA_TsTv(x, n, s, d, 0, 1);
1195 else distDNA_TsTv(x, n, s, d, 0, 0); break;
1196 case 16 : distDNA_indel(x, n, s, d); break;
1197 case 17 : distDNA_indelblock(x, n, s, d); break;