1 /* dist_dna.c 2012-02-14 */
3 /* Copyright 2005-2012 Emmanuel Paradis
5 /* This file is part of the R-package `ape'. */
6 /* See the file ../COPYING for licensing issues. */
8 #include <R_ext/Lapack.h>
11 /* from R: print(log(4), d = 22) */
12 #define LN4 1.386294361119890572454
14 /* returns 8 if the base is known surely, 0 otherwise */
15 #define KnownBase(a) (a & 8)
17 /* returns 1 if the base is adenine surely, 0 otherwise */
18 #define IsAdenine(a) a == 136
20 /* returns 1 if the base is guanine surely, 0 otherwise */
21 #define IsGuanine(a) a == 72
23 /* returns 1 if the base is cytosine surely, 0 otherwise */
24 #define IsCytosine(a) a == 40
26 /* returns 1 if the base is thymine surely, 0 otherwise */
27 #define IsThymine(a) a == 24
29 /* returns 1 if the base is a purine surely, 0 otherwise */
30 #define IsPurine(a) a > 63
32 /* returns 1 if the base is a pyrimidine surely, 0 otherwise */
33 #define IsPyrimidine(a) a < 64
35 /* returns 1 if both bases are different surely, 0 otherwise */
36 #define DifferentBase(a, b) (a & b) < 16
38 /* returns 1 if both bases are the same surely, 0 otherwise */
39 #define SameBase(a, b) KnownBase(a) && a == b
41 /* computes directly the determinant of a 4x4 matrix */
42 double detFourByFour(double *x)
44 double det, a33a44, a34a43, a34a42, a32a44, a32a43, a33a42, a34a41, a31a44, a31a43, a33a41, a31a42, a32a41;
46 a33a44 = x[10]*x[15]; a34a43 = x[14]*x[11];
47 a34a42 = x[14]*x[7]; a32a44 = x[6]*x[15];
48 a32a43 = x[6]*x[11]; a33a42 = x[10]*x[7];
49 a34a41 = x[14]*x[3]; a31a44 = x[2]*x[15];
50 a31a43 = x[2]*x[11]; a33a41 = x[10]*x[3];
51 a31a42 = x[2]*x[7]; a32a41 = x[6]*x[3];
53 det = x[0]*x[5]*(a33a44 - a34a43) + x[0]*x[9]*(a34a42 - a32a44) +
54 x[0]*x[13]*(a32a43 - a33a42) + x[4]*x[9]*(a31a44 - a34a41) +
55 x[4]*x[13]*(a33a41 - a31a43) + x[4]*x[1]*(a34a43 - a33a44) +
56 x[8]*x[13]*(a31a42 - a32a41) + x[8]*x[1]*(a32a44 - a34a42) +
57 x[8]*x[5]*(a34a41 - a31a44) + x[12]*x[1]*(a33a42 - a32a43) +
58 x[12]*x[5]*(a31a43 - a33a41) + x[12]*x[9]*(a32a41 - a31a42);
63 #define CHECK_PAIRWISE_DELETION\
64 if (KnownBase(x[s1]) && KnownBase(x[s2])) L++;\
68 if (SameBase(x[s1], x[s2])) continue;\
70 if (IsPurine(x[s1]) && IsPurine(x[s2])) {\
74 if (IsPyrimidine(x[s1]) && IsPyrimidine(x[s2])) Ns++;
76 void distDNA_indel(unsigned char *x, int *n, int *s, double *d)
78 int i1, i2, s1, s2, target, N;
81 for (i1 = 1; i1 < *n; i1++) {
82 for (i2 = i1 + 1; i2 <= *n; i2++) {
85 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
86 if ((x[s1] ^ x[s2]) & 4) N++;
88 d[target] = ((double) N);
94 #define X(i, j) i - 1 + *n * (j - 1)
96 void distDNA_indelblock(unsigned char *x, int *n, int *s, double *d)
98 int i1, i2, s1, s2, target, N, start_block, end_block;
100 for (i1 = 1; i1 <= *n; i1++) {
102 /* find a block of one or more '-' */
106 if (x[X(i1, s1)] == 4) {
108 while (x[X(i1, s1)] == 4) s1++;
111 /* then look if the same block is present in all the other sequences */
113 for (i2 = 1; i2 <= *n; i2++) {
114 if (i2 == i1) continue;
116 target = give_index(i1, i2, *n);
118 if (start_block > 1) {
119 if (x[X(i2, start_block - 1)] == 4) {
124 if (end_block < *s) {
125 if (x[X(i2, end_block + 1)] == 4) {
130 for (s2 = start_block; s2 <= end_block; s2++) {
131 if (x[X(i2, s2)] != 4) {
146 void distDNA_TsTv(unsigned char *x, int *n, int *s, double *d, int Ts, int pairdel)
148 int i1, i2, s1, s2, target, Nd, Ns;
151 for (i1 = 1; i1 < *n; i1++) {
152 for (i2 = i1 + 1; i2 <= *n; i2++) {
154 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
155 if (pairdel && !(KnownBase(x[s1]) && KnownBase(x[s2]))) continue;
158 if (Ts) d[target] = ((double) Ns); /* output number of transitions */
159 else d[target] = ((double) Nd - Ns); /* output number of transversions */
165 void distDNA_raw(unsigned char *x, int *n, int *s, double *d, int scaled)
167 int i1, i2, s1, s2, target, Nd;
170 for (i1 = 1; i1 < *n; i1++) {
171 for (i2 = i1 + 1; i2 <= *n; i2++) {
173 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
174 if (DifferentBase(x[s1], x[s2])) Nd++;
175 if (scaled) d[target] = ((double) Nd / *s);
176 else d[target] = ((double) Nd);
182 void distDNA_raw_pairdel(unsigned char *x, int *n, int *s, double *d, int scaled)
184 int i1, i2, s1, s2, target, Nd, L;
187 for (i1 = 1; i1 < *n; i1++) {
188 for (i2 = i1 + 1; i2 <= *n; i2++) {
190 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
191 CHECK_PAIRWISE_DELETION
192 if (DifferentBase(x[s1], x[s2])) Nd++;
194 if (scaled) d[target] = ((double) Nd/L);
195 else d[target] = ((double) Nd);
201 #define COMPUTE_DIST_JC69\
202 p = ((double) Nd/L);\
204 d[target] = 0.75 * *alpha*(pow(1 - 4*p/3, -1/ *alpha) - 1);\
205 else d[target] = -0.75*log(1 - 4*p/3);\
207 if (*gamma) var[target] = p*(1 - p)/(pow(1 - 4*p/3, -2/(*alpha + 1)) * L);\
208 else var[target] = p*(1 - p)/(pow(1 - 4*p/3, 2)*L);\
211 void distDNA_JC69(unsigned char *x, int *n, int *s, double *d,
212 int *variance, double *var, int *gamma, double *alpha)
214 int i1, i2, s1, s2, target, Nd, L;
220 for (i1 = 1; i1 < *n; i1++) {
221 for (i2 = i1 + 1; i2 <= *n; i2++) {
223 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
224 if (DifferentBase(x[s1], x[s2])) Nd++;
231 void distDNA_JC69_pairdel(unsigned char *x, int *n, int *s, double *d,
232 int *variance, double *var, int *gamma, double *alpha)
234 int i1, i2, s1, s2, target, Nd, L;
238 for (i1 = 1; i1 < *n; i1++) {
239 for (i2 = i1 + 1; i2 <= *n; i2++) {
241 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
242 CHECK_PAIRWISE_DELETION
243 if (DifferentBase(x[s1], x[s2])) Nd++;
251 #define COMPUTE_DIST_K80\
252 P = ((double) Ns/L);\
253 Q = ((double) (Nd - Ns)/L);\
258 d[target] = *alpha * (pow(a1, b) + 0.5*pow(a2, b) - 1.5)/2;\
260 else d[target] = -0.5 * log(a1 * sqrt(a2));\
263 b = -(1 / *alpha + 1);\
272 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
275 void distDNA_K80(unsigned char *x, int *n, int *s, double *d,
276 int *variance, double *var, int *gamma, double *alpha)
278 int i1, i2, s1, s2, target, Nd, Ns, L;
279 double P, Q, a1, a2, b, c1, c2, c3;
284 for (i1 = 1; i1 < *n; i1++) {
285 for (i2 = i1 + 1; i2 <= *n; i2++) {
287 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
296 void distDNA_K80_pairdel(unsigned char *x, int *n, int *s, double *d,
297 int *variance, double *var, int *gamma, double *alpha)
299 int i1, i2, s1, s2, target, Nd, Ns, L;
300 double P, Q, a1, a2, b, c1, c2, c3;
303 for (i1 = 1; i1 < *n; i1++) {
304 for (i2 = i1 + 1; i2 <= *n; i2++) {
306 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
307 CHECK_PAIRWISE_DELETION
316 #define COMPUTE_DIST_F81\
317 p = ((double) Nd/L);\
318 if (*gamma) d[target] = E * *alpha * (pow(1 - p/E, -1/ *alpha) - 1);\
319 else d[target] = -E*log(1 - p/E);\
321 if (*gamma) var[target] = p*(1 - p)/(pow(1 - p/E, -2/(*alpha + 1)) * L);\
322 else var[target] = p*(1 - p)/(pow(1 - p/E, 2)*L);\
325 void distDNA_F81(unsigned char *x, int *n, int *s, double *d, double *BF,
326 int *variance, double *var, int *gamma, double *alpha)
328 int i1, i2, s1, s2, target, Nd, L;
332 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
335 for (i1 = 1; i1 < *n; i1++) {
336 for (i2 = i1 + 1; i2 <= *n; i2++) {
338 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
339 if (DifferentBase(x[s1], x[s2])) Nd++;
346 void distDNA_F81_pairdel(unsigned char *x, int *n, int *s, double *d, double *BF,
347 int *variance, double *var, int *gamma, double *alpha)
349 int i1, i2, s1, s2, target, Nd, L;
352 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
355 for (i1 = 1; i1 < *n; i1++) {
356 for (i2 = i1 + 1; i2 <= *n; i2++) {
358 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
359 CHECK_PAIRWISE_DELETION
360 if (DifferentBase(x[s1], x[s2])) Nd++;
368 #define COUNT_TS_TV1_TV2\
369 if (SameBase(x[s1], x[s2])) continue;\
371 if ((x[s1] | x[s2]) == 152 || (x[s1] | x[s2]) == 104) {\
375 if ((x[s1] | x[s2]) == 168 || (x[s1] | x[s2]) == 88) Nv2++;
378 #define COMPUTE_DIST_K81\
379 P = ((double) (Nd - Nv1 - Nv2)/L);\
380 Q = ((double) Nv1/L);\
381 R = ((double) Nv2/L);\
385 d[target] = -0.25*log(a1*a2*a3);\
387 a = (1/a1 + 1/a2)/2;\
388 b = (1/a1 + 1/a3)/2;\
389 c = (1/a2 + 1/a3)/2;\
390 var[target] = (a*a*P + b*b*Q + c*c*R - pow(a*P + b*Q + c*R, 2))/2;\
393 void distDNA_K81(unsigned char *x, int *n, int *s, double *d,
394 int *variance, double *var)
396 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
397 double P, Q, R, a1, a2, a3, a, b, c;
402 for (i1 = 1; i1 < *n; i1++) {
403 for (i2 = i1 + 1; i2 <= *n; i2++) {
405 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
414 void distDNA_K81_pairdel(unsigned char *x, int *n, int *s, double *d,
415 int *variance, double *var)
417 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
418 double P, Q, R, a1, a2, a3, a, b, c;
421 for (i1 = 1; i1 < *n; i1++) {
422 for (i2 = i1 + 1; i2 <= *n; i2++) {
423 Nd = Nv1 = Nv2 = L = 0;
424 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
425 CHECK_PAIRWISE_DELETION
434 #define PREPARE_BF_F84\
435 A = (BF[0]*BF[2])/(BF[0] + BF[2]) + (BF[1]*BF[3])/(BF[1] + BF[3]);\
436 B = BF[0]*BF[2] + BF[1]*BF[3];\
437 C = (BF[0] + BF[2])*(BF[1] + BF[3]);
439 #define COMPUTE_DIST_F84\
440 P = ((double) Ns/L);\
441 Q = ((double) (Nd - Ns)/L);\
442 d[target] = -2*A*log(1 - P/(2*A) - (A - B)*Q/(2*A*C)) + 2*(A - B - C)*log(1 - Q/(2*C));\
447 a = t1/(t1 - t2 - t3);\
448 b = A*(A - B)/(t1 - t2 - t3) - (A - B - C)/(C - Q/2);\
449 var[target] = (a*a*P + b*b*Q - pow(a*P + b*Q, 2))/L;\
452 void distDNA_F84(unsigned char *x, int *n, int *s, double *d,
453 double *BF, int *variance, double *var)
455 int i1, i2, Nd, Ns, L, target, s1, s2;
456 double P, Q, A, B, C, a, b, t1, t2, t3;
462 for (i1 = 1; i1 < *n; i1++) {
463 for (i2 = i1 + 1; i2 <= *n; i2++) {
465 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
474 void distDNA_F84_pairdel(unsigned char *x, int *n, int *s, double *d,
475 double *BF, int *variance, double *var)
477 int i1, i2, Nd, Ns, L, target, s1, s2;
478 double P, Q, A, B, C, a, b, t1, t2, t3;
483 for (i1 = 1; i1 < *n; i1++) {
484 for (i2 = i1 + 1; i2 <= *n; i2++) {
486 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
487 CHECK_PAIRWISE_DELETION
496 #define COMPUTE_DIST_T92\
497 P = ((double) Ns/L);\
498 Q = ((double) (Nd - Ns)/L);\
501 d[target] = -wg*log(a1) - 0.5*(1 - wg)*log(a2);\
505 c3 = wg*(c1 - c2) + c2;\
506 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
509 void distDNA_T92(unsigned char *x, int *n, int *s, double *d,
510 double *BF, int *variance, double *var)
512 int i1, i2, Nd, Ns, L, target, s1, s2;
513 double P, Q, wg, a1, a2, c1, c2, c3;
516 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
519 for (i1 = 1; i1 < *n; i1++) {
520 for (i2 = i1 + 1; i2 <= *n; i2++) {
522 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
531 void distDNA_T92_pairdel(unsigned char *x, int *n, int *s, double *d,
532 double *BF, int *variance, double *var)
534 int i1, i2, Nd, Ns, L, target, s1, s2;
535 double P, Q, wg, a1, a2, c1, c2, c3;
537 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
540 for (i1 = 1; i1 < *n; i1++) {
541 for (i2 = i1 + 1; i2 <= *n; i2++) {
543 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
544 CHECK_PAIRWISE_DELETION
553 /* returns 1 if one of the base is adenine and
554 the other one is guanine surely, 0 otherwise */
555 #define AdenineAndGuanine(a, b) (a | b) == 200
557 /* returns 1 if one of the base is cytosine and
558 the other one is thymine surely, 0 otherwise */
559 #define CytosineAndThymine(a, b) (a | b) == 56
561 #define PREPARE_BF_TN93\
564 k1 = 2 * BF[0] * BF[2] / gR;\
565 k2 = 2 * BF[1] * BF[3] / gY;\
566 k3 = 2 * (gR * gY - BF[0]*BF[2]*gY/gR - BF[1]*BF[3]*gR/gY);
568 #define COUNT_TS1_TS2_TV\
569 if (DifferentBase(x[s1], x[s2])) {\
571 if (AdenineAndGuanine(x[s1], x[s2])) {\
575 if (CytosineAndThymine(x[s1], x[s2])) Ns2++;\
578 #define COMPUTE_DIST_TN93\
579 P1 = ((double) Ns1/L);\
580 P2 = ((double) Ns2/L);\
581 Q = ((double) (Nd - Ns1 - Ns2)/L);\
582 w1 = 1 - P1/k1 - Q/(2*gR);\
583 w2 = 1 - P2/k2 - Q/(2*gY);\
584 w3 = 1 - Q/(2*gR*gY);\
586 k4 = 2*(BF[0]*BF[2] + BF[1]*BF[3] + gR*gY);\
591 c4 = k1*c1/(2*gR) + k2*c2/(2*gY) + k3*c3/(2*gR*gY);\
592 d[target] = *alpha * (k1*pow(w1, b) + k2*pow(w2, b) + k3*pow(w3, b) - k4);\
594 k4 = 2*((BF[0]*BF[0] + BF[2]*BF[2])/(2*gR*gR) + (BF[2]*BF[2] + BF[3]*BF[3])/(2*gY*gY));\
598 c4 = k1 * c1/(2 * gR) + k2 * c2/(2 * gY) + k4 * c3;\
599 d[target] = -k1*log(w1) - k2*log(w2) - k3*log(w3);\
602 var[target] = (c1*c1*P1 + c2*c2*P2 + c4*c4*Q - pow(c1*P1 + c2*P2 + c4*Q, 2))/L;
604 void distDNA_TN93(unsigned char *x, int *n, int *s, double *d,
605 double *BF, int *variance, double *var,
606 int *gamma, double *alpha)
608 int i1, i2, Nd, Ns1, Ns2, L, target, s1, s2;
609 double P1, P2, Q, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
616 for (i1 = 1; i1 < *n; i1++) {
617 for (i2 = i1 + 1; i2 <= *n; i2++) {
619 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
628 void distDNA_TN93_pairdel(unsigned char *x, int *n, int *s, double *d,
629 double *BF, int *variance, double *var,
630 int *gamma, double *alpha)
632 int i1, i2, Nd, Ns1, Ns2, L, target, s1, s2;
633 double P1, P2, Q, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
638 for (i1 = 1; i1 < *n; i1++) {
639 for (i2 = i1 + 1; i2 <= *n; i2++) {
640 Nd = Ns1 = Ns2 = L = 0;
641 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
642 CHECK_PAIRWISE_DELETION
651 void distDNA_GG95(unsigned char *x, int *n, int *s, double *d,
652 int *variance, double *var)
654 int i1, i2, s1, s2, target, GC, Nd, Ns, tl, npair;
655 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
662 npair = *n * (*n - 1) / 2;
664 theta = (double*)R_alloc(*n, sizeof(double));
665 P = (double*)R_alloc(npair, sizeof(double));
666 Q = (double*)R_alloc(npair, sizeof(double));
667 tstvr = (double*)R_alloc(npair, sizeof(double));
669 /* get the proportion of GC (= theta) in each sequence */
670 for (i1 = 1; i1 <= *n; i1++) {
672 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n)
673 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
674 theta[i1 - 1] = ((double) GC / *s);
677 /* get the proportions of transitions and transversions,
678 and the estimates of their ratio for each pair */
680 for (i1 = 1; i1 < *n; i1++) {
681 for (i2 = i1 + 1; i2 <= *n; i2++) {
683 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
686 P[target] = ((double) Ns / *s);
687 Q[target] = ((double) (Nd - Ns) / *s);
688 A = log(1 - 2*Q[target]);
689 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
694 /* compute the mean alpha (ma) = mean Ts/Tv */
697 for (i1 = 0; i1 < npair; i1++)
698 /* some values of tstvr are -Inf if there is no
699 transversions observed: we exclude them */
700 if (R_FINITE(tstvr[i1])) {
706 /* compute the distance for each pair */
708 for (i1 = 1; i1 < *n; i1++) {
709 for (i2 = i1 + 1; i2 <= *n; i2++) {
711 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
712 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
713 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
715 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A * *s);
721 void distDNA_GG95_pairdel(unsigned char *x, int *n, int *s, double *d,
722 int *variance, double *var)
724 int i1, i2, s1, s2, target, *L, length, GC, Nd, Ns, tl, npair;
725 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
733 npair = *n * (*n - 1) / 2;
735 theta = (double*)R_alloc(*n, sizeof(double));
736 L = (int*)R_alloc(npair, sizeof(int));
737 P = (double*)R_alloc(npair, sizeof(double));
738 Q = (double*)R_alloc(npair, sizeof(double));
739 tstvr = (double*)R_alloc(npair, sizeof(double));
741 /* get the proportion of GC (= theta) in each sequence */
742 for (i1 = 1; i1 <= *n; i1++) {
744 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n) {
745 if (KnownBase(x[s1])) tl++;
747 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
749 theta[i1 - 1] = ((double) GC / tl);
752 /* get the proportions of transitions and transversions,
753 and the estimates of their ratio for each pair; we
754 also get the sample size for each pair in L */
756 for (i1 = 1; i1 < *n; i1++) {
757 for (i2 = i1 + 1; i2 <= *n; i2++) {
758 Nd = Ns = L[target] = 0;
759 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
760 if (KnownBase(x[s1]) && KnownBase(x[s2])) L[target]++;
764 P[target] = ((double) Ns/L[target]);
765 Q[target] = ((double) (Nd - Ns)/L[target]);
766 A = log(1 - 2*Q[target]);
767 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
772 /* compute the mean alpha (ma) = mean Ts/Tv */
775 for (i1 = 0; i1 < npair; i1++)
776 /* some values of tstvr are -Inf if there is no
777 transversions observed: we exclude them */
778 if (R_FINITE(tstvr[i1])) {
784 /* compute the distance for each pair */
786 for (i1 = 1; i1 < *n; i1++) {
787 for (i2 = i1 + 1; i2 <= *n; i2++) {
789 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
790 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
791 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
793 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A*L[target]);
799 #define DO_CONTINGENCY_NUCLEOTIDES\
801 case 136 : m = 0; break;\
802 case 72 : m = 1; break;\
803 case 40 : m = 2; break;\
804 case 24 : m = 3; break;\
807 case 72 : m += 4; break;\
808 case 40 : m += 8; break;\
809 case 24 : m += 12; break;\
813 #define COMPUTE_DIST_LogDet\
814 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
815 d[target] = -log(detFourByFour(Ftab))/4 - LN4;\
817 /* For the inversion, we first make U an identity matrix */\
818 for (k = 1; k < 15; k++) U[k] = 0;\
819 U[0] = U[5] = U[10] = U[15] = 1;\
820 /* The matrix is not symmetric, so we use 'dgesv'. */\
821 /* This subroutine puts the result in U. */\
822 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
823 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
824 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
825 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
826 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
827 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
828 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
829 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
830 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] - 16)/(L*16);\
833 void distDNA_LogDet(unsigned char *x, int *n, int *s, double *d,
834 int *variance, double *var)
836 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
837 double Ftab[16], U[16];
842 for (i1 = 1; i1 < *n; i1++) {
843 for (i2 = i1 + 1; i2 <= *n; i2++) {
844 for (k = 0; k < 16; k++) Ntab[k] = 0;
845 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
846 DO_CONTINGENCY_NUCLEOTIDES
854 void distDNA_LogDet_pairdel(unsigned char *x, int *n, int *s, double *d,
855 int *variance, double *var)
857 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
858 double Ftab[16], U[16];
861 for (i1 = 1; i1 < *n; i1++) {
862 for (i2 = i1 + 1; i2 <= *n; i2++) {
863 for (k = 0; k < 16; k++) Ntab[k] = 0;
865 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
866 CHECK_PAIRWISE_DELETION
867 DO_CONTINGENCY_NUCLEOTIDES
875 void distDNA_BH87(unsigned char *x, int *n, int *s, double *d,
876 int *variance, double *var)
878 For the moment there is no need to check for pairwise deletions
879 since DO_CONTINGENCY_NUCLEOTIDES considers only the known nucleotides.
880 In effect the pairwise deletion has possibly been done before.
881 The sequence length(s) are used only to compute the variances, which is
882 currently not available.
885 int i1, i2, k, kb, s1, s2, m, Ntab[16], ROWsums[4];
886 double P12[16], P21[16];
888 for (i1 = 1; i1 < *n; i1++) {
889 for (i2 = i1 + 1; i2 <= *n; i2++) {
890 for (k = 0; k < 16; k++) Ntab[k] = 0;
891 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
892 DO_CONTINGENCY_NUCLEOTIDES
895 /* get the rowwise sums of Ntab */
896 ROWsums[0] = Ntab[0] + Ntab[4] + Ntab[8] + Ntab[12];
897 ROWsums[1] = Ntab[1] + Ntab[5] + Ntab[9] + Ntab[13];
898 ROWsums[2] = Ntab[2] + Ntab[6] + Ntab[10] + Ntab[14];
899 ROWsums[3] = Ntab[3] + Ntab[7] + Ntab[11] + Ntab[15];
901 for (k = 0; k < 16; k++)
902 P12[k] = ((double) Ntab[k]);
904 /* scale each element of P12 by its rowwise sum */
905 for (k = 0; k < 4; k++)
906 for (kb = 0; kb < 16; kb += 4)
907 P12[k + kb] = P12[k + kb]/ROWsums[k];
909 d[*n*(i2 - 1) + i1 - 1] = -log(detFourByFour(P12))/4;
911 /* compute the columnwise sums of Ntab: these
912 are the rowwise sums of its transpose */
913 ROWsums[0] = Ntab[0] + Ntab[1] + Ntab[2] + Ntab[3];
914 ROWsums[1] = Ntab[4] + Ntab[5] + Ntab[6] + Ntab[7];
915 ROWsums[2] = Ntab[8] + Ntab[9] + Ntab[10] + Ntab[11];
916 ROWsums[3] = Ntab[12] + Ntab[13] + Ntab[14] + Ntab[15];
918 /* transpose Ntab and store the result in P21 */
919 for (k = 0; k < 4; k++)
920 for (kb = 0; kb < 4; kb++)
921 P21[kb + 4*k] = Ntab[k + 4*kb];
924 for (k = 0; k < 4; k++)
925 for (kb = 0; kb < 16; kb += 4)
926 P21[k + kb] = P21[k + kb]/ROWsums[k];
928 d[*n*(i1 - 1) + i2 - 1] = -log(detFourByFour(P21))/4;
933 #define COMPUTE_DIST_ParaLin\
934 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
935 d[target] = -log(detFourByFour(Ftab)/\
936 sqrt(find[0][i1 - 1]*find[1][i1 - 1]*find[2][i1 - 1]*find[3][i1 - 1]*\
937 find[0][i2 - 1]*find[1][i2 - 1]*find[2][i2 - 1]*find[3][i2 - 1]))/4;\
939 /* For the inversion, we first make U an identity matrix */\
940 for (k = 1; k < 15; k++) U[k] = 0;\
941 U[0] = U[5] = U[10] = U[15] = 1;\
942 /* The matrix is not symmetric, so we use 'dgesv'. */\
943 /* This subroutine puts the result in U. */\
944 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
945 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
946 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
947 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
948 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
949 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
950 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
951 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
952 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] -\
953 4*(1/sqrt(find[0][i1 - 1]*find[0][i2 - 1]) +\
954 1/sqrt(find[1][i1 - 1]*find[1][i2 - 1]) +\
955 1/sqrt(find[2][i1 - 1]*find[2][i2 - 1]) +\
956 1/sqrt(find[3][i1 - 1]*find[3][i2 - 1])))/(L*16);\
959 void distDNA_ParaLin(unsigned char *x, int *n, int *s, double *d,
960 int *variance, double *var)
962 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
963 double Ftab[16], U[16], *find[4];
967 for (k = 0; k < 4; k++)
968 find[k] = (double*)R_alloc(*n, sizeof(double));
970 for (i1 = 0; i1 < *n; i1++)
971 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
973 for (i1 = 0; i1 < *n; i1++) {
974 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
976 case 136 : find[0][i1]++; break;
977 case 40 : find[1][i1]++; break;
978 case 72 : find[2][i1]++; break;
979 case 24 : find[3][i1]++; break;
982 for (k = 0; k < 4; k++) find[k][i1] /= L;
986 for (i1 = 1; i1 < *n; i1++) {
987 for (i2 = i1 + 1; i2 <= *n; i2++) {
988 for (k = 0; k < 16; k++) Ntab[k] = 0;
989 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
990 DO_CONTINGENCY_NUCLEOTIDES
998 void distDNA_ParaLin_pairdel(unsigned char *x, int *n, int *s, double *d,
999 int *variance, double *var)
1001 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
1002 double Ftab[16], U[16], *find[4];
1006 for (k = 0; k < 4; k++)
1007 find[k] = (double*)R_alloc(*n, sizeof(double));
1009 for (i1 = 0; i1 < *n; i1++)
1010 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
1012 for (i1 = 0; i1 < *n; i1++) {
1014 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
1015 if (KnownBase(x[s1])) {
1018 case 136 : find[0][i1]++; break;
1019 case 40 : find[1][i1]++; break;
1020 case 72 : find[2][i1]++; break;
1021 case 24 : find[3][i1]++; break;
1025 for (k = 0; k < 4; k++) find[k][i1] /= L;
1029 for (i1 = 1; i1 < *n; i1++) {
1030 for (i2 = i1 + 1; i2 <= *n; i2++) {
1032 for (k = 0; k < 16; k++) Ntab[k] = 0;
1033 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
1034 CHECK_PAIRWISE_DELETION
1035 DO_CONTINGENCY_NUCLEOTIDES
1037 COMPUTE_DIST_ParaLin
1043 /* void BaseProportion(unsigned char *x, int *n, double *BF, int *freq) */
1048 /* for (i = 0; i < *n; i++) { */
1049 /* if (KnownBase(x[i])) { */
1051 /* switch (x[i]) { */
1052 /* case 136 : BF[0]++; break; */
1053 /* case 40 : BF[1]++; break; */
1054 /* case 72 : BF[2]++; break; */
1055 /* case 24 : BF[3]++; break; */
1059 /* if (! *freq) for (i = 0; i < 4; i++) BF[i] /= m; */
1062 /* void BaseProportion(unsigned char *x, int *n, double *BF) */
1066 /* for (i = 0; i < *n; i++) { */
1067 /* switch (x[i]) { */
1068 /* case 136 : BF[0]++; break; */
1069 /* case 40 : BF[1]++; break; */
1070 /* case 72 : BF[2]++; break; */
1071 /* case 24 : BF[3]++; break; */
1072 /* case 192 : BF[4]++; break; */
1073 /* case 160 : BF[5]++; break; */
1074 /* case 144 : BF[6]++; break; */
1075 /* case 96 : BF[7]++; break; */
1076 /* case 80 : BF[8]++; break; */
1077 /* case 48 : BF[9]++; break; */
1078 /* case 224 : BF[10]++; break; */
1079 /* case 176 : BF[11]++; break; */
1080 /* case 208 : BF[12]++; break; */
1081 /* case 112 : BF[13]++; break; */
1082 /* case 240 : BF[14]++; break; */
1083 /* case 4 : BF[15]++; break; */
1084 /* case 2 : BF[16]++; break; */
1089 /* a hash table is much faster than switch (2012-01-10) */
1090 void BaseProportion(unsigned char *x, int *n, double *BF)
1095 memset(count, 0, 256*sizeof(double));
1097 for (i = 0; i < *n; i++) count[x[i]]++;
1109 BF[10] = count[224];
1110 BF[11] = count[176];
1111 BF[12] = count[208];
1112 BF[13] = count[112];
1113 BF[14] = count[240];
1118 void SegSites(unsigned char *x, int *n, int *s, int *seg)
1121 unsigned char basis;
1123 for (j = 0; j < *s; j++) {
1125 while (!KnownBase(x[i])) i++;
1128 while (i < *n * (j + 1)) {
1129 if (!KnownBase(x[i]) || x[i] == basis) i++;
1138 void GlobalDeletionDNA(unsigned char *x, int *n, int *s, int *keep)
1142 for (j = 0; j < *s; j++) {
1144 while (i < *n * (j + 1)) {
1145 if (KnownBase(x[i])) i++;
1154 void dist_dna(unsigned char *x, int *n, int *s, int *model, double *d,
1155 double *BF, int *pairdel, int *variance, double *var,
1156 int *gamma, double *alpha)
1159 case 1 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 1);
1160 else distDNA_raw(x, n, s, d, 1); break;
1161 case 2 : if (pairdel) distDNA_JC69_pairdel(x, n, s, d, variance, var, gamma, alpha);
1162 else distDNA_JC69(x, n, s, d, variance, var, gamma, alpha); break;
1163 case 3 : if (pairdel) distDNA_K80_pairdel(x, n, s, d, variance, var, gamma, alpha);
1164 else distDNA_K80(x, n, s, d, variance, var, gamma, alpha); break;
1165 case 4 : if (pairdel) distDNA_F81_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1166 else distDNA_F81(x, n, s, d, BF, variance, var, gamma, alpha); break;
1167 case 5 : if (pairdel) distDNA_K81_pairdel(x, n, s, d, variance, var);
1168 else distDNA_K81(x, n, s, d, variance, var); break;
1169 case 6 : if (pairdel) distDNA_F84_pairdel(x, n, s, d, BF, variance, var);
1170 else distDNA_F84(x, n, s, d, BF, variance, var); break;
1171 case 7 : if (pairdel) distDNA_T92_pairdel(x, n, s, d, BF, variance, var);
1172 else distDNA_T92(x, n, s, d, BF, variance, var); break;
1173 case 8 : if (pairdel) distDNA_TN93_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1174 else distDNA_TN93(x, n, s, d, BF, variance, var, gamma, alpha); break;
1175 case 9 : if (pairdel) distDNA_GG95_pairdel(x, n, s, d, variance, var);
1176 else distDNA_GG95(x, n, s, d, variance, var); break;
1177 case 10 : if (pairdel) distDNA_LogDet_pairdel(x, n, s, d, variance, var);
1178 else distDNA_LogDet(x, n, s, d, variance, var); break;
1179 case 11 : distDNA_BH87(x, n, s, d, variance, var); break;
1180 case 12 : if (pairdel) distDNA_ParaLin_pairdel(x, n, s, d, variance, var);
1181 else distDNA_ParaLin(x, n, s, d, variance, var); break;
1182 case 13 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 0);
1183 else distDNA_raw(x, n, s, d, 0); break;
1184 case 14 : if (pairdel) distDNA_TsTv(x, n, s, d, 1, 1);
1185 else distDNA_TsTv(x, n, s, d, 1, 0); break;
1186 case 15 : if (pairdel) distDNA_TsTv(x, n, s, d, 0, 1);
1187 else distDNA_TsTv(x, n, s, d, 0, 0); break;
1188 case 16 : distDNA_indel(x, n, s, d); break;
1189 case 17 : distDNA_indelblock(x, n, s, d); break;