1 /* dist_dna.c 2012-11-28 */
3 /* Copyright 2005-2012 Emmanuel Paradis
5 /* This file is part of the R-package `ape'. */
6 /* See the file ../COPYING for licensing issues. */
8 #include <R_ext/Lapack.h>
11 /* from R: print(log(4), d = 22) */
12 #define LN4 1.386294361119890572454
14 /* returns 8 if the base is known surely, 0 otherwise */
15 #define KnownBase(a) (a & 8)
17 /* returns 1 if the base is adenine surely, 0 otherwise */
18 #define IsAdenine(a) a == 136
20 /* returns 1 if the base is guanine surely, 0 otherwise */
21 #define IsGuanine(a) a == 72
23 /* returns 1 if the base is cytosine surely, 0 otherwise */
24 #define IsCytosine(a) a == 40
26 /* returns 1 if the base is thymine surely, 0 otherwise */
27 #define IsThymine(a) a == 24
29 /* returns 1 if the base is a purine surely, 0 otherwise */
30 #define IsPurine(a) a > 63
32 /* returns 1 if the base is a pyrimidine surely, 0 otherwise */
33 #define IsPyrimidine(a) a < 64
35 /* returns 1 if both bases are different surely, 0 otherwise */
36 #define DifferentBase(a, b) (a & b) < 16
38 /* returns 1 if both bases are the same surely, 0 otherwise */
39 #define SameBase(a, b) KnownBase(a) && a == b
41 /* computes directly the determinant of a 4x4 matrix */
42 double detFourByFour(double *x)
44 double det, a33a44, a34a43, a34a42, a32a44, a32a43, a33a42, a34a41, a31a44, a31a43, a33a41, a31a42, a32a41;
46 a33a44 = x[10]*x[15]; a34a43 = x[14]*x[11];
47 a34a42 = x[14]*x[7]; a32a44 = x[6]*x[15];
48 a32a43 = x[6]*x[11]; a33a42 = x[10]*x[7];
49 a34a41 = x[14]*x[3]; a31a44 = x[2]*x[15];
50 a31a43 = x[2]*x[11]; a33a41 = x[10]*x[3];
51 a31a42 = x[2]*x[7]; a32a41 = x[6]*x[3];
53 det = x[0]*x[5]*(a33a44 - a34a43) + x[0]*x[9]*(a34a42 - a32a44) +
54 x[0]*x[13]*(a32a43 - a33a42) + x[4]*x[9]*(a31a44 - a34a41) +
55 x[4]*x[13]*(a33a41 - a31a43) + x[4]*x[1]*(a34a43 - a33a44) +
56 x[8]*x[13]*(a31a42 - a32a41) + x[8]*x[1]*(a32a44 - a34a42) +
57 x[8]*x[5]*(a34a41 - a31a44) + x[12]*x[1]*(a33a42 - a32a43) +
58 x[12]*x[5]*(a31a43 - a33a41) + x[12]*x[9]*(a32a41 - a31a42);
63 #define CHECK_PAIRWISE_DELETION\
64 if (KnownBase(x[s1]) && KnownBase(x[s2])) L++;\
68 if (SameBase(x[s1], x[s2])) continue;\
70 if (IsPurine(x[s1]) && IsPurine(x[s2])) {\
74 if (IsPyrimidine(x[s1]) && IsPyrimidine(x[s2])) Ns++;
76 void distDNA_indel(unsigned char *x, int *n, int *s, double *d)
78 int i1, i2, s1, s2, target, N;
81 for (i1 = 1; i1 < *n; i1++) {
82 for (i2 = i1 + 1; i2 <= *n; i2++) {
85 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
86 if ((x[s1] ^ x[s2]) & 4) N++;
88 d[target] = ((double) N);
94 #define X(i, j) i - 1 + *n * (j - 1)
96 void distDNA_indelblock(unsigned char *x, int *n, int *s, double *d)
98 int i1, i2, s1, s2, target, N, start_block, end_block;
100 for (i1 = 1; i1 <= *n; i1++) {
102 /* find a block of one or more '-' */
106 if (x[X(i1, s1)] == 4) {
108 while (x[X(i1, s1)] == 4) s1++;
111 /* then look if the same block is present in all the other sequences */
113 for (i2 = 1; i2 <= *n; i2++) {
114 if (i2 == i1) continue;
116 target = give_index(i1, i2, *n);
118 if (start_block > 1) {
119 if (x[X(i2, start_block - 1)] == 4) {
124 if (end_block < *s) {
125 if (x[X(i2, end_block + 1)] == 4) {
130 for (s2 = start_block; s2 <= end_block; s2++) {
131 if (x[X(i2, s2)] != 4) {
146 void distDNA_TsTv(unsigned char *x, int *n, int *s, double *d, int Ts, int pairdel)
148 int i1, i2, s1, s2, target, Nd, Ns;
151 for (i1 = 1; i1 < *n; i1++) {
152 for (i2 = i1 + 1; i2 <= *n; i2++) {
154 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
155 if (pairdel && !(KnownBase(x[s1]) && KnownBase(x[s2]))) continue;
158 if (Ts) d[target] = ((double) Ns); /* output number of transitions */
159 else d[target] = ((double) Nd - Ns); /* output number of transversions */
165 void distDNA_raw(unsigned char *x, int *n, int *s, double *d, int scaled)
167 int i1, i2, s1, s2, target, Nd;
170 for (i1 = 1; i1 < *n; i1++) {
171 for (i2 = i1 + 1; i2 <= *n; i2++) {
173 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
174 if (DifferentBase(x[s1], x[s2])) Nd++;
175 if (scaled) d[target] = ((double) Nd / *s);
176 else d[target] = ((double) Nd);
182 void distDNA_raw_pairdel(unsigned char *x, int *n, int *s, double *d, int scaled)
184 int i1, i2, s1, s2, target, Nd, L;
187 for (i1 = 1; i1 < *n; i1++) {
188 for (i2 = i1 + 1; i2 <= *n; i2++) {
190 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
191 CHECK_PAIRWISE_DELETION
192 if (DifferentBase(x[s1], x[s2])) Nd++;
194 if (scaled) d[target] = ((double) Nd/L);
195 else d[target] = ((double) Nd);
201 #define COMPUTE_DIST_JC69\
202 p = ((double) Nd/L);\
204 d[target] = 0.75 * *alpha*(pow(1 - 4*p/3, -1/ *alpha) - 1);\
205 else d[target] = -0.75*log(1 - 4*p/3);\
207 if (*gamma) var[target] = p*(1 - p)/(pow(1 - 4*p/3, -2/(*alpha + 1)) * L);\
208 else var[target] = p*(1 - p)/(pow(1 - 4*p/3, 2)*L);\
211 void distDNA_JC69(unsigned char *x, int *n, int *s, double *d,
212 int *variance, double *var, int *gamma, double *alpha)
214 int i1, i2, s1, s2, target, Nd, L;
220 for (i1 = 1; i1 < *n; i1++) {
221 for (i2 = i1 + 1; i2 <= *n; i2++) {
223 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
224 if (DifferentBase(x[s1], x[s2])) Nd++;
231 void distDNA_JC69_pairdel(unsigned char *x, int *n, int *s, double *d,
232 int *variance, double *var, int *gamma, double *alpha)
234 int i1, i2, s1, s2, target, Nd, L;
238 for (i1 = 1; i1 < *n; i1++) {
239 for (i2 = i1 + 1; i2 <= *n; i2++) {
241 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
242 CHECK_PAIRWISE_DELETION
243 if (DifferentBase(x[s1], x[s2])) Nd++;
251 #define COMPUTE_DIST_K80\
252 P = ((double) Ns/L);\
253 Q = ((double) (Nd - Ns)/L);\
258 d[target] = *alpha * (pow(a1, b) + 0.5*pow(a2, b) - 1.5)/2;\
260 else d[target] = -0.5 * log(a1 * sqrt(a2));\
263 b = -(1 / *alpha + 1);\
272 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
275 void distDNA_K80(unsigned char *x, int *n, int *s, double *d,
276 int *variance, double *var, int *gamma, double *alpha)
278 int i1, i2, s1, s2, target, Nd, Ns, L;
279 double P, Q, a1, a2, b, c1, c2, c3;
284 for (i1 = 1; i1 < *n; i1++) {
285 for (i2 = i1 + 1; i2 <= *n; i2++) {
287 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
296 void distDNA_K80_pairdel(unsigned char *x, int *n, int *s, double *d,
297 int *variance, double *var, int *gamma, double *alpha)
299 int i1, i2, s1, s2, target, Nd, Ns, L;
300 double P, Q, a1, a2, b, c1, c2, c3;
303 for (i1 = 1; i1 < *n; i1++) {
304 for (i2 = i1 + 1; i2 <= *n; i2++) {
306 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
307 CHECK_PAIRWISE_DELETION
316 #define COMPUTE_DIST_F81\
317 p = ((double) Nd/L);\
318 if (*gamma) d[target] = E * *alpha * (pow(1 - p/E, -1/ *alpha) - 1);\
319 else d[target] = -E*log(1 - p/E);\
321 if (*gamma) var[target] = p*(1 - p)/(pow(1 - p/E, -2/(*alpha + 1)) * L);\
322 else var[target] = p*(1 - p)/(pow(1 - p/E, 2)*L);\
325 void distDNA_F81(unsigned char *x, int *n, int *s, double *d, double *BF,
326 int *variance, double *var, int *gamma, double *alpha)
328 int i1, i2, s1, s2, target, Nd, L;
332 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
335 for (i1 = 1; i1 < *n; i1++) {
336 for (i2 = i1 + 1; i2 <= *n; i2++) {
338 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
339 if (DifferentBase(x[s1], x[s2])) Nd++;
346 void distDNA_F81_pairdel(unsigned char *x, int *n, int *s, double *d, double *BF,
347 int *variance, double *var, int *gamma, double *alpha)
349 int i1, i2, s1, s2, target, Nd, L;
352 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
355 for (i1 = 1; i1 < *n; i1++) {
356 for (i2 = i1 + 1; i2 <= *n; i2++) {
358 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
359 CHECK_PAIRWISE_DELETION
360 if (DifferentBase(x[s1], x[s2])) Nd++;
368 #define COUNT_TS_TV1_TV2\
369 if (SameBase(x[s1], x[s2])) continue;\
371 if ((x[s1] | x[s2]) == 152 || (x[s1] | x[s2]) == 104) {\
375 if ((x[s1] | x[s2]) == 168 || (x[s1] | x[s2]) == 88) Nv2++;
378 #define COMPUTE_DIST_K81\
379 P = ((double) (Nd - Nv1 - Nv2)/L);\
380 Q = ((double) Nv1/L);\
381 R = ((double) Nv2/L);\
385 d[target] = -0.25*log(a1*a2*a3);\
387 a = (1/a1 + 1/a2)/2;\
388 b = (1/a1 + 1/a3)/2;\
389 c = (1/a2 + 1/a3)/2;\
390 var[target] = (a*a*P + b*b*Q + c*c*R - pow(a*P + b*Q + c*R, 2))/2;\
393 void distDNA_K81(unsigned char *x, int *n, int *s, double *d,
394 int *variance, double *var)
396 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
397 double P, Q, R, a1, a2, a3, a, b, c;
402 for (i1 = 1; i1 < *n; i1++) {
403 for (i2 = i1 + 1; i2 <= *n; i2++) {
405 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
414 void distDNA_K81_pairdel(unsigned char *x, int *n, int *s, double *d,
415 int *variance, double *var)
417 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
418 double P, Q, R, a1, a2, a3, a, b, c;
421 for (i1 = 1; i1 < *n; i1++) {
422 for (i2 = i1 + 1; i2 <= *n; i2++) {
423 Nd = Nv1 = Nv2 = L = 0;
424 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
425 CHECK_PAIRWISE_DELETION
434 #define PREPARE_BF_F84\
435 A = (BF[0]*BF[2])/(BF[0] + BF[2]) + (BF[1]*BF[3])/(BF[1] + BF[3]);\
436 B = BF[0]*BF[2] + BF[1]*BF[3];\
437 C = (BF[0] + BF[2])*(BF[1] + BF[3]);
439 #define COMPUTE_DIST_F84\
440 P = ((double) Ns/L);\
441 Q = ((double) (Nd - Ns)/L);\
442 d[target] = -2*A*log(1 - P/(2*A) - (A - B)*Q/(2*A*C)) + 2*(A - B - C)*log(1 - Q/(2*C));\
447 a = t1/(t1 - t2 - t3);\
448 b = A*(A - B)/(t1 - t2 - t3) - (A - B - C)/(C - Q/2);\
449 var[target] = (a*a*P + b*b*Q - pow(a*P + b*Q, 2))/L;\
452 void distDNA_F84(unsigned char *x, int *n, int *s, double *d,
453 double *BF, int *variance, double *var)
455 int i1, i2, Nd, Ns, L, target, s1, s2;
456 double P, Q, A, B, C, a, b, t1, t2, t3;
462 for (i1 = 1; i1 < *n; i1++) {
463 for (i2 = i1 + 1; i2 <= *n; i2++) {
465 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
474 void distDNA_F84_pairdel(unsigned char *x, int *n, int *s, double *d,
475 double *BF, int *variance, double *var)
477 int i1, i2, Nd, Ns, L, target, s1, s2;
478 double P, Q, A, B, C, a, b, t1, t2, t3;
483 for (i1 = 1; i1 < *n; i1++) {
484 for (i2 = i1 + 1; i2 <= *n; i2++) {
486 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
487 CHECK_PAIRWISE_DELETION
496 #define COMPUTE_DIST_T92\
497 P = ((double) Ns/L);\
498 Q = ((double) (Nd - Ns)/L);\
501 d[target] = -wg*log(a1) - 0.5*(1 - wg)*log(a2);\
505 c3 = wg*(c1 - c2) + c2;\
506 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
509 void distDNA_T92(unsigned char *x, int *n, int *s, double *d,
510 double *BF, int *variance, double *var)
512 int i1, i2, Nd, Ns, L, target, s1, s2;
513 double P, Q, wg, a1, a2, c1, c2, c3;
516 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
519 for (i1 = 1; i1 < *n; i1++) {
520 for (i2 = i1 + 1; i2 <= *n; i2++) {
522 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
531 void distDNA_T92_pairdel(unsigned char *x, int *n, int *s, double *d,
532 double *BF, int *variance, double *var)
534 int i1, i2, Nd, Ns, L, target, s1, s2;
535 double P, Q, wg, a1, a2, c1, c2, c3;
537 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
540 for (i1 = 1; i1 < *n; i1++) {
541 for (i2 = i1 + 1; i2 <= *n; i2++) {
543 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
544 CHECK_PAIRWISE_DELETION
553 /* returns 1 if one of the base is adenine and
554 the other one is guanine surely, 0 otherwise */
555 #define AdenineAndGuanine(a, b) (a | b) == 200
557 /* returns 1 if one of the base is cytosine and
558 the other one is thymine surely, 0 otherwise */
559 #define CytosineAndThymine(a, b) (a | b) == 56
561 #define PREPARE_BF_TN93\
564 k1 = 2 * BF[0] * BF[2] / gR;\
565 k2 = 2 * BF[1] * BF[3] / gY;\
566 k3 = 2 * (gR * gY - BF[0]*BF[2]*gY/gR - BF[1]*BF[3]*gR/gY);
568 #define COUNT_TS1_TS2_TV\
569 if (DifferentBase(x[s1], x[s2])) {\
571 if (AdenineAndGuanine(x[s1], x[s2])) {\
575 if (CytosineAndThymine(x[s1], x[s2])) Ns2++;\
578 #define COMPUTE_DIST_TN93\
579 P1 = ((double) Ns1/L);\
580 P2 = ((double) Ns2/L);\
581 Q = ((double) (Nd - Ns1 - Ns2)/L);\
582 w1 = 1 - P1/k1 - Q/(2*gR);\
583 w2 = 1 - P2/k2 - Q/(2*gY);\
584 w3 = 1 - Q/(2*gR*gY);\
586 k4 = 2*(BF[0]*BF[2] + BF[1]*BF[3] + gR*gY);\
591 c4 = k1*c1/(2*gR) + k2*c2/(2*gY) + k3*c3/(2*gR*gY);\
592 d[target] = *alpha * (k1*pow(w1, b) + k2*pow(w2, b) + k3*pow(w3, b) - k4);\
594 k4 = 2*((BF[0]*BF[0] + BF[2]*BF[2])/(2*gR*gR) + (BF[2]*BF[2] + BF[3]*BF[3])/(2*gY*gY));\
598 c4 = k1 * c1/(2 * gR) + k2 * c2/(2 * gY) + k4 * c3;\
599 d[target] = -k1*log(w1) - k2*log(w2) - k3*log(w3);\
602 var[target] = (c1*c1*P1 + c2*c2*P2 + c4*c4*Q - pow(c1*P1 + c2*P2 + c4*Q, 2))/L;
604 void distDNA_TN93(unsigned char *x, int *n, int *s, double *d,
605 double *BF, int *variance, double *var,
606 int *gamma, double *alpha)
608 int i1, i2, Nd, Ns1, Ns2, L, target, s1, s2;
609 double P1, P2, Q, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
616 for (i1 = 1; i1 < *n; i1++) {
617 for (i2 = i1 + 1; i2 <= *n; i2++) {
619 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
628 void distDNA_TN93_pairdel(unsigned char *x, int *n, int *s, double *d,
629 double *BF, int *variance, double *var,
630 int *gamma, double *alpha)
632 int i1, i2, Nd, Ns1, Ns2, L, target, s1, s2;
633 double P1, P2, Q, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
638 for (i1 = 1; i1 < *n; i1++) {
639 for (i2 = i1 + 1; i2 <= *n; i2++) {
640 Nd = Ns1 = Ns2 = L = 0;
641 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
642 CHECK_PAIRWISE_DELETION
651 void distDNA_GG95(unsigned char *x, int *n, int *s, double *d,
652 int *variance, double *var)
654 int i1, i2, s1, s2, target, GC, Nd, Ns, tl, npair;
655 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
662 npair = *n * (*n - 1) / 2;
664 theta = (double*)R_alloc(*n, sizeof(double));
665 P = (double*)R_alloc(npair, sizeof(double));
666 Q = (double*)R_alloc(npair, sizeof(double));
667 tstvr = (double*)R_alloc(npair, sizeof(double));
669 /* get the proportion of GC (= theta) in each sequence */
670 for (i1 = 1; i1 <= *n; i1++) {
672 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n)
673 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
674 theta[i1 - 1] = ((double) GC / *s);
677 /* get the proportions of transitions and transversions,
678 and the estimates of their ratio for each pair */
680 for (i1 = 1; i1 < *n; i1++) {
681 for (i2 = i1 + 1; i2 <= *n; i2++) {
683 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
686 P[target] = ((double) Ns / *s);
687 Q[target] = ((double) (Nd - Ns) / *s);
688 A = log(1 - 2*Q[target]);
689 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
694 /* compute the mean alpha (ma) = mean Ts/Tv */
697 for (i1 = 0; i1 < npair; i1++)
698 /* some values of tstvr are -Inf if there is no
699 transversions observed: we exclude them */
700 if (R_FINITE(tstvr[i1])) {
706 /* compute the distance for each pair */
708 for (i1 = 1; i1 < *n; i1++) {
709 for (i2 = i1 + 1; i2 <= *n; i2++) {
711 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
712 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
713 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
715 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A * *s);
721 void distDNA_GG95_pairdel(unsigned char *x, int *n, int *s, double *d,
722 int *variance, double *var)
724 int i1, i2, s1, s2, target, *L, length, GC, Nd, Ns, tl, npair;
725 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
733 npair = *n * (*n - 1) / 2;
735 theta = (double*)R_alloc(*n, sizeof(double));
736 L = (int*)R_alloc(npair, sizeof(int));
737 P = (double*)R_alloc(npair, sizeof(double));
738 Q = (double*)R_alloc(npair, sizeof(double));
739 tstvr = (double*)R_alloc(npair, sizeof(double));
741 /* get the proportion of GC (= theta) in each sequence */
742 for (i1 = 1; i1 <= *n; i1++) {
744 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n) {
745 if (KnownBase(x[s1])) tl++;
747 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
749 theta[i1 - 1] = ((double) GC / tl);
752 /* get the proportions of transitions and transversions,
753 and the estimates of their ratio for each pair; we
754 also get the sample size for each pair in L */
756 for (i1 = 1; i1 < *n; i1++) {
757 for (i2 = i1 + 1; i2 <= *n; i2++) {
758 Nd = Ns = L[target] = 0;
759 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
760 if (KnownBase(x[s1]) && KnownBase(x[s2])) L[target]++;
764 P[target] = ((double) Ns/L[target]);
765 Q[target] = ((double) (Nd - Ns)/L[target]);
766 A = log(1 - 2*Q[target]);
767 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
772 /* compute the mean alpha (ma) = mean Ts/Tv */
775 for (i1 = 0; i1 < npair; i1++)
776 /* some values of tstvr are -Inf if there is no
777 transversions observed: we exclude them */
778 if (R_FINITE(tstvr[i1])) {
784 /* compute the distance for each pair */
786 for (i1 = 1; i1 < *n; i1++) {
787 for (i2 = i1 + 1; i2 <= *n; i2++) {
789 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
790 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
791 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
793 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A*L[target]);
799 #define DO_CONTINGENCY_NUCLEOTIDES\
801 case 136 : m = 0; break;\
802 case 72 : m = 1; break;\
803 case 40 : m = 2; break;\
804 case 24 : m = 3; break;\
807 case 72 : m += 4; break;\
808 case 40 : m += 8; break;\
809 case 24 : m += 12; break;\
813 #define COMPUTE_DIST_LogDet\
814 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
815 d[target] = -log(detFourByFour(Ftab))/4 - LN4;\
817 /* For the inversion, we first make U an identity matrix */\
818 for (k = 1; k < 15; k++) U[k] = 0;\
819 U[0] = U[5] = U[10] = U[15] = 1;\
820 /* The matrix is not symmetric, so we use 'dgesv'. */\
821 /* This subroutine puts the result in U. */\
822 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
823 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
824 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
825 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
826 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
827 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
828 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
829 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
830 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] - 16)/(L*16);\
833 void distDNA_LogDet(unsigned char *x, int *n, int *s, double *d,
834 int *variance, double *var)
836 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
837 double Ftab[16], U[16];
842 for (i1 = 1; i1 < *n; i1++) {
843 for (i2 = i1 + 1; i2 <= *n; i2++) {
844 for (k = 0; k < 16; k++) Ntab[k] = 0;
845 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
846 DO_CONTINGENCY_NUCLEOTIDES
854 void distDNA_LogDet_pairdel(unsigned char *x, int *n, int *s, double *d,
855 int *variance, double *var)
857 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
858 double Ftab[16], U[16];
861 for (i1 = 1; i1 < *n; i1++) {
862 for (i2 = i1 + 1; i2 <= *n; i2++) {
863 for (k = 0; k < 16; k++) Ntab[k] = 0;
865 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
866 CHECK_PAIRWISE_DELETION
867 DO_CONTINGENCY_NUCLEOTIDES
875 void distDNA_BH87(unsigned char *x, int *n, int *s, double *d,
876 int *variance, double *var)
878 For the moment there is no need to check for pairwise deletions
879 since DO_CONTINGENCY_NUCLEOTIDES considers only the known nucleotides.
880 In effect the pairwise deletion has possibly been done before.
881 The sequence length(s) are used only to compute the variances, which is
882 currently not available.
885 int i1, i2, k, kb, s1, s2, m, Ntab[16], ROWsums[4];
886 double P12[16], P21[16];
888 for (i1 = 1; i1 < *n; i1++) {
889 for (i2 = i1 + 1; i2 <= *n; i2++) {
890 for (k = 0; k < 16; k++) Ntab[k] = 0;
891 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
892 DO_CONTINGENCY_NUCLEOTIDES
895 /* get the rowwise sums of Ntab */
896 ROWsums[0] = Ntab[0] + Ntab[4] + Ntab[8] + Ntab[12];
897 ROWsums[1] = Ntab[1] + Ntab[5] + Ntab[9] + Ntab[13];
898 ROWsums[2] = Ntab[2] + Ntab[6] + Ntab[10] + Ntab[14];
899 ROWsums[3] = Ntab[3] + Ntab[7] + Ntab[11] + Ntab[15];
901 for (k = 0; k < 16; k++)
902 P12[k] = ((double) Ntab[k]);
904 /* scale each element of P12 by its rowwise sum */
905 for (k = 0; k < 4; k++)
906 for (kb = 0; kb < 16; kb += 4)
907 P12[k + kb] = P12[k + kb]/ROWsums[k];
909 d[*n*(i2 - 1) + i1 - 1] = -log(detFourByFour(P12))/4;
911 /* compute the columnwise sums of Ntab: these
912 are the rowwise sums of its transpose */
913 ROWsums[0] = Ntab[0] + Ntab[1] + Ntab[2] + Ntab[3];
914 ROWsums[1] = Ntab[4] + Ntab[5] + Ntab[6] + Ntab[7];
915 ROWsums[2] = Ntab[8] + Ntab[9] + Ntab[10] + Ntab[11];
916 ROWsums[3] = Ntab[12] + Ntab[13] + Ntab[14] + Ntab[15];
918 /* transpose Ntab and store the result in P21 */
919 for (k = 0; k < 4; k++)
920 for (kb = 0; kb < 4; kb++)
921 P21[kb + 4*k] = Ntab[k + 4*kb];
924 for (k = 0; k < 4; k++)
925 for (kb = 0; kb < 16; kb += 4)
926 P21[k + kb] = P21[k + kb]/ROWsums[k];
928 d[*n*(i1 - 1) + i2 - 1] = -log(detFourByFour(P21))/4;
933 #define COMPUTE_DIST_ParaLin\
934 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
935 d[target] = -log(detFourByFour(Ftab)/\
936 sqrt(find[0][i1 - 1]*find[1][i1 - 1]*find[2][i1 - 1]*find[3][i1 - 1]*\
937 find[0][i2 - 1]*find[1][i2 - 1]*find[2][i2 - 1]*find[3][i2 - 1]))/4;\
939 /* For the inversion, we first make U an identity matrix */\
940 for (k = 1; k < 15; k++) U[k] = 0;\
941 U[0] = U[5] = U[10] = U[15] = 1;\
942 /* The matrix is not symmetric, so we use 'dgesv'. */\
943 /* This subroutine puts the result in U. */\
944 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
945 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
946 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
947 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
948 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
949 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
950 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
951 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
952 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] -\
953 4*(1/sqrt(find[0][i1 - 1]*find[0][i2 - 1]) +\
954 1/sqrt(find[1][i1 - 1]*find[1][i2 - 1]) +\
955 1/sqrt(find[2][i1 - 1]*find[2][i2 - 1]) +\
956 1/sqrt(find[3][i1 - 1]*find[3][i2 - 1])))/(L*16);\
959 void distDNA_ParaLin(unsigned char *x, int *n, int *s, double *d,
960 int *variance, double *var)
962 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
963 double Ftab[16], U[16], *find[4];
967 for (k = 0; k < 4; k++)
968 find[k] = (double*)R_alloc(*n, sizeof(double));
970 for (i1 = 0; i1 < *n; i1++)
971 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
973 for (i1 = 0; i1 < *n; i1++) {
974 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
976 case 136 : find[0][i1]++; break;
977 case 40 : find[1][i1]++; break;
978 case 72 : find[2][i1]++; break;
979 case 24 : find[3][i1]++; break;
982 for (k = 0; k < 4; k++) find[k][i1] /= L;
986 for (i1 = 1; i1 < *n; i1++) {
987 for (i2 = i1 + 1; i2 <= *n; i2++) {
988 for (k = 0; k < 16; k++) Ntab[k] = 0;
989 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
990 DO_CONTINGENCY_NUCLEOTIDES
998 void distDNA_ParaLin_pairdel(unsigned char *x, int *n, int *s, double *d,
999 int *variance, double *var)
1001 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
1002 double Ftab[16], U[16], *find[4];
1006 for (k = 0; k < 4; k++)
1007 find[k] = (double*)R_alloc(*n, sizeof(double));
1009 for (i1 = 0; i1 < *n; i1++)
1010 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
1012 for (i1 = 0; i1 < *n; i1++) {
1014 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
1015 if (KnownBase(x[s1])) {
1018 case 136 : find[0][i1]++; break;
1019 case 40 : find[1][i1]++; break;
1020 case 72 : find[2][i1]++; break;
1021 case 24 : find[3][i1]++; break;
1025 for (k = 0; k < 4; k++) find[k][i1] /= L;
1029 for (i1 = 1; i1 < *n; i1++) {
1030 for (i2 = i1 + 1; i2 <= *n; i2++) {
1032 for (k = 0; k < 16; k++) Ntab[k] = 0;
1033 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
1034 CHECK_PAIRWISE_DELETION
1035 DO_CONTINGENCY_NUCLEOTIDES
1037 COMPUTE_DIST_ParaLin
1043 /* a hash table is much faster than switch (2012-01-10) */
1044 void BaseProportion(unsigned char *x, int *n, double *BF)
1049 memset(count, 0, 256*sizeof(double));
1051 for (i = 0; i < *n; i++) count[x[i]]++;
1063 BF[10] = count[224];
1064 BF[11] = count[176];
1065 BF[12] = count[208];
1066 BF[13] = count[112];
1067 BF[14] = count[240];
1072 void SegSites(unsigned char *x, int *n, int *s, int *seg)
1075 unsigned char basis;
1077 for (j = 0; j < *s; j++) {
1079 while (!KnownBase(x[i])) i++;
1082 while (i < *n * (j + 1)) {
1083 if (!KnownBase(x[i]) || x[i] == basis) i++;
1092 void GlobalDeletionDNA(unsigned char *x, int *n, int *s, int *keep)
1096 for (j = 0; j < *s; j++) {
1098 while (i < *n * (j + 1)) {
1099 if (KnownBase(x[i])) i++;
1108 void dist_dna(unsigned char *x, int *n, int *s, int *model, double *d,
1109 double *BF, int *pairdel, int *variance, double *var,
1110 int *gamma, double *alpha)
1113 case 1 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 1);
1114 else distDNA_raw(x, n, s, d, 1); break;
1115 case 2 : if (pairdel) distDNA_JC69_pairdel(x, n, s, d, variance, var, gamma, alpha);
1116 else distDNA_JC69(x, n, s, d, variance, var, gamma, alpha); break;
1117 case 3 : if (pairdel) distDNA_K80_pairdel(x, n, s, d, variance, var, gamma, alpha);
1118 else distDNA_K80(x, n, s, d, variance, var, gamma, alpha); break;
1119 case 4 : if (pairdel) distDNA_F81_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1120 else distDNA_F81(x, n, s, d, BF, variance, var, gamma, alpha); break;
1121 case 5 : if (pairdel) distDNA_K81_pairdel(x, n, s, d, variance, var);
1122 else distDNA_K81(x, n, s, d, variance, var); break;
1123 case 6 : if (pairdel) distDNA_F84_pairdel(x, n, s, d, BF, variance, var);
1124 else distDNA_F84(x, n, s, d, BF, variance, var); break;
1125 case 7 : if (pairdel) distDNA_T92_pairdel(x, n, s, d, BF, variance, var);
1126 else distDNA_T92(x, n, s, d, BF, variance, var); break;
1127 case 8 : if (pairdel) distDNA_TN93_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1128 else distDNA_TN93(x, n, s, d, BF, variance, var, gamma, alpha); break;
1129 case 9 : if (pairdel) distDNA_GG95_pairdel(x, n, s, d, variance, var);
1130 else distDNA_GG95(x, n, s, d, variance, var); break;
1131 case 10 : if (pairdel) distDNA_LogDet_pairdel(x, n, s, d, variance, var);
1132 else distDNA_LogDet(x, n, s, d, variance, var); break;
1133 case 11 : distDNA_BH87(x, n, s, d, variance, var); break;
1134 case 12 : if (pairdel) distDNA_ParaLin_pairdel(x, n, s, d, variance, var);
1135 else distDNA_ParaLin(x, n, s, d, variance, var); break;
1136 case 13 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 0);
1137 else distDNA_raw(x, n, s, d, 0); break;
1138 case 14 : if (pairdel) distDNA_TsTv(x, n, s, d, 1, 1);
1139 else distDNA_TsTv(x, n, s, d, 1, 0); break;
1140 case 15 : if (pairdel) distDNA_TsTv(x, n, s, d, 0, 1);
1141 else distDNA_TsTv(x, n, s, d, 0, 0); break;
1142 case 16 : distDNA_indel(x, n, s, d); break;
1143 case 17 : distDNA_indelblock(x, n, s, d); break;
1147 void where(unsigned char *x, unsigned char *pat, int *s, int *p,
1152 ln = 0; /* local n */
1154 for (i = 0; i <= *s - *p; i++) {
1157 if (x[k] != pat[j]) break;