1 /* dist_dna.c 2010-03-16 */
3 /* Copyright 2005-2010 Emmanuel Paradis
5 /* This file is part of the R-package `ape'. */
6 /* See the file ../COPYING for licensing issues. */
9 #include <R_ext/Lapack.h>
11 /* from R: print(log(4), d = 22) */
12 #define LN4 1.386294361119890572454
14 /* returns 8 if the base is known surely, 0 otherwise */
15 #define KnownBase(a) (a & 8)
17 /* returns 1 if the base is adenine surely, 0 otherwise */
18 #define IsAdenine(a) a == 136
20 /* returns 1 if the base is guanine surely, 0 otherwise */
21 #define IsGuanine(a) a == 72
23 /* returns 1 if the base is cytosine surely, 0 otherwise */
24 #define IsCytosine(a) a == 40
26 /* returns 1 if the base is thymine surely, 0 otherwise */
27 #define IsThymine(a) a == 24
29 /* returns 1 if the base is a purine surely, 0 otherwise */
30 #define IsPurine(a) a > 63
32 /* returns 1 if the base is a pyrimidine surely, 0 otherwise */
33 #define IsPyrimidine(a) a < 64
35 /* returns 1 if both bases are different surely, 0 otherwise */
36 #define DifferentBase(a, b) (a & b) < 16
38 /* returns 1 if both bases are the same surely, 0 otherwise */
39 #define SameBase(a, b) KnownBase(a) && a == b
41 /* computes directly the determinant of a 4x4 matrix */
42 double detFourByFour(double *x)
44 double det, a33a44, a34a43, a34a42, a32a44, a32a43, a33a42, a34a41, a31a44, a31a43, a33a41, a31a42, a32a41;
46 a33a44 = x[10]*x[15]; a34a43 = x[14]*x[11];
47 a34a42 = x[14]*x[7]; a32a44 = x[6]*x[15];
48 a32a43 = x[6]*x[11]; a33a42 = x[10]*x[7];
49 a34a41 = x[14]*x[3]; a31a44 = x[2]*x[15];
50 a31a43 = x[2]*x[11]; a33a41 = x[10]*x[3];
51 a31a42 = x[2]*x[7]; a32a41 = x[6]*x[3];
53 det = x[0]*x[5]*(a33a44 - a34a43) + x[0]*x[9]*(a34a42 - a32a44) +
54 x[0]*x[13]*(a32a43 - a33a42) + x[4]*x[9]*(a31a44 - a34a41) +
55 x[4]*x[13]*(a33a41 - a31a43) + x[4]*x[1]*(a34a43 - a33a44) +
56 x[8]*x[13]*(a31a42 - a32a41) + x[8]*x[1]*(a32a44 - a34a42) +
57 x[8]*x[5]*(a34a41 - a31a44) + x[12]*x[1]*(a33a42 - a32a43) +
58 x[12]*x[5]*(a31a43 - a33a41) + x[12]*x[9]*(a32a41 - a31a42);
63 #define CHECK_PAIRWISE_DELETION\
64 if (KnownBase(x[s1]) && KnownBase(x[s2])) L++;\
67 void distDNA_raw(unsigned char *x, int *n, int *s, double *d, int scaled)
69 int i1, i2, s1, s2, target, Nd;
72 for (i1 = 1; i1 < *n; i1++) {
73 for (i2 = i1 + 1; i2 <= *n; i2++) {
75 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
76 if (DifferentBase(x[s1], x[s2])) Nd++;
77 if (scaled) d[target] = ((double) Nd / *s);
78 else d[target] = ((double) Nd);
84 void distDNA_raw_pairdel(unsigned char *x, int *n, int *s, double *d, int scaled)
86 int i1, i2, s1, s2, target, Nd, L;
89 for (i1 = 1; i1 < *n; i1++) {
90 for (i2 = i1 + 1; i2 <= *n; i2++) {
92 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
93 CHECK_PAIRWISE_DELETION
94 if (DifferentBase(x[s1], x[s2])) Nd++;
96 if (scaled) d[target] = ((double) Nd/L);
97 else d[target] = ((double) Nd);
103 #define COMPUTE_DIST_JC69\
104 p = ((double) Nd/L);\
106 d[target] = 0.75 * *alpha*(pow(1 - 4*p/3, -1/ *alpha) - 1);\
107 else d[target] = -0.75*log(1 - 4*p/3);\
109 if (*gamma) var[target] = p*(1 - p)/(pow(1 - 4*p/3, -2/(*alpha + 1)) * L);\
110 else var[target] = p*(1 - p)/(pow(1 - 4*p/3, 2)*L);\
113 void distDNA_JC69(unsigned char *x, int *n, int *s, double *d,
114 int *variance, double *var, int *gamma, double *alpha)
116 int i1, i2, s1, s2, target, Nd, L;
122 for (i1 = 1; i1 < *n; i1++) {
123 for (i2 = i1 + 1; i2 <= *n; i2++) {
125 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
126 if (DifferentBase(x[s1], x[s2])) Nd++;
133 void distDNA_JC69_pairdel(unsigned char *x, int *n, int *s, double *d,
134 int *variance, double *var, int *gamma, double *alpha)
136 int i1, i2, s1, s2, target, Nd, L;
140 for (i1 = 1; i1 < *n; i1++) {
141 for (i2 = i1 + 1; i2 <= *n; i2++) {
143 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
144 CHECK_PAIRWISE_DELETION
145 if (DifferentBase(x[s1], x[s2])) Nd++;
154 if (SameBase(x[s1], x[s2])) continue;\
156 if (IsPurine(x[s1]) && IsPurine(x[s2])) {\
160 if (IsPyrimidine(x[s1]) && IsPyrimidine(x[s2])) Ns++;
162 #define COMPUTE_DIST_K80\
163 P = ((double) Ns/L);\
164 Q = ((double) (Nd - Ns)/L);\
169 d[target] = *alpha * (pow(a1, b) + 0.5*pow(a2, b) - 1.5)/2;\
171 else d[target] = -0.5 * log(a1 * sqrt(a2));\
174 b = -(1 / *alpha + 1);\
183 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
186 void distDNA_K80(unsigned char *x, int *n, int *s, double *d,
187 int *variance, double *var, int *gamma, double *alpha)
189 int i1, i2, s1, s2, target, Nd, Ns, L;
190 double P, Q, a1, a2, b, c1, c2, c3;
195 for (i1 = 1; i1 < *n; i1++) {
196 for (i2 = i1 + 1; i2 <= *n; i2++) {
198 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
207 void distDNA_K80_pairdel(unsigned char *x, int *n, int *s, double *d,
208 int *variance, double *var, int *gamma, double *alpha)
210 int i1, i2, s1, s2, target, Nd, Ns, L;
211 double P, Q, a1, a2, b, c1, c2, c3;
214 for (i1 = 1; i1 < *n; i1++) {
215 for (i2 = i1 + 1; i2 <= *n; i2++) {
217 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
218 CHECK_PAIRWISE_DELETION
227 #define COMPUTE_DIST_F81\
228 p = ((double) Nd/L);\
229 if (*gamma) d[target] = E * *alpha * (pow(1 - p/E, -1/ *alpha) - 1);\
230 else d[target] = -E*log(1 - p/E);\
232 if (*gamma) var[target] = p*(1 - p)/(pow(1 - p/E, -2/(*alpha + 1)) * L);\
233 else var[target] = p*(1 - p)/(pow(1 - p/E, 2)*L);\
236 void distDNA_F81(unsigned char *x, int *n, int *s, double *d, double *BF,
237 int *variance, double *var, int *gamma, double *alpha)
239 int i1, i2, s1, s2, target, Nd, L;
243 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
246 for (i1 = 1; i1 < *n; i1++) {
247 for (i2 = i1 + 1; i2 <= *n; i2++) {
249 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
250 if (DifferentBase(x[s1], x[s2])) Nd++;
257 void distDNA_F81_pairdel(unsigned char *x, int *n, int *s, double *d, double *BF,
258 int *variance, double *var, int *gamma, double *alpha)
260 int i1, i2, s1, s2, target, Nd, L;
263 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
266 for (i1 = 1; i1 < *n; i1++) {
267 for (i2 = i1 + 1; i2 <= *n; i2++) {
269 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
270 CHECK_PAIRWISE_DELETION
271 if (DifferentBase(x[s1], x[s2])) Nd++;
279 #define COUNT_TS_TV1_TV2\
280 if (SameBase(x[s1], x[s2])) continue;\
282 if ((x[s1] | x[s2]) == 152 || (x[s1] | x[s2]) == 104) {\
286 if ((x[s1] | x[s2]) == 168 || (x[s1] | x[s2]) == 88) Nv2++;
289 #define COMPUTE_DIST_K81\
290 P = ((double) (Nd - Nv1 - Nv2)/L);\
291 Q = ((double) Nv1/L);\
292 R = ((double) Nv2/L);\
296 d[target] = -0.25*log(a1*a2*a3);\
298 a = (1/a1 + 1/a2)/2;\
299 b = (1/a1 + 1/a3)/2;\
300 c = (1/a2 + 1/a3)/2;\
301 var[target] = (a*a*P + b*b*Q + c*c*R - pow(a*P + b*Q + c*R, 2))/2;\
304 void distDNA_K81(unsigned char *x, int *n, int *s, double *d,
305 int *variance, double *var)
307 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
308 double P, Q, R, a1, a2, a3, a, b, c;
313 for (i1 = 1; i1 < *n; i1++) {
314 for (i2 = i1 + 1; i2 <= *n; i2++) {
316 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
325 void distDNA_K81_pairdel(unsigned char *x, int *n, int *s, double *d,
326 int *variance, double *var)
328 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
329 double P, Q, R, a1, a2, a3, a, b, c;
332 for (i1 = 1; i1 < *n; i1++) {
333 for (i2 = i1 + 1; i2 <= *n; i2++) {
334 Nd = Nv1 = Nv2 = L = 0;
335 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
336 CHECK_PAIRWISE_DELETION
345 #define PREPARE_BF_F84\
346 A = (BF[0]*BF[2])/(BF[0] + BF[2]) + (BF[1]*BF[3])/(BF[1] + BF[3]);\
347 B = BF[0]*BF[2] + BF[1]*BF[3];\
348 C = (BF[0] + BF[2])*(BF[1] + BF[3]);
350 #define COMPUTE_DIST_F84\
351 P = ((double) Ns/L);\
352 Q = ((double) (Nd - Ns)/L);\
353 d[target] = -2*A*log(1 - P/(2*A) - (A - B)*Q/(2*A*C)) + 2*(A - B - C)*log(1 - Q/(2*C));\
358 a = t1/(t1 - t2 - t3);\
359 b = A*(A - B)/(t1 - t2 - t3) - (A - B - C)/(C - Q/2);\
360 var[target] = (a*a*P + b*b*Q - pow(a*P + b*Q, 2))/L;\
363 void distDNA_F84(unsigned char *x, int *n, int *s, double *d,
364 double *BF, int *variance, double *var)
366 int i1, i2, Nd, Ns, L, target, s1, s2;
367 double P, Q, A, B, C, a, b, t1, t2, t3;
373 for (i1 = 1; i1 < *n; i1++) {
374 for (i2 = i1 + 1; i2 <= *n; i2++) {
376 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
385 void distDNA_F84_pairdel(unsigned char *x, int *n, int *s, double *d,
386 double *BF, int *variance, double *var)
388 int i1, i2, Nd, Ns, L, target, s1, s2;
389 double P, Q, A, B, C, a, b, t1, t2, t3;
394 for (i1 = 1; i1 < *n; i1++) {
395 for (i2 = i1 + 1; i2 <= *n; i2++) {
397 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
398 CHECK_PAIRWISE_DELETION
407 #define COMPUTE_DIST_T92\
408 P = ((double) Ns/L);\
409 Q = ((double) (Nd - Ns)/L);\
412 d[target] = -wg*log(a1) - 0.5*(1 - wg)*log(a2);\
416 c3 = wg*(c1 - c2) + c2;\
417 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
420 void distDNA_T92(unsigned char *x, int *n, int *s, double *d,
421 double *BF, int *variance, double *var)
423 int i1, i2, Nd, Ns, L, target, s1, s2;
424 double P, Q, wg, a1, a2, c1, c2, c3;
427 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
430 for (i1 = 1; i1 < *n; i1++) {
431 for (i2 = i1 + 1; i2 <= *n; i2++) {
433 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
442 void distDNA_T92_pairdel(unsigned char *x, int *n, int *s, double *d,
443 double *BF, int *variance, double *var)
445 int i1, i2, Nd, Ns, L, target, s1, s2;
446 double P, Q, wg, a1, a2, c1, c2, c3;
448 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
451 for (i1 = 1; i1 < *n; i1++) {
452 for (i2 = i1 + 1; i2 <= *n; i2++) {
454 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
455 CHECK_PAIRWISE_DELETION
464 /* returns 1 if one of the base is adenine and
465 the other one is guanine surely, 0 otherwise */
466 #define AdenineAndGuanine(a, b) (a | b) == 200
468 /* returns 1 if one of the base is cytosine and
469 the other one is thymine surely, 0 otherwise */
470 #define CytosineAndThymine(a, b) (a | b) == 56
472 #define PREPARE_BF_TN93\
475 k1 = 2 * BF[0] * BF[2] / gR;\
476 k2 = 2 * BF[1] * BF[3] / gY;\
477 k3 = 2 * (gR * gY - BF[0]*BF[2]*gY/gR - BF[1]*BF[3]*gR/gY);
479 #define COUNT_TS1_TS2_TV\
480 if (DifferentBase(x[s1], x[s2])) {\
482 if (AdenineAndGuanine(x[s1], x[s2])) {\
486 if (CytosineAndThymine(x[s1], x[s2])) Ns2++;\
489 #define COMPUTE_DIST_TN93\
490 P1 = ((double) Ns1/L);\
491 P2 = ((double) Ns2/L);\
492 Q = ((double) (Nd - Ns1 - Ns2)/L);\
493 w1 = 1 - P1/k1 - Q/(2*gR);\
494 w2 = 1 - P2/k2 - Q/(2*gY);\
495 w3 = 1 - Q/(2*gR*gY);\
497 k4 = 2*(BF[0]*BF[2] + BF[1]*BF[3] + gR*gY);\
502 c4 = k1*c1/(2*gR) + k2*c2/(2*gY) + k3*c3/(2*gR*gY);\
503 d[target] = *alpha * (k1*pow(w1, b) + k2*pow(w2, b) + k3*pow(w3, b) - k4);\
505 k4 = 2*((BF[0]*BF[0] + BF[2]*BF[2])/(2*gR*gR) + (BF[2]*BF[2] + BF[3]*BF[3])/(2*gY*gY));\
509 c4 = k1 * c1/(2 * gR) + k2 * c2/(2 * gY) + k4 * c3;\
510 d[target] = -k1*log(w1) - k2*log(w2) - k3*log(w3);\
513 var[target] = (c1*c1*P1 + c2*c2*P2 + c4*c4*Q - pow(c1*P1 + c2*P2 + c4*Q, 2))/L;
515 void distDNA_TN93(unsigned char *x, int *n, int *s, double *d,
516 double *BF, int *variance, double *var,
517 int *gamma, double *alpha)
519 int i1, i2, k, Nd, Ns1, Ns2, L, target, s1, s2;
520 double P1, P2, Q, A, B, C, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
527 for (i1 = 1; i1 < *n; i1++) {
528 for (i2 = i1 + 1; i2 <= *n; i2++) {
530 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
539 void distDNA_TN93_pairdel(unsigned char *x, int *n, int *s, double *d,
540 double *BF, int *variance, double *var,
541 int *gamma, double *alpha)
543 int i1, i2, k, Nd, Ns1, Ns2, L, target, s1, s2;
544 double P1, P2, Q, A, B, C, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
549 for (i1 = 1; i1 < *n; i1++) {
550 for (i2 = i1 + 1; i2 <= *n; i2++) {
551 Nd = Ns1 = Ns2 = L = 0;
552 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
553 CHECK_PAIRWISE_DELETION
562 void distDNA_GG95(unsigned char *x, int *n, int *s, double *d,
563 int *variance, double *var)
565 int i1, i2, s1, s2, target, GC, Nd, Ns, tl, npair;
566 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
573 npair = *n * (*n - 1) / 2;
575 theta = (double*)R_alloc(*n, sizeof(double));
576 P = (double*)R_alloc(npair, sizeof(double));
577 Q = (double*)R_alloc(npair, sizeof(double));
578 tstvr = (double*)R_alloc(npair, sizeof(double));
580 /* get the proportion of GC (= theta) in each sequence */
581 for (i1 = 1; i1 <= *n; i1++) {
583 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n)
584 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
585 theta[i1 - 1] = ((double) GC / *s);
588 /* get the proportions of transitions and transversions,
589 and the estimates of their ratio for each pair */
591 for (i1 = 1; i1 < *n; i1++) {
592 for (i2 = i1 + 1; i2 <= *n; i2++) {
594 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
597 P[target] = ((double) Ns / *s);
598 Q[target] = ((double) (Nd - Ns) / *s);
599 A = log(1 - 2*Q[target]);
600 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
605 /* compute the mean alpha (ma) = mean Ts/Tv */
608 for (i1 = 0; i1 < npair; i1++)
609 /* some values of tstvr are -Inf if there is no
610 transversions observed: we exclude them */
611 if (R_FINITE(tstvr[i1])) {
617 /* compute the distance for each pair */
619 for (i1 = 1; i1 < *n; i1++) {
620 for (i2 = i1 + 1; i2 <= *n; i2++) {
622 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
623 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
624 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
626 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A * *s);
632 void distDNA_GG95_pairdel(unsigned char *x, int *n, int *s, double *d,
633 int *variance, double *var)
635 int i1, i2, s1, s2, target, *L, length, GC, Nd, Ns, tl, npair;
636 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
644 npair = *n * (*n - 1) / 2;
646 theta = (double*)R_alloc(*n, sizeof(double));
647 L = (int*)R_alloc(npair, sizeof(int));
648 P = (double*)R_alloc(npair, sizeof(double));
649 Q = (double*)R_alloc(npair, sizeof(double));
650 tstvr = (double*)R_alloc(npair, sizeof(double));
652 /* get the proportion of GC (= theta) in each sequence */
653 for (i1 = 1; i1 <= *n; i1++) {
655 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n) {
656 if (KnownBase(x[s1])) tl++;
658 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
660 theta[i1 - 1] = ((double) GC / tl);
663 /* get the proportions of transitions and transversions,
664 and the estimates of their ratio for each pair; we
665 also get the sample size for each pair in L */
667 for (i1 = 1; i1 < *n; i1++) {
668 for (i2 = i1 + 1; i2 <= *n; i2++) {
669 Nd = Ns = L[target] = 0;
670 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
671 if (KnownBase(x[s1]) && KnownBase(x[s2])) L[target]++;
675 P[target] = ((double) Ns/L[target]);
676 Q[target] = ((double) (Nd - Ns)/L[target]);
677 A = log(1 - 2*Q[target]);
678 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
683 /* compute the mean alpha (ma) = mean Ts/Tv */
686 for (i1 = 0; i1 < npair; i1++)
687 /* some values of tstvr are -Inf if there is no
688 transversions observed: we exclude them */
689 if (R_FINITE(tstvr[i1])) {
695 /* compute the distance for each pair */
697 for (i1 = 1; i1 < *n; i1++) {
698 for (i2 = i1 + 1; i2 <= *n; i2++) {
700 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
701 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
702 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
704 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A*L[target]);
710 #define DO_CONTINGENCY_NUCLEOTIDES\
712 case 136 : m = 0; break;\
713 case 72 : m = 1; break;\
714 case 40 : m = 2; break;\
715 case 24 : m = 3; break;\
718 case 72 : m += 4; break;\
719 case 40 : m += 8; break;\
720 case 24 : m += 12; break;\
724 #define COMPUTE_DIST_LogDet\
725 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
726 d[target] = (-log(detFourByFour(Ftab))/4 - LN4)/4;\
728 /* For the inversion, we first make U an identity matrix */\
729 for (k = 1; k < 15; k++) U[k] = 0;\
730 U[0] = U[5] = U[10] = U[15] = 1;\
731 /* The matrix is not symmetric, so we use 'dgesv'. */\
732 /* This subroutine puts the result in U. */\
733 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
734 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
735 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
736 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
737 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
738 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
739 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
740 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
741 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] - 16)/(L*16);\
744 void distDNA_LogDet(unsigned char *x, int *n, int *s, double *d,
745 int *variance, double *var)
747 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
748 double Ftab[16], U[16];
753 for (i1 = 1; i1 < *n; i1++) {
754 for (i2 = i1 + 1; i2 <= *n; i2++) {
755 for (k = 0; k < 16; k++) Ntab[k] = 0;
756 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
757 DO_CONTINGENCY_NUCLEOTIDES
765 void distDNA_LogDet_pairdel(unsigned char *x, int *n, int *s, double *d,
766 int *variance, double *var)
768 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
769 double Ftab[16], U[16];
772 for (i1 = 1; i1 < *n; i1++) {
773 for (i2 = i1 + 1; i2 <= *n; i2++) {
774 for (k = 0; k < 16; k++) Ntab[k] = 0;
776 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
777 CHECK_PAIRWISE_DELETION
778 DO_CONTINGENCY_NUCLEOTIDES
786 void distDNA_BH87(unsigned char *x, int *n, int *s, double *d,
787 int *variance, double *var)
789 For the moment there is no need to check for pairwise deletions
790 since DO_CONTINGENCY_NUCLEOTIDES considers only the known nucleotides.
791 In effect the pairwise deletion has possibly been done before.
792 The sequence length(s) are used only to compute the variances, which is
793 currently not available.
796 int i1, i2, k, kb, s1, s2, m, Ntab[16], ROWsums[4], ndim = 4, info, ipiv[16];
797 double P12[16], P21[16], U[16];
799 for (i1 = 1; i1 < *n; i1++) {
800 for (i2 = i1 + 1; i2 <= *n; i2++) {
801 for (k = 0; k < 16; k++) Ntab[k] = 0;
802 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
803 #define PREPARE_BF_F84\
804 A = (BF[0]*BF[2])/(BF[0] + BF[2]) + (BF[1]*BF[3])/(BF[1] + BF[3]);\
805 B = BF[0]*BF[2] + BF[1]*BF[3];\
806 C = (BF[0] + BF[2])*(BF[1] + BF[3]);
808 #define COMPUTE_DIST_F84\
809 P = ((double) Ns/L);\
810 Q = ((double) (Nd - Ns)/L);\
811 d[target] = -2*A*log(1 - P/(2*A) - (A - B)*Q/(2*A*C)) + 2*(A - B - C)*log(1 - Q/(2*C));\
816 a = t1/(t1 - t2 - t3);\
817 b = A*(A - B)/(t1 - t2 - t3) - (A - B - C)/(C - Q/2);\
818 var[target] = (a*a*P + b*b*Q - pow(a*P + b*Q, 2))/2;\
821 void distDNA_F84(unsigned char *x, int *n, int *s, double *d,
822 double *BF, int *variance, double *var)
824 int i1, i2, Nd, Ns, L, target, s1, s2;
825 double P, Q, A, B, C, a, b, t1, t2, t3;
831 for (i1 = 1; i1 < *n; i1++) {
832 for (i2 = i1 + 1; i2 <= *n; i2++) {
834 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
843 void distDNA_F84_pairdel(unsigned char *x, int *n, int *s, double *d,
844 double *BF, int *variance, double *var)
846 int i1, i2, Nd, Ns, L, target, s1, s2;
847 double P, Q, A, B, C, a, b, t1, t2, t3;
852 for (i1 = 1; i1 < *n; i1++) {
853 for (i2 = i1 + 1; i2 <= *n; i2++) {
855 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
856 CHECK_PAIRWISE_DELETION
864 DO_CONTINGENCY_NUCLEOTIDES
867 /* get the rowwise sums of Ntab */
868 ROWsums[0] = Ntab[0] + Ntab[4] + Ntab[8] + Ntab[12];
869 ROWsums[1] = Ntab[1] + Ntab[5] + Ntab[9] + Ntab[13];
870 ROWsums[2] = Ntab[2] + Ntab[6] + Ntab[10] + Ntab[14];
871 ROWsums[3] = Ntab[3] + Ntab[7] + Ntab[11] + Ntab[15];
873 for (k = 0; k < 16; k++)
874 P12[k] = ((double) Ntab[k]);
876 /* scale each element of P12 by its rowwise sum */
877 for (k = 0; k < 4; k++)
878 for (kb = 0; kb < 16; kb += 4)
879 P12[k + kb] = P12[k + kb]/ROWsums[k];
881 d[*n*(i2 - 1) + i1 - 1] = -log(detFourByFour(P12))/4;
883 /* compute the columnwise sums of Ntab: these
884 are the rowwise sums of its transpose */
885 ROWsums[0] = Ntab[0] + Ntab[1] + Ntab[2] + Ntab[3];
886 ROWsums[1] = Ntab[4] + Ntab[5] + Ntab[6] + Ntab[7];
887 ROWsums[2] = Ntab[8] + Ntab[9] + Ntab[10] + Ntab[11];
888 ROWsums[3] = Ntab[12] + Ntab[13] + Ntab[14] + Ntab[15];
890 /* transpose Ntab and store the result in P21 */
891 for (k = 0; k < 4; k++)
892 for (kb = 0; kb < 4; kb++)
893 P21[kb + 4*k] = Ntab[k + 4*kb];
896 for (k = 0; k < 4; k++)
897 for (kb = 0; kb < 16; kb += 4)
898 P21[k + kb] = P21[k + kb]/ROWsums[k];
900 d[*n*(i1 - 1) + i2 - 1] = -log(detFourByFour(P21))/4;
905 #define COMPUTE_DIST_ParaLin\
906 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
907 d[target] = -log(detFourByFour(Ftab)/\
908 sqrt(find[0][i1 - 1]*find[1][i1 - 1]*find[2][i1 - 1]*find[3][i1 - 1]*\
909 find[0][i2 - 1]*find[1][i2 - 1]*find[2][i2 - 1]*find[3][i2 - 1]))/4;\
911 /* For the inversion, we first make U an identity matrix */\
912 for (k = 1; k < 15; k++) U[k] = 0;\
913 U[0] = U[5] = U[10] = U[15] = 1;\
914 /* The matrix is not symmetric, so we use 'dgesv'. */\
915 /* This subroutine puts the result in U. */\
916 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
917 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
918 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
919 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
920 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
921 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
922 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
923 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
924 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] -\
925 4*(1/sqrt(find[0][i1 - 1]*find[0][i2 - 1]) +\
926 1/sqrt(find[1][i1 - 1]*find[1][i2 - 1]) +\
927 1/sqrt(find[2][i1 - 1]*find[2][i2 - 1]) +\
928 1/sqrt(find[3][i1 - 1]*find[3][i2 - 1])))/(L*16);\
931 void distDNA_ParaLin(unsigned char *x, int *n, int *s, double *d,
932 int *variance, double *var)
934 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
935 double Ftab[16], U[16], *find[4];
939 for (k = 0; k < 4; k++)
940 find[k] = (double*)R_alloc(*n, sizeof(double));
942 for (i1 = 0; i1 < *n; i1++)
943 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
945 for (i1 = 0; i1 < *n; i1++) {
946 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
948 case 136 : find[0][i1]++; break;
949 case 40 : find[1][i1]++; break;
950 case 72 : find[2][i1]++; break;
951 case 24 : find[3][i1]++; break;
954 for (k = 0; k < 4; k++) find[k][i1] /= L;
958 for (i1 = 1; i1 < *n; i1++) {
959 for (i2 = i1 + 1; i2 <= *n; i2++) {
960 for (k = 0; k < 16; k++) Ntab[k] = 0;
961 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
962 DO_CONTINGENCY_NUCLEOTIDES
970 void distDNA_ParaLin_pairdel(unsigned char *x, int *n, int *s, double *d,
971 int *variance, double *var)
973 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
974 double Ftab[16], U[16], *find[4];
978 for (k = 0; k < 4; k++)
979 find[k] = (double*)R_alloc(*n, sizeof(double));
981 for (i1 = 0; i1 < *n; i1++)
982 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
984 for (i1 = 0; i1 < *n; i1++) {
986 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
987 if (KnownBase(x[s1])) {
990 case 136 : find[0][i1]++; break;
991 case 40 : find[1][i1]++; break;
992 case 72 : find[2][i1]++; break;
993 case 24 : find[3][i1]++; break;
997 for (k = 0; k < 4; k++) find[k][i1] /= L;
1001 for (i1 = 1; i1 < *n; i1++) {
1002 for (i2 = i1 + 1; i2 <= *n; i2++) {
1004 for (k = 0; k < 16; k++) Ntab[k] = 0;
1005 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
1006 CHECK_PAIRWISE_DELETION
1007 DO_CONTINGENCY_NUCLEOTIDES
1009 COMPUTE_DIST_ParaLin
1015 void BaseProportion(unsigned char *x, int *n, double *BF, int *freq)
1020 for (i = 0; i < *n; i++) {
1021 if (KnownBase(x[i])) {
1024 case 136 : BF[0]++; break;
1025 case 40 : BF[1]++; break;
1026 case 72 : BF[2]++; break;
1027 case 24 : BF[3]++; break;
1031 if (! *freq) for (i = 0; i < 4; i++) BF[i] /= m;
1034 void SegSites(unsigned char *x, int *n, int *s, int *seg)
1037 unsigned char basis;
1039 for (j = 0; j < *s; j++) {
1041 while (!KnownBase(x[i])) i++;
1044 while (i < *n * (j + 1)) {
1045 if (!KnownBase(x[i]) || x[i] == basis) i++;
1054 void GlobalDeletionDNA(unsigned char *x, int *n, int *s, int *keep)
1058 for (j = 0; j < *s; j++) {
1060 while (i < *n * (j + 1)) {
1061 if (KnownBase(x[i])) i++;
1070 void dist_dna(unsigned char *x, int *n, int *s, int *model, double *d,
1071 double *BF, int *pairdel, int *variance, double *var,
1072 int *gamma, double *alpha)
1075 case 1 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 1);
1076 else distDNA_raw(x, n, s, d, 1); break;
1078 case 2 : if (pairdel) distDNA_JC69_pairdel(x, n, s, d, variance, var, gamma, alpha);
1079 else distDNA_JC69(x, n, s, d, variance, var, gamma, alpha); break;
1081 case 3 : if (pairdel) distDNA_K80_pairdel(x, n, s, d, variance, var, gamma, alpha);
1082 else distDNA_K80(x, n, s, d, variance, var, gamma, alpha); break;
1084 case 4 : if (pairdel) distDNA_F81_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1085 else distDNA_F81(x, n, s, d, BF, variance, var, gamma, alpha); break;
1087 case 5 : if (pairdel) distDNA_K81_pairdel(x, n, s, d, variance, var);
1088 else distDNA_K81(x, n, s, d, variance, var); break;
1090 case 6 : if (pairdel) distDNA_F84_pairdel(x, n, s, d, BF, variance, var);
1091 else distDNA_F84(x, n, s, d, BF, variance, var); break;
1093 case 7 : if (pairdel) distDNA_T92_pairdel(x, n, s, d, BF, variance, var);
1094 else distDNA_T92(x, n, s, d, BF, variance, var); break;
1096 case 8 : if (pairdel) distDNA_TN93_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1097 else distDNA_TN93(x, n, s, d, BF, variance, var, gamma, alpha); break;
1099 case 9 : if (pairdel) distDNA_GG95_pairdel(x, n, s, d, variance, var);
1100 else distDNA_GG95(x, n, s, d, variance, var); break;
1102 case 10 : if (pairdel) distDNA_LogDet_pairdel(x, n, s, d, variance, var);
1103 else distDNA_LogDet(x, n, s, d, variance, var); break;
1105 case 11 : distDNA_BH87(x, n, s, d, variance, var); break;
1107 case 12 : if (pairdel) distDNA_ParaLin_pairdel(x, n, s, d, variance, var);
1108 else distDNA_ParaLin(x, n, s, d, variance, var); break;
1109 case 13 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 0);
1110 else distDNA_raw(x, n, s, d, 0); break;