1 /* dist_dna.c 2009-09-16 */
3 /* Copyright 2005-2008 Emmanuel Paradis
5 /* This file is part of the R-package `ape'. */
6 /* See the file ../COPYING for licensing issues. */
9 #include <R_ext/Lapack.h>
11 /* from R: print(log(4), d = 22) */
12 #define LN4 1.386294361119890572454
14 /* returns 8 if the base is known surely, 0 otherwise */
15 #define KnownBase(a) (a & 8)
17 /* returns 1 if the base is adenine surely, 0 otherwise */
18 #define IsAdenine(a) a == 136
20 /* returns 1 if the base is guanine surely, 0 otherwise */
21 #define IsGuanine(a) a == 72
23 /* returns 1 if the base is cytosine surely, 0 otherwise */
24 #define IsCytosine(a) a == 40
26 /* returns 1 if the base is thymine surely, 0 otherwise */
27 #define IsThymine(a) a == 24
29 /* returns 1 if the base is a purine surely, 0 otherwise */
30 #define IsPurine(a) a > 63
32 /* returns 1 if the base is a pyrimidine surely, 0 otherwise */
33 #define IsPyrimidine(a) a < 64
35 /* returns 1 if both bases are different surely, 0 otherwise */
36 #define DifferentBase(a, b) (a & b) < 16
38 /* returns 1 if both bases are the same surely, 0 otherwise */
39 #define SameBase(a, b) KnownBase(a) && a == b
41 /* computes directly the determinant of a 4x4 matrix */
42 double detFourByFour(double *x)
44 double det, a33a44, a34a43, a34a42, a32a44, a32a43, a33a42, a34a41, a31a44, a31a43, a33a41, a31a42, a32a41;
46 a33a44 = x[10]*x[15]; a34a43 = x[14]*x[11];
47 a34a42 = x[14]*x[7]; a32a44 = x[6]*x[15];
48 a32a43 = x[6]*x[11]; a33a42 = x[10]*x[7];
49 a34a41 = x[14]*x[3]; a31a44 = x[2]*x[15];
50 a31a43 = x[2]*x[11]; a33a41 = x[10]*x[3];
51 a31a42 = x[2]*x[7]; a32a41 = x[6]*x[3];
53 det = x[0]*x[5]*(a33a44 - a34a43) + x[0]*x[9]*(a34a42 - a32a44) +
54 x[0]*x[13]*(a32a43 - a33a42) + x[4]*x[9]*(a31a44 - a34a41) +
55 x[4]*x[13]*(a33a41 - a31a43) + x[4]*x[1]*(a34a43 - a33a44) +
56 x[8]*x[13]*(a31a42 - a32a41) + x[8]*x[1]*(a32a44 - a34a42) +
57 x[8]*x[5]*(a34a41 - a31a44) + x[12]*x[1]*(a33a42 - a32a43) +
58 x[12]*x[5]*(a31a43 - a33a41) + x[12]*x[9]*(a32a41 - a31a42);
63 #define CHECK_PAIRWISE_DELETION\
64 if (KnownBase(x[s1]) && KnownBase(x[s2])) L++;\
67 void distDNA_raw(unsigned char *x, int *n, int *s, double *d, int scaled)
69 int i1, i2, s1, s2, target, Nd;
72 for (i1 = 1; i1 < *n; i1++) {
73 for (i2 = i1 + 1; i2 <= *n; i2++) {
75 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
76 if (DifferentBase(x[s1], x[s2])) Nd++;
77 if (scaled) d[target] = ((double) Nd / *s);
78 else d[target] = ((double) Nd);
84 void distDNA_raw_pairdel(unsigned char *x, int *n, int *s, double *d, int scaled)
86 int i1, i2, s1, s2, target, Nd, L;
89 for (i1 = 1; i1 < *n; i1++) {
90 for (i2 = i1 + 1; i2 <= *n; i2++) {
92 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
93 CHECK_PAIRWISE_DELETION
94 if (DifferentBase(x[s1], x[s2])) Nd++;
96 if (scaled) d[target] = ((double) Nd/L);
97 else d[target] = ((double) Nd);
103 #define COMPUTE_DIST_JC69\
104 p = ((double) Nd/L);\
106 d[target] = 0.75 * *alpha*(pow(1 - 4*p/3, -1/ *alpha) - 1);\
107 else d[target] = -0.75*log(1 - 4*p/3);\
109 if (*gamma) var[target] = p*(1 - p)/(pow(1 - 4*p/3, -2/(*alpha + 1)) * L);\
110 else var[target] = p*(1 - p)/(pow(1 - 4*p/3, 2)*L);\
113 void distDNA_JC69(unsigned char *x, int *n, int *s, double *d,
114 int *variance, double *var, int *gamma, double *alpha)
116 int i1, i2, s1, s2, target, Nd, L;
122 for (i1 = 1; i1 < *n; i1++) {
123 for (i2 = i1 + 1; i2 <= *n; i2++) {
125 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
126 if (DifferentBase(x[s1], x[s2])) Nd++;
133 void distDNA_JC69_pairdel(unsigned char *x, int *n, int *s, double *d,
134 int *variance, double *var, int *gamma, double *alpha)
136 int i1, i2, s1, s2, target, Nd, L;
140 for (i1 = 1; i1 < *n; i1++) {
141 for (i2 = i1 + 1; i2 <= *n; i2++) {
143 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
144 CHECK_PAIRWISE_DELETION
145 if (DifferentBase(x[s1], x[s2])) Nd++;
154 if (SameBase(x[s1], x[s2])) continue;\
156 if (IsPurine(x[s1]) && IsPurine(x[s2])) {\
160 if (IsPyrimidine(x[s1]) && IsPyrimidine(x[s2])) Ns++;
162 #define COMPUTE_DIST_K80\
163 P = ((double) Ns/L);\
164 Q = ((double) (Nd - Ns)/L);\
169 d[target] = *alpha * (pow(a1, b) + 0.5*pow(a2, b) - 1.5)/2;\
171 else d[target] = -0.5 * log(a1 * sqrt(a2));\
174 b = -(1 / *alpha + 1);\
183 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
186 void distDNA_K80(unsigned char *x, int *n, int *s, double *d,
187 int *variance, double *var, int *gamma, double *alpha)
189 int i1, i2, s1, s2, target, Nd, Ns, L;
190 double P, Q, a1, a2, b, c1, c2, c3;
195 for (i1 = 1; i1 < *n; i1++) {
196 for (i2 = i1 + 1; i2 <= *n; i2++) {
198 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
207 void distDNA_K80_pairdel(unsigned char *x, int *n, int *s, double *d,
208 int *variance, double *var, int *gamma, double *alpha)
210 int i1, i2, s1, s2, target, Nd, Ns, L;
211 double P, Q, a1, a2, b, c1, c2, c3;
214 for (i1 = 1; i1 < *n; i1++) {
215 for (i2 = i1 + 1; i2 <= *n; i2++) {
217 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
218 CHECK_PAIRWISE_DELETION
227 #define COMPUTE_DIST_F81\
228 p = ((double) Nd/L);\
229 if (*gamma) d[target] = E * *alpha * (pow(1 - p/E, -1/ *alpha) - 1);\
230 else d[target] = -E*log(1 - p/E);\
232 if (*gamma) var[target] = p*(1 - p)/(pow(1 - p/E, -2/(*alpha + 1)) * L);\
233 else var[target] = p*(1 - p)/(pow(1 - p/E, 2)*L);\
236 void distDNA_F81(unsigned char *x, int *n, int *s, double *d, double *BF,
237 int *variance, double *var, int *gamma, double *alpha)
239 int i1, i2, s1, s2, target, Nd, L;
243 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
246 for (i1 = 1; i1 < *n; i1++) {
247 for (i2 = i1 + 1; i2 <= *n; i2++) {
249 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
250 if (DifferentBase(x[s1], x[s2])) Nd++;
257 void distDNA_F81_pairdel(unsigned char *x, int *n, int *s, double *d, double *BF,
258 int *variance, double *var, int *gamma, double *alpha)
260 int i1, i2, s1, s2, target, Nd, L;
263 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
266 for (i1 = 1; i1 < *n; i1++) {
267 for (i2 = i1 + 1; i2 <= *n; i2++) {
269 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
270 CHECK_PAIRWISE_DELETION
271 if (DifferentBase(x[s1], x[s2])) Nd++;
279 #define COUNT_TS_TV1_TV2\
280 if (SameBase(x[s1], x[s2])) continue;\
282 if ((x[s1] | x[s2]) == 152 || (x[s1] | x[s2]) == 104) {\
286 if ((x[s1] | x[s2]) == 168 || (x[s1] | x[s2]) == 88) Nv2++;
289 #define COMPUTE_DIST_K81\
290 P = ((double) (Nd - Nv1 - Nv2)/L);\
291 Q = ((double) Nv1/L);\
292 R = ((double) Nv2/L);\
296 d[target] = -0.25*log(a1*a2*a3);\
298 a = (1/a1 + 1/a2)/2;\
299 b = (1/a1 + 1/a3)/2;\
300 c = (1/a2 + 1/a3)/2;\
301 var[target] = (a*a*P + b*b*Q + c*c*R - pow(a*P + b*Q + c*R, 2))/2;\
304 void distDNA_K81(unsigned char *x, int *n, int *s, double *d,
305 int *variance, double *var)
307 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
308 double P, Q, R, a1, a2, a3, a, b, c;
313 for (i1 = 1; i1 < *n; i1++) {
314 for (i2 = i1 + 1; i2 <= *n; i2++) {
316 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
325 void distDNA_K81_pairdel(unsigned char *x, int *n, int *s, double *d,
326 int *variance, double *var)
328 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
329 double P, Q, R, a1, a2, a3, a, b, c;
332 for (i1 = 1; i1 < *n; i1++) {
333 for (i2 = i1 + 1; i2 <= *n; i2++) {
334 Nd = Nv1 = Nv2 = L = 0;
335 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
336 CHECK_PAIRWISE_DELETION
345 #define PREPARE_BF_F84\
346 A = (BF[0]*BF[2])/(BF[0] + BF[2]) + (BF[1]*BF[3])/(BF[1] + BF[3]);\
347 B = BF[0]*BF[2] + BF[1]*BF[3];\
348 C = (BF[0] + BF[2])*(BF[1] + BF[3]);
350 #define COMPUTE_DIST_F84\
351 P = ((double) Ns/L);\
352 Q = ((double) (Nd - Ns)/L);\
353 d[target] = -2*A*log(1 - (P/(2*A) - (A - B)*Q/(2*A*C))) + 2*(A - B - C)*log(1 - Q/(2*C));\
358 a = t1/(t1 - t2 - t3);\
359 b = A*(A - B)/(t1 - t2 - t3) - (A - B - C)/(C - Q/2);\
360 var[target] = (a*a*P + b*b*Q - pow(a*P + b*Q, 2))/2;\
363 void distDNA_F84(unsigned char *x, int *n, int *s, double *d,
364 double *BF, int *variance, double *var)
366 int i1, i2, Nd, Ns, L, target, s1, s2;
367 double P, Q, A, B, C, a, b, t1, t2, t3;
373 for (i1 = 1; i1 < *n; i1++) {
374 for (i2 = i1 + 1; i2 <= *n; i2++) {
376 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
385 void distDNA_F84_pairdel(unsigned char *x, int *n, int *s, double *d,
386 double *BF, int *variance, double *var)
388 int i1, i2, Nd, Ns, L, target, s1, s2;
389 double P, Q, A, B, C, a, b, t1, t2, t3;
394 for (i1 = 1; i1 < *n; i1++) {
395 for (i2 = i1 + 1; i2 <= *n; i2++) {
397 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
398 CHECK_PAIRWISE_DELETION
407 #define COMPUTE_DIST_T92\
408 P = ((double) Ns/L);\
409 Q = ((double) (Nd - Ns)/L);\
412 d[target] = -wg*log(a1) - 0.5*(1 - wg)*log(a2);\
416 c3 = wg*(c1 - c2) + c2;\
417 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
420 void distDNA_T92(unsigned char *x, int *n, int *s, double *d,
421 double *BF, int *variance, double *var)
423 int i1, i2, Nd, Ns, L, target, s1, s2;
424 double P, Q, wg, a1, a2, c1, c2, c3;
427 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
430 for (i1 = 1; i1 < *n; i1++) {
431 for (i2 = i1 + 1; i2 <= *n; i2++) {
433 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
442 void distDNA_T92_pairdel(unsigned char *x, int *n, int *s, double *d,
443 double *BF, int *variance, double *var)
445 int i1, i2, Nd, Ns, L, target, s1, s2;
446 double P, Q, wg, a1, a2, c1, c2, c3;
448 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
451 for (i1 = 1; i1 < *n; i1++) {
452 for (i2 = i1 + 1; i2 <= *n; i2++) {
454 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
455 CHECK_PAIRWISE_DELETION
464 /* returns 1 if one of the base is adenine and
465 the other one is guanine surely, 0 otherwise */
466 #define AdenineAndGuanine(a, b) (a | b) == 200
468 /* returns 1 if one of the base is cytosine and
469 the other one is thymine surely, 0 otherwise */
470 #define CytosineAndThymine(a, b) (a | b) == 56
472 #define PREPARE_BF_TN93\
475 k1 = 2 * BF[0] * BF[2] / gR;\
476 k2 = 2 * BF[1] * BF[3] / gY;\
477 k3 = 2 * (gR * gY - BF[0]*BF[2]*gY/gR - BF[1]*BF[3]*gR/gY);
479 #define COUNT_TS1_TS2_TV\
480 if (DifferentBase(x[s1], x[s2])) {\
482 if (AdenineAndGuanine(x[s1], x[s2])) {\
486 if (CytosineAndThymine(x[s1], x[s2])) Ns2++;\
489 #define COMPUTE_DIST_TN93\
490 P1 = ((double) Ns1/L);\
491 P2 = ((double) Ns2/L);\
492 Q = ((double) (Nd - Ns1 - Ns2)/L);\
493 w1 = 1 - P1/k1 - Q/(2*gR);\
494 w2 = 1 - P2/k2 - Q/(2*gY);\
495 w3 = 1 - Q/(2*gR*gY);\
497 k4 = 2*(BF[0]*BF[2] + BF[1]*BF[3] + gR*gY);\
502 c4 = k1*c1/(2*gR) + k2*c2/(2*gY) + k3*c3/(2*gR*gY);\
503 d[target] = *alpha * (k1*pow(w1, b) + k2*pow(w2, b) + k3*pow(w3, b) - k4);\
505 k4 = 2*((BF[0]*BF[0] + BF[2]*BF[2])/(2*gR*gR) + (BF[2]*BF[2] + BF[3]*BF[3])/(2*gY*gY));\
509 c4 = k1 * c1/(2 * gR) + k2 * c2/(2 * gY) + k4 * c3;\
510 d[target] = -k1*log(w1) - k2*log(w2) - k3*log(w3);\
513 var[target] = (c1*c1*P1 + c2*c2*P2 + c4*c4*Q - pow(c1*P1 + c2*P2 + c4*Q, 2))/L;
515 void distDNA_TN93(unsigned char *x, int *n, int *s, double *d,
516 double *BF, int *variance, double *var,
517 int *gamma, double *alpha)
519 int i1, i2, k, Nd, Ns1, Ns2, L, target, s1, s2;
520 double P1, P2, Q, A, B, C, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
527 for (i1 = 1; i1 < *n; i1++) {
528 for (i2 = i1 + 1; i2 <= *n; i2++) {
530 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
539 void distDNA_TN93_pairdel(unsigned char *x, int *n, int *s, double *d,
540 double *BF, int *variance, double *var,
541 int *gamma, double *alpha)
543 int i1, i2, k, Nd, Ns1, Ns2, L, target, s1, s2;
544 double P1, P2, Q, A, B, C, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
549 for (i1 = 1; i1 < *n; i1++) {
550 for (i2 = i1 + 1; i2 <= *n; i2++) {
551 Nd = Ns1 = Ns2 = L = 0;
552 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
553 CHECK_PAIRWISE_DELETION
562 void distDNA_GG95(unsigned char *x, int *n, int *s, double *d,
563 int *variance, double *var)
565 int i1, i2, s1, s2, target, GC, Nd, Ns, tl, npair;
566 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
573 npair = *n * (*n - 1) / 2;
575 theta = (double*)R_alloc(*n, sizeof(double));
576 P = (double*)R_alloc(npair, sizeof(double));
577 Q = (double*)R_alloc(npair, sizeof(double));
578 tstvr = (double*)R_alloc(npair, sizeof(double));
580 /* get the proportion of GC (= theta) in each sequence */
581 for (i1 = 1; i1 <= *n; i1++) {
583 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n)
584 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
585 theta[i1 - 1] = ((double) GC / *s);
588 /* get the proportions of transitions and transversions,
589 and the estimates of their ratio for each pair */
591 for (i1 = 1; i1 < *n; i1++) {
592 for (i2 = i1 + 1; i2 <= *n; i2++) {
594 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
597 P[target] = ((double) Ns / *s);
598 Q[target] = ((double) (Nd - Ns) / *s);
599 A = log(1 - 2*Q[target]);
600 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
605 /* compute the mean alpha (ma) = mean Ts/Tv */
608 for (i1 = 0; i1 < npair; i1++)
609 /* some values of tstvr are -Inf if there is no
610 transversions observed: we exclude them */
611 if (R_FINITE(tstvr[i1])) {
617 /* compute the distance for each pair */
619 for (i1 = 1; i1 < *n; i1++) {
620 for (i2 = i1 + 1; i2 <= *n; i2++) {
622 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
623 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
624 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
626 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A * *s);
632 void distDNA_GG95_pairdel(unsigned char *x, int *n, int *s, double *d,
633 int *variance, double *var)
635 int i1, i2, s1, s2, target, *L, length, GC, Nd, Ns, tl, npair;
636 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
644 npair = *n * (*n - 1) / 2;
646 theta = (double*)R_alloc(*n, sizeof(double));
647 L = (int*)R_alloc(npair, sizeof(int));
648 P = (double*)R_alloc(npair, sizeof(double));
649 Q = (double*)R_alloc(npair, sizeof(double));
650 tstvr = (double*)R_alloc(npair, sizeof(double));
652 /* get the proportion of GC (= theta) in each sequence */
653 for (i1 = 1; i1 <= *n; i1++) {
655 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n) {
656 if (KnownBase(x[s1])) tl++;
658 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
660 theta[i1 - 1] = ((double) GC / tl);
663 /* get the proportions of transitions and transversions,
664 and the estimates of their ratio for each pair; we
665 also get the sample size for each pair in L */
667 for (i1 = 1; i1 < *n; i1++) {
668 for (i2 = i1 + 1; i2 <= *n; i2++) {
669 Nd = Ns = L[target] = 0;
670 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
671 if (KnownBase(x[s1]) && KnownBase(x[s2])) L[target]++;
675 P[target] = ((double) Ns/L[target]);
676 Q[target] = ((double) (Nd - Ns)/L[target]);
677 A = log(1 - 2*Q[target]);
678 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
683 /* compute the mean alpha (ma) = mean Ts/Tv */
686 for (i1 = 0; i1 < npair; i1++)
687 /* some values of tstvr are -Inf if there is no
688 transversions observed: we exclude them */
689 if (R_FINITE(tstvr[i1])) {
695 /* compute the distance for each pair */
697 for (i1 = 1; i1 < *n; i1++) {
698 for (i2 = i1 + 1; i2 <= *n; i2++) {
700 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
701 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
702 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
704 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A*L[target]);
710 #define DO_CONTINGENCY_NUCLEOTIDES\
712 case 136 : m = 0; break;\
713 case 72 : m = 1; break;\
714 case 40 : m = 2; break;\
715 case 24 : m = 3; break;\
718 case 72 : m += 4; break;\
719 case 40 : m += 8; break;\
720 case 24 : m += 12; break;\
724 #define COMPUTE_DIST_LogDet\
725 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
726 d[target] = (-log(detFourByFour(Ftab))/4 - LN4)/4;\
728 /* For the inversion, we first make U an identity matrix */\
729 for (k = 1; k < 15; k++) U[k] = 0;\
730 U[0] = U[5] = U[10] = U[15] = 1;\
731 /* The matrix is not symmetric, so we use 'dgesv'. */\
732 /* This subroutine puts the result in U. */\
733 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
734 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
735 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
736 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
737 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
738 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
739 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
740 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
741 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] - 16)/(L*16);\
744 void distDNA_LogDet(unsigned char *x, int *n, int *s, double *d,
745 int *variance, double *var)
747 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
748 double Ftab[16], U[16];
753 for (i1 = 1; i1 < *n; i1++) {
754 for (i2 = i1 + 1; i2 <= *n; i2++) {
755 for (k = 0; k < 16; k++) Ntab[k] = 0;
756 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
757 DO_CONTINGENCY_NUCLEOTIDES
765 void distDNA_LogDet_pairdel(unsigned char *x, int *n, int *s, double *d,
766 int *variance, double *var)
768 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
769 double Ftab[16], U[16];
772 for (i1 = 1; i1 < *n; i1++) {
773 for (i2 = i1 + 1; i2 <= *n; i2++) {
774 for (k = 0; k < 16; k++) Ntab[k] = 0;
776 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
777 CHECK_PAIRWISE_DELETION
778 DO_CONTINGENCY_NUCLEOTIDES
786 void distDNA_BH87(unsigned char *x, int *n, int *s, double *d,
787 int *variance, double *var)
789 For the moment there is no need to check for pairwise deletions
790 since DO_CONTINGENCY_NUCLEOTIDES considers only the known nucleotides.
791 In effect the pairwise deletion has possibly been done before.
792 The sequence length(s) are used only to compute the variances, which is
793 currently not available.
796 int i1, i2, k, kb, s1, s2, m, Ntab[16], ROWsums[4], ndim = 4, info, ipiv[16];
797 double P12[16], P21[16], U[16];
799 for (i1 = 1; i1 < *n; i1++) {
800 for (i2 = i1 + 1; i2 <= *n; i2++) {
801 for (k = 0; k < 16; k++) Ntab[k] = 0;
802 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
803 DO_CONTINGENCY_NUCLEOTIDES
806 /* get the rowwise sums of Ntab */
807 ROWsums[0] = Ntab[0] + Ntab[4] + Ntab[8] + Ntab[12];
808 ROWsums[1] = Ntab[1] + Ntab[5] + Ntab[9] + Ntab[13];
809 ROWsums[2] = Ntab[2] + Ntab[6] + Ntab[10] + Ntab[14];
810 ROWsums[3] = Ntab[3] + Ntab[7] + Ntab[11] + Ntab[15];
812 for (k = 0; k < 16; k++)
813 P12[k] = ((double) Ntab[k]);
815 /* scale each element of P12 by its rowwise sum */
816 for (k = 0; k < 4; k++)
817 for (kb = 0; kb < 16; kb += 4)
818 P12[k + kb] = P12[k + kb]/ROWsums[k];
820 d[*n*(i2 - 1) + i1 - 1] = -log(detFourByFour(P12))/4;
822 /* compute the columnwise sums of Ntab: these
823 are the rowwise sums of its transpose */
824 ROWsums[0] = Ntab[0] + Ntab[1] + Ntab[2] + Ntab[3];
825 ROWsums[1] = Ntab[4] + Ntab[5] + Ntab[6] + Ntab[7];
826 ROWsums[2] = Ntab[8] + Ntab[9] + Ntab[10] + Ntab[11];
827 ROWsums[3] = Ntab[12] + Ntab[13] + Ntab[14] + Ntab[15];
829 /* transpose Ntab and store the result in P21 */
830 for (k = 0; k < 4; k++)
831 for (kb = 0; kb < 4; kb++)
832 P21[kb + 4*k] = Ntab[k + 4*kb];
835 for (k = 0; k < 4; k++)
836 for (kb = 0; kb < 16; kb += 4)
837 P21[k + kb] = P21[k + kb]/ROWsums[k];
839 d[*n*(i1 - 1) + i2 - 1] = -log(detFourByFour(P21))/4;
844 #define COMPUTE_DIST_ParaLin\
845 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
846 d[target] = -log(detFourByFour(Ftab)/\
847 sqrt(find[0][i1 - 1]*find[1][i1 - 1]*find[2][i1 - 1]*find[3][i1 - 1]*\
848 find[0][i2 - 1]*find[1][i2 - 1]*find[2][i2 - 1]*find[3][i2 - 1]))/4;\
850 /* For the inversion, we first make U an identity matrix */\
851 for (k = 1; k < 15; k++) U[k] = 0;\
852 U[0] = U[5] = U[10] = U[15] = 1;\
853 /* The matrix is not symmetric, so we use 'dgesv'. */\
854 /* This subroutine puts the result in U. */\
855 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
856 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
857 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
858 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
859 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
860 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
861 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
862 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
863 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] -\
864 4*(1/sqrt(find[0][i1 - 1]*find[0][i2 - 1]) +\
865 1/sqrt(find[1][i1 - 1]*find[1][i2 - 1]) +\
866 1/sqrt(find[2][i1 - 1]*find[2][i2 - 1]) +\
867 1/sqrt(find[3][i1 - 1]*find[3][i2 - 1])))/(L*16);\
870 void distDNA_ParaLin(unsigned char *x, int *n, int *s, double *d,
871 int *variance, double *var)
873 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
874 double Ftab[16], U[16], *find[4];
878 for (k = 0; k < 4; k++)
879 find[k] = (double*)R_alloc(*n, sizeof(double));
881 for (i1 = 0; i1 < *n; i1++)
882 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
884 for (i1 = 0; i1 < *n; i1++) {
885 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
887 case 136 : find[0][i1]++; break;
888 case 40 : find[1][i1]++; break;
889 case 72 : find[2][i1]++; break;
890 case 24 : find[3][i1]++; break;
893 for (k = 0; k < 4; k++) find[k][i1] /= L;
897 for (i1 = 1; i1 < *n; i1++) {
898 for (i2 = i1 + 1; i2 <= *n; i2++) {
899 for (k = 0; k < 16; k++) Ntab[k] = 0;
900 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
901 DO_CONTINGENCY_NUCLEOTIDES
909 void distDNA_ParaLin_pairdel(unsigned char *x, int *n, int *s, double *d,
910 int *variance, double *var)
912 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
913 double Ftab[16], U[16], *find[4];
917 for (k = 0; k < 4; k++)
918 find[k] = (double*)R_alloc(*n, sizeof(double));
920 for (i1 = 0; i1 < *n; i1++)
921 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
923 for (i1 = 0; i1 < *n; i1++) {
925 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
926 if (KnownBase(x[s1])) {
929 case 136 : find[0][i1]++; break;
930 case 40 : find[1][i1]++; break;
931 case 72 : find[2][i1]++; break;
932 case 24 : find[3][i1]++; break;
936 for (k = 0; k < 4; k++) find[k][i1] /= L;
940 for (i1 = 1; i1 < *n; i1++) {
941 for (i2 = i1 + 1; i2 <= *n; i2++) {
943 for (k = 0; k < 16; k++) Ntab[k] = 0;
944 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
945 CHECK_PAIRWISE_DELETION
946 DO_CONTINGENCY_NUCLEOTIDES
954 void BaseProportion(unsigned char *x, int *n, double *BF, int *freq)
959 for (i = 0; i < *n; i++) {
960 if (KnownBase(x[i])) {
963 case 136 : BF[0]++; break;
964 case 40 : BF[1]++; break;
965 case 72 : BF[2]++; break;
966 case 24 : BF[3]++; break;
970 if (! *freq) for (i = 0; i < 4; i++) BF[i] /= m;
973 void SegSites(unsigned char *x, int *n, int *s, int *seg)
978 for (j = 0; j < *s; j++) {
980 while (!KnownBase(x[i])) i++;
983 while (i < *n * (j + 1)) {
984 if (!KnownBase(x[i]) || x[i] == basis) i++;
993 void GlobalDeletionDNA(unsigned char *x, int *n, int *s, int *keep)
997 for (j = 0; j < *s; j++) {
999 while (i < *n * (j + 1)) {
1000 if (KnownBase(x[i])) i++;
1009 void dist_dna(unsigned char *x, int *n, int *s, int *model, double *d,
1010 double *BF, int *pairdel, int *variance, double *var,
1011 int *gamma, double *alpha)
1014 case 1 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 1);
1015 else distDNA_raw(x, n, s, d, 1); break;
1017 case 2 : if (pairdel) distDNA_JC69_pairdel(x, n, s, d, variance, var, gamma, alpha);
1018 else distDNA_JC69(x, n, s, d, variance, var, gamma, alpha); break;
1020 case 3 : if (pairdel) distDNA_K80_pairdel(x, n, s, d, variance, var, gamma, alpha);
1021 else distDNA_K80(x, n, s, d, variance, var, gamma, alpha); break;
1023 case 4 : if (pairdel) distDNA_F81_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1024 else distDNA_F81(x, n, s, d, BF, variance, var, gamma, alpha); break;
1026 case 5 : if (pairdel) distDNA_K81_pairdel(x, n, s, d, variance, var);
1027 else distDNA_K81(x, n, s, d, variance, var); break;
1029 case 6 : if (pairdel) distDNA_F84_pairdel(x, n, s, d, BF, variance, var);
1030 else distDNA_F84(x, n, s, d, BF, variance, var); break;
1032 case 7 : if (pairdel) distDNA_T92_pairdel(x, n, s, d, BF, variance, var);
1033 else distDNA_T92(x, n, s, d, BF, variance, var); break;
1035 case 8 : if (pairdel) distDNA_TN93_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1036 else distDNA_TN93(x, n, s, d, BF, variance, var, gamma, alpha); break;
1038 case 9 : if (pairdel) distDNA_GG95_pairdel(x, n, s, d, variance, var);
1039 else distDNA_GG95(x, n, s, d, variance, var); break;
1041 case 10 : if (pairdel) distDNA_LogDet_pairdel(x, n, s, d, variance, var);
1042 else distDNA_LogDet(x, n, s, d, variance, var); break;
1044 case 11 : distDNA_BH87(x, n, s, d, variance, var); break;
1046 case 12 : if (pairdel) distDNA_ParaLin_pairdel(x, n, s, d, variance, var);
1047 else distDNA_ParaLin(x, n, s, d, variance, var); break;
1048 case 13 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 0);
1049 else distDNA_raw(x, n, s, d, 0); break;