1 /* dist_dna.c 2012-02-13 */
3 /* Copyright 2005-2012 Emmanuel Paradis
5 /* This file is part of the R-package `ape'. */
6 /* See the file ../COPYING for licensing issues. */
9 #include <R_ext/Lapack.h>
11 /* from R: print(log(4), d = 22) */
12 #define LN4 1.386294361119890572454
14 /* returns 8 if the base is known surely, 0 otherwise */
15 #define KnownBase(a) (a & 8)
17 /* returns 1 if the base is adenine surely, 0 otherwise */
18 #define IsAdenine(a) a == 136
20 /* returns 1 if the base is guanine surely, 0 otherwise */
21 #define IsGuanine(a) a == 72
23 /* returns 1 if the base is cytosine surely, 0 otherwise */
24 #define IsCytosine(a) a == 40
26 /* returns 1 if the base is thymine surely, 0 otherwise */
27 #define IsThymine(a) a == 24
29 /* returns 1 if the base is a purine surely, 0 otherwise */
30 #define IsPurine(a) a > 63
32 /* returns 1 if the base is a pyrimidine surely, 0 otherwise */
33 #define IsPyrimidine(a) a < 64
35 /* returns 1 if both bases are different surely, 0 otherwise */
36 #define DifferentBase(a, b) (a & b) < 16
38 /* returns 1 if both bases are the same surely, 0 otherwise */
39 #define SameBase(a, b) KnownBase(a) && a == b
41 /* computes directly the determinant of a 4x4 matrix */
42 double detFourByFour(double *x)
44 double det, a33a44, a34a43, a34a42, a32a44, a32a43, a33a42, a34a41, a31a44, a31a43, a33a41, a31a42, a32a41;
46 a33a44 = x[10]*x[15]; a34a43 = x[14]*x[11];
47 a34a42 = x[14]*x[7]; a32a44 = x[6]*x[15];
48 a32a43 = x[6]*x[11]; a33a42 = x[10]*x[7];
49 a34a41 = x[14]*x[3]; a31a44 = x[2]*x[15];
50 a31a43 = x[2]*x[11]; a33a41 = x[10]*x[3];
51 a31a42 = x[2]*x[7]; a32a41 = x[6]*x[3];
53 det = x[0]*x[5]*(a33a44 - a34a43) + x[0]*x[9]*(a34a42 - a32a44) +
54 x[0]*x[13]*(a32a43 - a33a42) + x[4]*x[9]*(a31a44 - a34a41) +
55 x[4]*x[13]*(a33a41 - a31a43) + x[4]*x[1]*(a34a43 - a33a44) +
56 x[8]*x[13]*(a31a42 - a32a41) + x[8]*x[1]*(a32a44 - a34a42) +
57 x[8]*x[5]*(a34a41 - a31a44) + x[12]*x[1]*(a33a42 - a32a43) +
58 x[12]*x[5]*(a31a43 - a33a41) + x[12]*x[9]*(a32a41 - a31a42);
63 #define CHECK_PAIRWISE_DELETION\
64 if (KnownBase(x[s1]) && KnownBase(x[s2])) L++;\
68 if (SameBase(x[s1], x[s2])) continue;\
70 if (IsPurine(x[s1]) && IsPurine(x[s2])) {\
74 if (IsPyrimidine(x[s1]) && IsPyrimidine(x[s2])) Ns++;
79 void distDNA_TsTv(unsigned char *x, int *n, int *s, double *d, int Ts, int pairdel)
81 int i1, i2, s1, s2, target, Nd, Ns;
84 for (i1 = 1; i1 < *n; i1++) {
85 for (i2 = i1 + 1; i2 <= *n; i2++) {
87 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
88 if (pairdel && !(KnownBase(x[s1]) && KnownBase(x[s2]))) continue;
91 if (Ts) d[target] = ((double) Ns); /* output number of transitions */
92 else d[target] = ((double) Nd - Ns); /* output number of transversions */
98 void distDNA_raw(unsigned char *x, int *n, int *s, double *d, int scaled)
100 int i1, i2, s1, s2, target, Nd;
103 for (i1 = 1; i1 < *n; i1++) {
104 for (i2 = i1 + 1; i2 <= *n; i2++) {
106 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
107 if (DifferentBase(x[s1], x[s2])) Nd++;
108 if (scaled) d[target] = ((double) Nd / *s);
109 else d[target] = ((double) Nd);
115 void distDNA_raw_pairdel(unsigned char *x, int *n, int *s, double *d, int scaled)
117 int i1, i2, s1, s2, target, Nd, L;
120 for (i1 = 1; i1 < *n; i1++) {
121 for (i2 = i1 + 1; i2 <= *n; i2++) {
123 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
124 CHECK_PAIRWISE_DELETION
125 if (DifferentBase(x[s1], x[s2])) Nd++;
127 if (scaled) d[target] = ((double) Nd/L);
128 else d[target] = ((double) Nd);
134 #define COMPUTE_DIST_JC69\
135 p = ((double) Nd/L);\
137 d[target] = 0.75 * *alpha*(pow(1 - 4*p/3, -1/ *alpha) - 1);\
138 else d[target] = -0.75*log(1 - 4*p/3);\
140 if (*gamma) var[target] = p*(1 - p)/(pow(1 - 4*p/3, -2/(*alpha + 1)) * L);\
141 else var[target] = p*(1 - p)/(pow(1 - 4*p/3, 2)*L);\
144 void distDNA_JC69(unsigned char *x, int *n, int *s, double *d,
145 int *variance, double *var, int *gamma, double *alpha)
147 int i1, i2, s1, s2, target, Nd, L;
153 for (i1 = 1; i1 < *n; i1++) {
154 for (i2 = i1 + 1; i2 <= *n; i2++) {
156 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
157 if (DifferentBase(x[s1], x[s2])) Nd++;
164 void distDNA_JC69_pairdel(unsigned char *x, int *n, int *s, double *d,
165 int *variance, double *var, int *gamma, double *alpha)
167 int i1, i2, s1, s2, target, Nd, L;
171 for (i1 = 1; i1 < *n; i1++) {
172 for (i2 = i1 + 1; i2 <= *n; i2++) {
174 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
175 CHECK_PAIRWISE_DELETION
176 if (DifferentBase(x[s1], x[s2])) Nd++;
184 #define COMPUTE_DIST_K80\
185 P = ((double) Ns/L);\
186 Q = ((double) (Nd - Ns)/L);\
191 d[target] = *alpha * (pow(a1, b) + 0.5*pow(a2, b) - 1.5)/2;\
193 else d[target] = -0.5 * log(a1 * sqrt(a2));\
196 b = -(1 / *alpha + 1);\
205 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
208 void distDNA_K80(unsigned char *x, int *n, int *s, double *d,
209 int *variance, double *var, int *gamma, double *alpha)
211 int i1, i2, s1, s2, target, Nd, Ns, L;
212 double P, Q, a1, a2, b, c1, c2, c3;
217 for (i1 = 1; i1 < *n; i1++) {
218 for (i2 = i1 + 1; i2 <= *n; i2++) {
220 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
229 void distDNA_K80_pairdel(unsigned char *x, int *n, int *s, double *d,
230 int *variance, double *var, int *gamma, double *alpha)
232 int i1, i2, s1, s2, target, Nd, Ns, L;
233 double P, Q, a1, a2, b, c1, c2, c3;
236 for (i1 = 1; i1 < *n; i1++) {
237 for (i2 = i1 + 1; i2 <= *n; i2++) {
239 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
240 CHECK_PAIRWISE_DELETION
249 #define COMPUTE_DIST_F81\
250 p = ((double) Nd/L);\
251 if (*gamma) d[target] = E * *alpha * (pow(1 - p/E, -1/ *alpha) - 1);\
252 else d[target] = -E*log(1 - p/E);\
254 if (*gamma) var[target] = p*(1 - p)/(pow(1 - p/E, -2/(*alpha + 1)) * L);\
255 else var[target] = p*(1 - p)/(pow(1 - p/E, 2)*L);\
258 void distDNA_F81(unsigned char *x, int *n, int *s, double *d, double *BF,
259 int *variance, double *var, int *gamma, double *alpha)
261 int i1, i2, s1, s2, target, Nd, L;
265 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
268 for (i1 = 1; i1 < *n; i1++) {
269 for (i2 = i1 + 1; i2 <= *n; i2++) {
271 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
272 if (DifferentBase(x[s1], x[s2])) Nd++;
279 void distDNA_F81_pairdel(unsigned char *x, int *n, int *s, double *d, double *BF,
280 int *variance, double *var, int *gamma, double *alpha)
282 int i1, i2, s1, s2, target, Nd, L;
285 E = 1 - BF[0]*BF[0] - BF[1]*BF[1] - BF[2]*BF[2] - BF[3]*BF[3];
288 for (i1 = 1; i1 < *n; i1++) {
289 for (i2 = i1 + 1; i2 <= *n; i2++) {
291 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
292 CHECK_PAIRWISE_DELETION
293 if (DifferentBase(x[s1], x[s2])) Nd++;
301 #define COUNT_TS_TV1_TV2\
302 if (SameBase(x[s1], x[s2])) continue;\
304 if ((x[s1] | x[s2]) == 152 || (x[s1] | x[s2]) == 104) {\
308 if ((x[s1] | x[s2]) == 168 || (x[s1] | x[s2]) == 88) Nv2++;
311 #define COMPUTE_DIST_K81\
312 P = ((double) (Nd - Nv1 - Nv2)/L);\
313 Q = ((double) Nv1/L);\
314 R = ((double) Nv2/L);\
318 d[target] = -0.25*log(a1*a2*a3);\
320 a = (1/a1 + 1/a2)/2;\
321 b = (1/a1 + 1/a3)/2;\
322 c = (1/a2 + 1/a3)/2;\
323 var[target] = (a*a*P + b*b*Q + c*c*R - pow(a*P + b*Q + c*R, 2))/2;\
326 void distDNA_K81(unsigned char *x, int *n, int *s, double *d,
327 int *variance, double *var)
329 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
330 double P, Q, R, a1, a2, a3, a, b, c;
335 for (i1 = 1; i1 < *n; i1++) {
336 for (i2 = i1 + 1; i2 <= *n; i2++) {
338 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
347 void distDNA_K81_pairdel(unsigned char *x, int *n, int *s, double *d,
348 int *variance, double *var)
350 int i1, i2, Nd, Nv1, Nv2, L, s1, s2, target;
351 double P, Q, R, a1, a2, a3, a, b, c;
354 for (i1 = 1; i1 < *n; i1++) {
355 for (i2 = i1 + 1; i2 <= *n; i2++) {
356 Nd = Nv1 = Nv2 = L = 0;
357 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
358 CHECK_PAIRWISE_DELETION
367 #define PREPARE_BF_F84\
368 A = (BF[0]*BF[2])/(BF[0] + BF[2]) + (BF[1]*BF[3])/(BF[1] + BF[3]);\
369 B = BF[0]*BF[2] + BF[1]*BF[3];\
370 C = (BF[0] + BF[2])*(BF[1] + BF[3]);
372 #define COMPUTE_DIST_F84\
373 P = ((double) Ns/L);\
374 Q = ((double) (Nd - Ns)/L);\
375 d[target] = -2*A*log(1 - P/(2*A) - (A - B)*Q/(2*A*C)) + 2*(A - B - C)*log(1 - Q/(2*C));\
380 a = t1/(t1 - t2 - t3);\
381 b = A*(A - B)/(t1 - t2 - t3) - (A - B - C)/(C - Q/2);\
382 var[target] = (a*a*P + b*b*Q - pow(a*P + b*Q, 2))/L;\
385 void distDNA_F84(unsigned char *x, int *n, int *s, double *d,
386 double *BF, int *variance, double *var)
388 int i1, i2, Nd, Ns, L, target, s1, s2;
389 double P, Q, A, B, C, a, b, t1, t2, t3;
395 for (i1 = 1; i1 < *n; i1++) {
396 for (i2 = i1 + 1; i2 <= *n; i2++) {
398 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
407 void distDNA_F84_pairdel(unsigned char *x, int *n, int *s, double *d,
408 double *BF, int *variance, double *var)
410 int i1, i2, Nd, Ns, L, target, s1, s2;
411 double P, Q, A, B, C, a, b, t1, t2, t3;
416 for (i1 = 1; i1 < *n; i1++) {
417 for (i2 = i1 + 1; i2 <= *n; i2++) {
419 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
420 CHECK_PAIRWISE_DELETION
429 #define COMPUTE_DIST_T92\
430 P = ((double) Ns/L);\
431 Q = ((double) (Nd - Ns)/L);\
434 d[target] = -wg*log(a1) - 0.5*(1 - wg)*log(a2);\
438 c3 = wg*(c1 - c2) + c2;\
439 var[target] = (c1*c1*P + c3*c3*Q - pow(c1*P + c3*Q, 2))/L;\
442 void distDNA_T92(unsigned char *x, int *n, int *s, double *d,
443 double *BF, int *variance, double *var)
445 int i1, i2, Nd, Ns, L, target, s1, s2;
446 double P, Q, wg, a1, a2, c1, c2, c3;
449 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
452 for (i1 = 1; i1 < *n; i1++) {
453 for (i2 = i1 + 1; i2 <= *n; i2++) {
455 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
464 void distDNA_T92_pairdel(unsigned char *x, int *n, int *s, double *d,
465 double *BF, int *variance, double *var)
467 int i1, i2, Nd, Ns, L, target, s1, s2;
468 double P, Q, wg, a1, a2, c1, c2, c3;
470 wg = 2 * (BF[1] + BF[2]) * (1 - (BF[1] + BF[2]));
473 for (i1 = 1; i1 < *n; i1++) {
474 for (i2 = i1 + 1; i2 <= *n; i2++) {
476 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
477 CHECK_PAIRWISE_DELETION
486 /* returns 1 if one of the base is adenine and
487 the other one is guanine surely, 0 otherwise */
488 #define AdenineAndGuanine(a, b) (a | b) == 200
490 /* returns 1 if one of the base is cytosine and
491 the other one is thymine surely, 0 otherwise */
492 #define CytosineAndThymine(a, b) (a | b) == 56
494 #define PREPARE_BF_TN93\
497 k1 = 2 * BF[0] * BF[2] / gR;\
498 k2 = 2 * BF[1] * BF[3] / gY;\
499 k3 = 2 * (gR * gY - BF[0]*BF[2]*gY/gR - BF[1]*BF[3]*gR/gY);
501 #define COUNT_TS1_TS2_TV\
502 if (DifferentBase(x[s1], x[s2])) {\
504 if (AdenineAndGuanine(x[s1], x[s2])) {\
508 if (CytosineAndThymine(x[s1], x[s2])) Ns2++;\
511 #define COMPUTE_DIST_TN93\
512 P1 = ((double) Ns1/L);\
513 P2 = ((double) Ns2/L);\
514 Q = ((double) (Nd - Ns1 - Ns2)/L);\
515 w1 = 1 - P1/k1 - Q/(2*gR);\
516 w2 = 1 - P2/k2 - Q/(2*gY);\
517 w3 = 1 - Q/(2*gR*gY);\
519 k4 = 2*(BF[0]*BF[2] + BF[1]*BF[3] + gR*gY);\
524 c4 = k1*c1/(2*gR) + k2*c2/(2*gY) + k3*c3/(2*gR*gY);\
525 d[target] = *alpha * (k1*pow(w1, b) + k2*pow(w2, b) + k3*pow(w3, b) - k4);\
527 k4 = 2*((BF[0]*BF[0] + BF[2]*BF[2])/(2*gR*gR) + (BF[2]*BF[2] + BF[3]*BF[3])/(2*gY*gY));\
531 c4 = k1 * c1/(2 * gR) + k2 * c2/(2 * gY) + k4 * c3;\
532 d[target] = -k1*log(w1) - k2*log(w2) - k3*log(w3);\
535 var[target] = (c1*c1*P1 + c2*c2*P2 + c4*c4*Q - pow(c1*P1 + c2*P2 + c4*Q, 2))/L;
537 void distDNA_TN93(unsigned char *x, int *n, int *s, double *d,
538 double *BF, int *variance, double *var,
539 int *gamma, double *alpha)
541 int i1, i2, Nd, Ns1, Ns2, L, target, s1, s2;
542 double P1, P2, Q, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
549 for (i1 = 1; i1 < *n; i1++) {
550 for (i2 = i1 + 1; i2 <= *n; i2++) {
552 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
561 void distDNA_TN93_pairdel(unsigned char *x, int *n, int *s, double *d,
562 double *BF, int *variance, double *var,
563 int *gamma, double *alpha)
565 int i1, i2, Nd, Ns1, Ns2, L, target, s1, s2;
566 double P1, P2, Q, gR, gY, k1, k2, k3, k4, w1, w2, w3, c1, c2, c3, c4, b;
571 for (i1 = 1; i1 < *n; i1++) {
572 for (i2 = i1 + 1; i2 <= *n; i2++) {
573 Nd = Ns1 = Ns2 = L = 0;
574 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
575 CHECK_PAIRWISE_DELETION
584 void distDNA_GG95(unsigned char *x, int *n, int *s, double *d,
585 int *variance, double *var)
587 int i1, i2, s1, s2, target, GC, Nd, Ns, tl, npair;
588 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
595 npair = *n * (*n - 1) / 2;
597 theta = (double*)R_alloc(*n, sizeof(double));
598 P = (double*)R_alloc(npair, sizeof(double));
599 Q = (double*)R_alloc(npair, sizeof(double));
600 tstvr = (double*)R_alloc(npair, sizeof(double));
602 /* get the proportion of GC (= theta) in each sequence */
603 for (i1 = 1; i1 <= *n; i1++) {
605 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n)
606 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
607 theta[i1 - 1] = ((double) GC / *s);
610 /* get the proportions of transitions and transversions,
611 and the estimates of their ratio for each pair */
613 for (i1 = 1; i1 < *n; i1++) {
614 for (i2 = i1 + 1; i2 <= *n; i2++) {
616 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
619 P[target] = ((double) Ns / *s);
620 Q[target] = ((double) (Nd - Ns) / *s);
621 A = log(1 - 2*Q[target]);
622 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
627 /* compute the mean alpha (ma) = mean Ts/Tv */
630 for (i1 = 0; i1 < npair; i1++)
631 /* some values of tstvr are -Inf if there is no
632 transversions observed: we exclude them */
633 if (R_FINITE(tstvr[i1])) {
639 /* compute the distance for each pair */
641 for (i1 = 1; i1 < *n; i1++) {
642 for (i2 = i1 + 1; i2 <= *n; i2++) {
644 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
645 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
646 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
648 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A * *s);
654 void distDNA_GG95_pairdel(unsigned char *x, int *n, int *s, double *d,
655 int *variance, double *var)
657 int i1, i2, s1, s2, target, *L, length, GC, Nd, Ns, tl, npair;
658 double *theta, gcprop, *P, pp, *Q, qq, *tstvr, svr, A, sum, ma /* mean alpha */, K1, K2;
666 npair = *n * (*n - 1) / 2;
668 theta = (double*)R_alloc(*n, sizeof(double));
669 L = (int*)R_alloc(npair, sizeof(int));
670 P = (double*)R_alloc(npair, sizeof(double));
671 Q = (double*)R_alloc(npair, sizeof(double));
672 tstvr = (double*)R_alloc(npair, sizeof(double));
674 /* get the proportion of GC (= theta) in each sequence */
675 for (i1 = 1; i1 <= *n; i1++) {
677 for (s1 = i1 - 1; s1 < i1 + *n*(*s - 1); s1 += *n) {
678 if (KnownBase(x[s1])) tl++;
680 if (IsCytosine(x[s1]) || IsGuanine(x[s1])) GC += 1;
682 theta[i1 - 1] = ((double) GC / tl);
685 /* get the proportions of transitions and transversions,
686 and the estimates of their ratio for each pair; we
687 also get the sample size for each pair in L */
689 for (i1 = 1; i1 < *n; i1++) {
690 for (i2 = i1 + 1; i2 <= *n; i2++) {
691 Nd = Ns = L[target] = 0;
692 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
693 if (KnownBase(x[s1]) && KnownBase(x[s2])) L[target]++;
697 P[target] = ((double) Ns/L[target]);
698 Q[target] = ((double) (Nd - Ns)/L[target]);
699 A = log(1 - 2*Q[target]);
700 tstvr[target] = 2*(log(1 - 2*P[target] - Q[target]) - 0.5*A)/A;
705 /* compute the mean alpha (ma) = mean Ts/Tv */
708 for (i1 = 0; i1 < npair; i1++)
709 /* some values of tstvr are -Inf if there is no
710 transversions observed: we exclude them */
711 if (R_FINITE(tstvr[i1])) {
717 /* compute the distance for each pair */
719 for (i1 = 1; i1 < *n; i1++) {
720 for (i2 = i1 + 1; i2 <= *n; i2++) {
722 K1 = 1 + ma*(theta[i1 - 1]*(1 - theta[i1 - 1]) + theta[i2 - 1]*(1 - theta[i2 - 1]));
723 K2 = ma*pow(theta[i1 - 1] - theta[i2 - 1], 2)/(ma + 1);
724 d[target] = -0.5*K1*log(A) + K2*(1 - pow(A, 0.25*(ma + 1)));
726 var[target] = pow(K1 + K2*0.5*(ma + 1)*pow(A, 0.25*(ma + 1)), 2)*Q[target]*(1 - Q[target])/(A*A*L[target]);
732 #define DO_CONTINGENCY_NUCLEOTIDES\
734 case 136 : m = 0; break;\
735 case 72 : m = 1; break;\
736 case 40 : m = 2; break;\
737 case 24 : m = 3; break;\
740 case 72 : m += 4; break;\
741 case 40 : m += 8; break;\
742 case 24 : m += 12; break;\
746 #define COMPUTE_DIST_LogDet\
747 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
748 d[target] = -log(detFourByFour(Ftab))/4 - LN4;\
750 /* For the inversion, we first make U an identity matrix */\
751 for (k = 1; k < 15; k++) U[k] = 0;\
752 U[0] = U[5] = U[10] = U[15] = 1;\
753 /* The matrix is not symmetric, so we use 'dgesv'. */\
754 /* This subroutine puts the result in U. */\
755 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
756 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
757 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
758 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
759 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
760 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
761 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
762 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
763 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] - 16)/(L*16);\
766 void distDNA_LogDet(unsigned char *x, int *n, int *s, double *d,
767 int *variance, double *var)
769 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
770 double Ftab[16], U[16];
775 for (i1 = 1; i1 < *n; i1++) {
776 for (i2 = i1 + 1; i2 <= *n; i2++) {
777 for (k = 0; k < 16; k++) Ntab[k] = 0;
778 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
779 DO_CONTINGENCY_NUCLEOTIDES
787 void distDNA_LogDet_pairdel(unsigned char *x, int *n, int *s, double *d,
788 int *variance, double *var)
790 int i1, i2, k, m, s1, s2, target, L, Ntab[16], ndim = 4, info, ipiv[16];
791 double Ftab[16], U[16];
794 for (i1 = 1; i1 < *n; i1++) {
795 for (i2 = i1 + 1; i2 <= *n; i2++) {
796 for (k = 0; k < 16; k++) Ntab[k] = 0;
798 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
799 CHECK_PAIRWISE_DELETION
800 DO_CONTINGENCY_NUCLEOTIDES
808 void distDNA_BH87(unsigned char *x, int *n, int *s, double *d,
809 int *variance, double *var)
811 For the moment there is no need to check for pairwise deletions
812 since DO_CONTINGENCY_NUCLEOTIDES considers only the known nucleotides.
813 In effect the pairwise deletion has possibly been done before.
814 The sequence length(s) are used only to compute the variances, which is
815 currently not available.
818 int i1, i2, k, kb, s1, s2, m, Ntab[16], ROWsums[4];
819 double P12[16], P21[16];
821 for (i1 = 1; i1 < *n; i1++) {
822 for (i2 = i1 + 1; i2 <= *n; i2++) {
823 for (k = 0; k < 16; k++) Ntab[k] = 0;
824 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
825 DO_CONTINGENCY_NUCLEOTIDES
828 /* get the rowwise sums of Ntab */
829 ROWsums[0] = Ntab[0] + Ntab[4] + Ntab[8] + Ntab[12];
830 ROWsums[1] = Ntab[1] + Ntab[5] + Ntab[9] + Ntab[13];
831 ROWsums[2] = Ntab[2] + Ntab[6] + Ntab[10] + Ntab[14];
832 ROWsums[3] = Ntab[3] + Ntab[7] + Ntab[11] + Ntab[15];
834 for (k = 0; k < 16; k++)
835 P12[k] = ((double) Ntab[k]);
837 /* scale each element of P12 by its rowwise sum */
838 for (k = 0; k < 4; k++)
839 for (kb = 0; kb < 16; kb += 4)
840 P12[k + kb] = P12[k + kb]/ROWsums[k];
842 d[*n*(i2 - 1) + i1 - 1] = -log(detFourByFour(P12))/4;
844 /* compute the columnwise sums of Ntab: these
845 are the rowwise sums of its transpose */
846 ROWsums[0] = Ntab[0] + Ntab[1] + Ntab[2] + Ntab[3];
847 ROWsums[1] = Ntab[4] + Ntab[5] + Ntab[6] + Ntab[7];
848 ROWsums[2] = Ntab[8] + Ntab[9] + Ntab[10] + Ntab[11];
849 ROWsums[3] = Ntab[12] + Ntab[13] + Ntab[14] + Ntab[15];
851 /* transpose Ntab and store the result in P21 */
852 for (k = 0; k < 4; k++)
853 for (kb = 0; kb < 4; kb++)
854 P21[kb + 4*k] = Ntab[k + 4*kb];
857 for (k = 0; k < 4; k++)
858 for (kb = 0; kb < 16; kb += 4)
859 P21[k + kb] = P21[k + kb]/ROWsums[k];
861 d[*n*(i1 - 1) + i2 - 1] = -log(detFourByFour(P21))/4;
866 #define COMPUTE_DIST_ParaLin\
867 for (k = 0; k < 16; k++) Ftab[k] = ((double) Ntab[k]/L);\
868 d[target] = -log(detFourByFour(Ftab)/\
869 sqrt(find[0][i1 - 1]*find[1][i1 - 1]*find[2][i1 - 1]*find[3][i1 - 1]*\
870 find[0][i2 - 1]*find[1][i2 - 1]*find[2][i2 - 1]*find[3][i2 - 1]))/4;\
872 /* For the inversion, we first make U an identity matrix */\
873 for (k = 1; k < 15; k++) U[k] = 0;\
874 U[0] = U[5] = U[10] = U[15] = 1;\
875 /* The matrix is not symmetric, so we use 'dgesv'. */\
876 /* This subroutine puts the result in U. */\
877 F77_CALL(dgesv)(&ndim, &ndim, Ftab, &ndim, ipiv, U, &ndim, &info);\
878 var[target] = (U[0]*U[0]*Ftab[0] + U[1]*U[1]*Ftab[4] +\
879 U[2]*U[2]*Ftab[8] + U[3]*U[3]*Ftab[12] +\
880 U[4]*U[4]*Ftab[1] + U[5]*U[5]*Ftab[5] +\
881 U[6]*U[6]*Ftab[9] + U[7]*U[7]*Ftab[13] +\
882 U[8]*U[8]*Ftab[2] + U[9]*U[9]*Ftab[6] +\
883 U[10]*U[10]*Ftab[10] + U[11]*U[11]*Ftab[14] +\
884 U[12]*U[12]*Ftab[3] + U[13]*U[13]*Ftab[7] +\
885 U[14]*U[14]*Ftab[11] + U[15]*U[15]*Ftab[15] -\
886 4*(1/sqrt(find[0][i1 - 1]*find[0][i2 - 1]) +\
887 1/sqrt(find[1][i1 - 1]*find[1][i2 - 1]) +\
888 1/sqrt(find[2][i1 - 1]*find[2][i2 - 1]) +\
889 1/sqrt(find[3][i1 - 1]*find[3][i2 - 1])))/(L*16);\
892 void distDNA_ParaLin(unsigned char *x, int *n, int *s, double *d,
893 int *variance, double *var)
895 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
896 double Ftab[16], U[16], *find[4];
900 for (k = 0; k < 4; k++)
901 find[k] = (double*)R_alloc(*n, sizeof(double));
903 for (i1 = 0; i1 < *n; i1++)
904 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
906 for (i1 = 0; i1 < *n; i1++) {
907 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
909 case 136 : find[0][i1]++; break;
910 case 40 : find[1][i1]++; break;
911 case 72 : find[2][i1]++; break;
912 case 24 : find[3][i1]++; break;
915 for (k = 0; k < 4; k++) find[k][i1] /= L;
919 for (i1 = 1; i1 < *n; i1++) {
920 for (i2 = i1 + 1; i2 <= *n; i2++) {
921 for (k = 0; k < 16; k++) Ntab[k] = 0;
922 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
923 DO_CONTINGENCY_NUCLEOTIDES
931 void distDNA_ParaLin_pairdel(unsigned char *x, int *n, int *s, double *d,
932 int *variance, double *var)
934 int i1, i2, k, s1, s2, m, target, L, Ntab[16], ndim = 4, info, ipiv[16];
935 double Ftab[16], U[16], *find[4];
939 for (k = 0; k < 4; k++)
940 find[k] = (double*)R_alloc(*n, sizeof(double));
942 for (i1 = 0; i1 < *n; i1++)
943 for (k = 0; k < 4; k++) find[k][i1] = 0.0;
945 for (i1 = 0; i1 < *n; i1++) {
947 for (s1 = i1; s1 < i1 + *n*(*s - 1) + 1; s1+= *n) {
948 if (KnownBase(x[s1])) {
951 case 136 : find[0][i1]++; break;
952 case 40 : find[1][i1]++; break;
953 case 72 : find[2][i1]++; break;
954 case 24 : find[3][i1]++; break;
958 for (k = 0; k < 4; k++) find[k][i1] /= L;
962 for (i1 = 1; i1 < *n; i1++) {
963 for (i2 = i1 + 1; i2 <= *n; i2++) {
965 for (k = 0; k < 16; k++) Ntab[k] = 0;
966 for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
967 CHECK_PAIRWISE_DELETION
968 DO_CONTINGENCY_NUCLEOTIDES
976 /* void BaseProportion(unsigned char *x, int *n, double *BF, int *freq) */
981 /* for (i = 0; i < *n; i++) { */
982 /* if (KnownBase(x[i])) { */
984 /* switch (x[i]) { */
985 /* case 136 : BF[0]++; break; */
986 /* case 40 : BF[1]++; break; */
987 /* case 72 : BF[2]++; break; */
988 /* case 24 : BF[3]++; break; */
992 /* if (! *freq) for (i = 0; i < 4; i++) BF[i] /= m; */
995 /* void BaseProportion(unsigned char *x, int *n, double *BF) */
999 /* for (i = 0; i < *n; i++) { */
1000 /* switch (x[i]) { */
1001 /* case 136 : BF[0]++; break; */
1002 /* case 40 : BF[1]++; break; */
1003 /* case 72 : BF[2]++; break; */
1004 /* case 24 : BF[3]++; break; */
1005 /* case 192 : BF[4]++; break; */
1006 /* case 160 : BF[5]++; break; */
1007 /* case 144 : BF[6]++; break; */
1008 /* case 96 : BF[7]++; break; */
1009 /* case 80 : BF[8]++; break; */
1010 /* case 48 : BF[9]++; break; */
1011 /* case 224 : BF[10]++; break; */
1012 /* case 176 : BF[11]++; break; */
1013 /* case 208 : BF[12]++; break; */
1014 /* case 112 : BF[13]++; break; */
1015 /* case 240 : BF[14]++; break; */
1016 /* case 4 : BF[15]++; break; */
1017 /* case 2 : BF[16]++; break; */
1022 /* a hash table is much faster than switch (2012-01-10) */
1023 void BaseProportion(unsigned char *x, int *n, double *BF)
1028 memset(count, 0, 256*sizeof(double));
1030 for (i = 0; i < *n; i++) count[x[i]]++;
1042 BF[10] = count[224];
1043 BF[11] = count[176];
1044 BF[12] = count[208];
1045 BF[13] = count[112];
1046 BF[14] = count[240];
1051 void SegSites(unsigned char *x, int *n, int *s, int *seg)
1054 unsigned char basis;
1056 for (j = 0; j < *s; j++) {
1058 while (!KnownBase(x[i])) i++;
1061 while (i < *n * (j + 1)) {
1062 if (!KnownBase(x[i]) || x[i] == basis) i++;
1071 void GlobalDeletionDNA(unsigned char *x, int *n, int *s, int *keep)
1075 for (j = 0; j < *s; j++) {
1077 while (i < *n * (j + 1)) {
1078 if (KnownBase(x[i])) i++;
1087 void dist_dna(unsigned char *x, int *n, int *s, int *model, double *d,
1088 double *BF, int *pairdel, int *variance, double *var,
1089 int *gamma, double *alpha)
1092 case 1 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 1);
1093 else distDNA_raw(x, n, s, d, 1); break;
1094 case 2 : if (pairdel) distDNA_JC69_pairdel(x, n, s, d, variance, var, gamma, alpha);
1095 else distDNA_JC69(x, n, s, d, variance, var, gamma, alpha); break;
1096 case 3 : if (pairdel) distDNA_K80_pairdel(x, n, s, d, variance, var, gamma, alpha);
1097 else distDNA_K80(x, n, s, d, variance, var, gamma, alpha); break;
1098 case 4 : if (pairdel) distDNA_F81_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1099 else distDNA_F81(x, n, s, d, BF, variance, var, gamma, alpha); break;
1100 case 5 : if (pairdel) distDNA_K81_pairdel(x, n, s, d, variance, var);
1101 else distDNA_K81(x, n, s, d, variance, var); break;
1102 case 6 : if (pairdel) distDNA_F84_pairdel(x, n, s, d, BF, variance, var);
1103 else distDNA_F84(x, n, s, d, BF, variance, var); break;
1104 case 7 : if (pairdel) distDNA_T92_pairdel(x, n, s, d, BF, variance, var);
1105 else distDNA_T92(x, n, s, d, BF, variance, var); break;
1106 case 8 : if (pairdel) distDNA_TN93_pairdel(x, n, s, d, BF, variance, var, gamma, alpha);
1107 else distDNA_TN93(x, n, s, d, BF, variance, var, gamma, alpha); break;
1108 case 9 : if (pairdel) distDNA_GG95_pairdel(x, n, s, d, variance, var);
1109 else distDNA_GG95(x, n, s, d, variance, var); break;
1110 case 10 : if (pairdel) distDNA_LogDet_pairdel(x, n, s, d, variance, var);
1111 else distDNA_LogDet(x, n, s, d, variance, var); break;
1112 case 11 : distDNA_BH87(x, n, s, d, variance, var); break;
1113 case 12 : if (pairdel) distDNA_ParaLin_pairdel(x, n, s, d, variance, var);
1114 else distDNA_ParaLin(x, n, s, d, variance, var); break;
1115 case 13 : if (pairdel) distDNA_raw_pairdel(x, n, s, d, 0);
1116 else distDNA_raw(x, n, s, d, 0); break;
1117 case 14 : if (pairdel) distDNA_TsTv(x, n, s, d, 1, 1);
1118 else distDNA_TsTv(x, n, s, d, 1, 0); break;
1119 case 15 : if (pairdel) distDNA_TsTv(x, n, s, d, 0, 1);
1120 else distDNA_TsTv(x, n, s, d, 0, 0); break;
1121 case 16 : distDNA_indel(x, n, s, d); break;
1122 case 17 : distDNA_indelblock(x, n, s, d); break;