-/* dist_dna.c 2011-06-23 */
+/* dist_dna.c 2012-11-28 */
-/* Copyright 2005-2011 Emmanuel Paradis
+/* Copyright 2005-2012 Emmanuel Paradis
/* This file is part of the R-package `ape'. */
/* See the file ../COPYING for licensing issues. */
-#include <R.h>
#include <R_ext/Lapack.h>
+#include "ape.h"
/* from R: print(log(4), d = 22) */
#define LN4 1.386294361119890572454
}\
if (IsPyrimidine(x[s1]) && IsPyrimidine(x[s2])) Ns++;
+void distDNA_indel(unsigned char *x, int *n, int *s, double *d)
+{
+ int i1, i2, s1, s2, target, N;
+
+ target = 0;
+ for (i1 = 1; i1 < *n; i1++) {
+ for (i2 = i1 + 1; i2 <= *n; i2++) {
+ N = 0;
+
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
+ if ((x[s1] ^ x[s2]) & 4) N++;
+
+ d[target] = ((double) N);
+ target++;
+ }
+ }
+}
+
+#define X(i, j) i - 1 + *n * (j - 1)
+
+void distDNA_indelblock(unsigned char *x, int *n, int *s, double *d)
+{
+ int i1, i2, s1, s2, target, N, start_block, end_block;
+
+ for (i1 = 1; i1 <= *n; i1++) {
+
+/* find a block of one or more '-' */
+
+ s1 = 1;
+ while (s1 < *s) {
+ if (x[X(i1, s1)] == 4) {
+ start_block = s1;
+ while (x[X(i1, s1)] == 4) s1++;
+ end_block = s1 - 1;
+
+/* then look if the same block is present in all the other sequences */
+
+ for (i2 = 1; i2 <= *n; i2++) {
+ if (i2 == i1) continue;
+
+ target = give_index(i1, i2, *n);
+
+ if (start_block > 1) {
+ if (x[X(i2, start_block - 1)] == 4) {
+ d[target]++;
+ continue;
+ }
+ }
+ if (end_block < *s) {
+ if (x[X(i2, end_block + 1)] == 4) {
+ d[target]++;
+ continue;
+ }
+ }
+ for (s2 = start_block; s2 <= end_block; s2++) {
+ if (x[X(i2, s2)] != 4) {
+ d[target]++;
+ continue;
+ }
+ }
+ }
+ s1 = end_block + 1;
+ }
+ s1++;
+ }
+ }
+}
+
+#undef X
+
void distDNA_TsTv(unsigned char *x, int *n, int *s, double *d, int Ts, int pairdel)
{
int i1, i2, s1, s2, target, Nd, Ns;
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = Ns = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
if (pairdel && !(KnownBase(x[s1]) && KnownBase(x[s2]))) continue;
COUNT_TS_TV
}
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
if (DifferentBase(x[s1], x[s2])) Nd++;
if (scaled) d[target] = ((double) Nd / *s);
else d[target] = ((double) Nd);
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = L = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
CHECK_PAIRWISE_DELETION
if (DifferentBase(x[s1], x[s2])) Nd++;
}
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n)
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n)
if (DifferentBase(x[s1], x[s2])) Nd++;
COMPUTE_DIST_JC69
target++;
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = Ns = L = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
CHECK_PAIRWISE_DELETION
COUNT_TS_TV
}
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = L = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
CHECK_PAIRWISE_DELETION
if (DifferentBase(x[s1], x[s2])) Nd++;
}
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = Nv1 = Nv2 = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
COUNT_TS_TV1_TV2
}
COMPUTE_DIST_K81
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = Nv1 = Nv2 = L = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
CHECK_PAIRWISE_DELETION
COUNT_TS_TV1_TV2
}
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = Ns = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
COUNT_TS_TV
}
COMPUTE_DIST_F84
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = Ns = L = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
CHECK_PAIRWISE_DELETION
COUNT_TS_TV
}
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = Ns = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
COUNT_TS_TV
}
COMPUTE_DIST_T92
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = Ns = L = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
CHECK_PAIRWISE_DELETION
COUNT_TS_TV
}
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = Ns1 = Ns2 = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
COUNT_TS1_TS2_TV
}
COMPUTE_DIST_TN93
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = Ns1 = Ns2 = L = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
CHECK_PAIRWISE_DELETION
COUNT_TS1_TS2_TV
}
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = Ns = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
COUNT_TS_TV
}
P[target] = ((double) Ns / *s);
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
Nd = Ns = L[target] = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
if (KnownBase(x[s1]) && KnownBase(x[s2])) L[target]++;
else continue;
COUNT_TS_TV
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
for (k = 0; k < 16; k++) Ntab[k] = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
DO_CONTINGENCY_NUCLEOTIDES
}
COMPUTE_DIST_LogDet
for (i2 = i1 + 1; i2 <= *n; i2++) {
for (k = 0; k < 16; k++) Ntab[k] = 0;
L = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
CHECK_PAIRWISE_DELETION
DO_CONTINGENCY_NUCLEOTIDES
}
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
for (k = 0; k < 16; k++) Ntab[k] = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
DO_CONTINGENCY_NUCLEOTIDES
}
for (i1 = 1; i1 < *n; i1++) {
for (i2 = i1 + 1; i2 <= *n; i2++) {
for (k = 0; k < 16; k++) Ntab[k] = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
DO_CONTINGENCY_NUCLEOTIDES
}
COMPUTE_DIST_ParaLin
for (i2 = i1 + 1; i2 <= *n; i2++) {
L = 0;
for (k = 0; k < 16; k++) Ntab[k] = 0;
- for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) {
+ for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) {
CHECK_PAIRWISE_DELETION
DO_CONTINGENCY_NUCLEOTIDES
}
}
}
-/* void BaseProportion(unsigned char *x, int *n, double *BF, int *freq) */
-/* { */
-/* int i, m; */
-
-/* m = 0; */
-/* for (i = 0; i < *n; i++) { */
-/* if (KnownBase(x[i])) { */
-/* m++; */
-/* switch (x[i]) { */
-/* case 136 : BF[0]++; break; */
-/* case 40 : BF[1]++; break; */
-/* case 72 : BF[2]++; break; */
-/* case 24 : BF[3]++; break; */
-/* } */
-/* } */
-/* } */
-/* if (! *freq) for (i = 0; i < 4; i++) BF[i] /= m; */
-/* } */
-
+/* a hash table is much faster than switch (2012-01-10) */
void BaseProportion(unsigned char *x, int *n, double *BF)
{
- int i;
-
- for (i = 0; i < *n; i++) {
- switch (x[i]) {
- case 136 : BF[0]++; break;
- case 40 : BF[1]++; break;
- case 72 : BF[2]++; break;
- case 24 : BF[3]++; break;
- case 192 : BF[4]++; break;
- case 160 : BF[5]++; break;
- case 144 : BF[6]++; break;
- case 96 : BF[7]++; break;
- case 80 : BF[8]++; break;
- case 48 : BF[9]++; break;
- case 224 : BF[10]++; break;
- case 176 : BF[11]++; break;
- case 208 : BF[12]++; break;
- case 112 : BF[13]++; break;
- case 240 : BF[14]++; break;
- case 4 : BF[15]++; break;
- case 2 : BF[16]++; break;
- }
- }
+ int i;
+ double count[256];
+
+ memset(count, 0, 256*sizeof(double));
+
+ for (i = 0; i < *n; i++) count[x[i]]++;
+
+ BF[0] = count[136];
+ BF[1] = count[40];
+ BF[2] = count[72];
+ BF[3] = count[24];
+ BF[4] = count[192];
+ BF[5] = count[160];
+ BF[6] = count[144];
+ BF[7] = count[96];
+ BF[8] = count[80];
+ BF[9] = count[48];
+ BF[10] = count[224];
+ BF[11] = count[176];
+ BF[12] = count[208];
+ BF[13] = count[112];
+ BF[14] = count[240];
+ BF[15] = count[4];
+ BF[16] = count[2];
}
void SegSites(unsigned char *x, int *n, int *s, int *seg)
else distDNA_TsTv(x, n, s, d, 1, 0); break;
case 15 : if (pairdel) distDNA_TsTv(x, n, s, d, 0, 1);
else distDNA_TsTv(x, n, s, d, 0, 0); break;
-
+ case 16 : distDNA_indel(x, n, s, d); break;
+ case 17 : distDNA_indelblock(x, n, s, d); break;
}
}
+
+void where(unsigned char *x, unsigned char *pat, int *s, int *p,
+ int *ans, int *n)
+{
+ int i, j, k, ln;
+
+ ln = 0; /* local n */
+
+ for (i = 0; i <= *s - *p; i++) {
+ k = i; j = 0;
+ while (1) {
+ if (x[k] != pat[j]) break;
+ j++; k++;
+ if (j == *p) {
+ ans[ln++] = k - 1;
+ break;
+ }
+ }
+ }
+ *n = ln;
+}