X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Fdist_dna.c;h=c087212dc1b7477435a1fcf6b69630c6811a3466;hb=646d8be3497d656d95331e2c7f6bef5d2ff86b2c;hp=210479e37b2ba29c623e9379d7dfffe621ba24db;hpb=6ee9e0a4e1e6bbc09187382bfdef57fafe3844c7;p=ape.git diff --git a/src/dist_dna.c b/src/dist_dna.c index 210479e..c087212 100644 --- a/src/dist_dna.c +++ b/src/dist_dna.c @@ -1,12 +1,12 @@ -/* dist_dna.c 2011-06-23 */ +/* dist_dna.c 2012-11-28 */ -/* Copyright 2005-2011 Emmanuel Paradis +/* Copyright 2005-2012 Emmanuel Paradis /* This file is part of the R-package `ape'. */ /* See the file ../COPYING for licensing issues. */ -#include #include +#include "ape.h" /* from R: print(log(4), d = 22) */ #define LN4 1.386294361119890572454 @@ -73,6 +73,76 @@ double detFourByFour(double *x) }\ if (IsPyrimidine(x[s1]) && IsPyrimidine(x[s2])) Ns++; +void distDNA_indel(unsigned char *x, int *n, int *s, double *d) +{ + int i1, i2, s1, s2, target, N; + + target = 0; + for (i1 = 1; i1 < *n; i1++) { + for (i2 = i1 + 1; i2 <= *n; i2++) { + N = 0; + + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) + if ((x[s1] ^ x[s2]) & 4) N++; + + d[target] = ((double) N); + target++; + } + } +} + +#define X(i, j) i - 1 + *n * (j - 1) + +void distDNA_indelblock(unsigned char *x, int *n, int *s, double *d) +{ + int i1, i2, s1, s2, target, N, start_block, end_block; + + for (i1 = 1; i1 <= *n; i1++) { + +/* find a block of one or more '-' */ + + s1 = 1; + while (s1 < *s) { + if (x[X(i1, s1)] == 4) { + start_block = s1; + while (x[X(i1, s1)] == 4) s1++; + end_block = s1 - 1; + +/* then look if the same block is present in all the other sequences */ + + for (i2 = 1; i2 <= *n; i2++) { + if (i2 == i1) continue; + + target = give_index(i1, i2, *n); + + if (start_block > 1) { + if (x[X(i2, start_block - 1)] == 4) { + d[target]++; + continue; + } + } + if (end_block < *s) { + if (x[X(i2, end_block + 1)] == 4) { + d[target]++; + continue; + } + } + for (s2 = start_block; s2 <= end_block; s2++) { + if (x[X(i2, s2)] != 4) { + d[target]++; + continue; + } + } + } + s1 = end_block + 1; + } + s1++; + } + } +} + +#undef X + void distDNA_TsTv(unsigned char *x, int *n, int *s, double *d, int Ts, int pairdel) { int i1, i2, s1, s2, target, Nd, Ns; @@ -81,7 +151,7 @@ void distDNA_TsTv(unsigned char *x, int *n, int *s, double *d, int Ts, int paird for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = Ns = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { if (pairdel && !(KnownBase(x[s1]) && KnownBase(x[s2]))) continue; COUNT_TS_TV } @@ -100,7 +170,7 @@ void distDNA_raw(unsigned char *x, int *n, int *s, double *d, int scaled) for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) if (DifferentBase(x[s1], x[s2])) Nd++; if (scaled) d[target] = ((double) Nd / *s); else d[target] = ((double) Nd); @@ -117,7 +187,7 @@ void distDNA_raw_pairdel(unsigned char *x, int *n, int *s, double *d, int scaled for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = L = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { CHECK_PAIRWISE_DELETION if (DifferentBase(x[s1], x[s2])) Nd++; } @@ -150,7 +220,7 @@ void distDNA_JC69(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) if (DifferentBase(x[s1], x[s2])) Nd++; COMPUTE_DIST_JC69 target++; @@ -233,7 +303,7 @@ void distDNA_K80_pairdel(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = Ns = L = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { CHECK_PAIRWISE_DELETION COUNT_TS_TV } @@ -285,7 +355,7 @@ void distDNA_F81_pairdel(unsigned char *x, int *n, int *s, double *d, double *BF for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = L = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { CHECK_PAIRWISE_DELETION if (DifferentBase(x[s1], x[s2])) Nd++; } @@ -332,7 +402,7 @@ void distDNA_K81(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = Nv1 = Nv2 = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { COUNT_TS_TV1_TV2 } COMPUTE_DIST_K81 @@ -351,7 +421,7 @@ void distDNA_K81_pairdel(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = Nv1 = Nv2 = L = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { CHECK_PAIRWISE_DELETION COUNT_TS_TV1_TV2 } @@ -392,7 +462,7 @@ void distDNA_F84(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = Ns = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { COUNT_TS_TV } COMPUTE_DIST_F84 @@ -413,7 +483,7 @@ void distDNA_F84_pairdel(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = Ns = L = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { CHECK_PAIRWISE_DELETION COUNT_TS_TV } @@ -449,7 +519,7 @@ void distDNA_T92(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = Ns = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { COUNT_TS_TV } COMPUTE_DIST_T92 @@ -470,7 +540,7 @@ void distDNA_T92_pairdel(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = Ns = L = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { CHECK_PAIRWISE_DELETION COUNT_TS_TV } @@ -546,7 +616,7 @@ void distDNA_TN93(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = Ns1 = Ns2 = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { COUNT_TS1_TS2_TV } COMPUTE_DIST_TN93 @@ -568,7 +638,7 @@ void distDNA_TN93_pairdel(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = Ns1 = Ns2 = L = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { CHECK_PAIRWISE_DELETION COUNT_TS1_TS2_TV } @@ -610,7 +680,7 @@ void distDNA_GG95(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = Ns = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { COUNT_TS_TV } P[target] = ((double) Ns / *s); @@ -686,7 +756,7 @@ void distDNA_GG95_pairdel(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { Nd = Ns = L[target] = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { if (KnownBase(x[s1]) && KnownBase(x[s2])) L[target]++; else continue; COUNT_TS_TV @@ -772,7 +842,7 @@ void distDNA_LogDet(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { for (k = 0; k < 16; k++) Ntab[k] = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { DO_CONTINGENCY_NUCLEOTIDES } COMPUTE_DIST_LogDet @@ -792,7 +862,7 @@ void distDNA_LogDet_pairdel(unsigned char *x, int *n, int *s, double *d, for (i2 = i1 + 1; i2 <= *n; i2++) { for (k = 0; k < 16; k++) Ntab[k] = 0; L = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { CHECK_PAIRWISE_DELETION DO_CONTINGENCY_NUCLEOTIDES } @@ -818,7 +888,7 @@ void distDNA_BH87(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { for (k = 0; k < 16; k++) Ntab[k] = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { DO_CONTINGENCY_NUCLEOTIDES } @@ -916,7 +986,7 @@ void distDNA_ParaLin(unsigned char *x, int *n, int *s, double *d, for (i1 = 1; i1 < *n; i1++) { for (i2 = i1 + 1; i2 <= *n; i2++) { for (k = 0; k < 16; k++) Ntab[k] = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { DO_CONTINGENCY_NUCLEOTIDES } COMPUTE_DIST_ParaLin @@ -960,7 +1030,7 @@ void distDNA_ParaLin_pairdel(unsigned char *x, int *n, int *s, double *d, for (i2 = i1 + 1; i2 <= *n; i2++) { L = 0; for (k = 0; k < 16; k++) Ntab[k] = 0; - for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1+= *n, s2 += *n) { + for (s1 = i1 - 1, s2 = i2 - 1; s1 < i1 + *n*(*s - 1); s1 += *n, s2 += *n) { CHECK_PAIRWISE_DELETION DO_CONTINGENCY_NUCLEOTIDES } @@ -970,50 +1040,33 @@ void distDNA_ParaLin_pairdel(unsigned char *x, int *n, int *s, double *d, } } -/* void BaseProportion(unsigned char *x, int *n, double *BF, int *freq) */ -/* { */ -/* int i, m; */ - -/* m = 0; */ -/* for (i = 0; i < *n; i++) { */ -/* if (KnownBase(x[i])) { */ -/* m++; */ -/* switch (x[i]) { */ -/* case 136 : BF[0]++; break; */ -/* case 40 : BF[1]++; break; */ -/* case 72 : BF[2]++; break; */ -/* case 24 : BF[3]++; break; */ -/* } */ -/* } */ -/* } */ -/* if (! *freq) for (i = 0; i < 4; i++) BF[i] /= m; */ -/* } */ - +/* a hash table is much faster than switch (2012-01-10) */ void BaseProportion(unsigned char *x, int *n, double *BF) { - int i; - - for (i = 0; i < *n; i++) { - switch (x[i]) { - case 136 : BF[0]++; break; - case 40 : BF[1]++; break; - case 72 : BF[2]++; break; - case 24 : BF[3]++; break; - case 192 : BF[4]++; break; - case 160 : BF[5]++; break; - case 144 : BF[6]++; break; - case 96 : BF[7]++; break; - case 80 : BF[8]++; break; - case 48 : BF[9]++; break; - case 224 : BF[10]++; break; - case 176 : BF[11]++; break; - case 208 : BF[12]++; break; - case 112 : BF[13]++; break; - case 240 : BF[14]++; break; - case 4 : BF[15]++; break; - case 2 : BF[16]++; break; - } - } + int i; + double count[256]; + + memset(count, 0, 256*sizeof(double)); + + for (i = 0; i < *n; i++) count[x[i]]++; + + BF[0] = count[136]; + BF[1] = count[40]; + BF[2] = count[72]; + BF[3] = count[24]; + BF[4] = count[192]; + BF[5] = count[160]; + BF[6] = count[144]; + BF[7] = count[96]; + BF[8] = count[80]; + BF[9] = count[48]; + BF[10] = count[224]; + BF[11] = count[176]; + BF[12] = count[208]; + BF[13] = count[112]; + BF[14] = count[240]; + BF[15] = count[4]; + BF[16] = count[2]; } void SegSites(unsigned char *x, int *n, int *s, int *seg) @@ -1086,6 +1139,28 @@ void dist_dna(unsigned char *x, int *n, int *s, int *model, double *d, else distDNA_TsTv(x, n, s, d, 1, 0); break; case 15 : if (pairdel) distDNA_TsTv(x, n, s, d, 0, 1); else distDNA_TsTv(x, n, s, d, 0, 0); break; - + case 16 : distDNA_indel(x, n, s, d); break; + case 17 : distDNA_indelblock(x, n, s, d); break; } } + +void where(unsigned char *x, unsigned char *pat, int *s, int *p, + int *ans, int *n) +{ + int i, j, k, ln; + + ln = 0; /* local n */ + + for (i = 0; i <= *s - *p; i++) { + k = i; j = 0; + while (1) { + if (x[k] != pat[j]) break; + j++; k++; + if (j == *p) { + ans[ln++] = k - 1; + break; + } + } + } + *n = ln; +}