From 94027d8bf194b5e62b98d8a2b240a3ab1c439b46 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 9 Aug 2010 14:43:13 +0000 Subject: [PATCH] * improved kstring (added kstrtok) * removed the limit on the format string length in bcftools * use kstrtok to parse format which fixed a bug in the old code --- bcftools/bcf.c | 29 ++++++--------- bcftools/bcf.h | 14 +++++++- bcftools/vcfout.c | 1 + kstring.c | 89 ++++++++++++++++++++++++++++++++++------------- kstring.h | 32 ++++++++++++----- 5 files changed, 112 insertions(+), 53 deletions(-) diff --git a/bcftools/bcf.c b/bcftools/bcf.c index 2cd6423..f6c26f2 100644 --- a/bcftools/bcf.c +++ b/bcftools/bcf.c @@ -100,12 +100,11 @@ int bcf_hdr_sync(bcf_hdr_t *b) return 0; } -#define char2int(s) (((int)s[0])<<8|s[1]) - int bcf_sync(int n_smpl, bcf1_t *b) { - char *p, *tmp[5], *s; + char *p, *tmp[5]; int i, n; + ks_tokaux_t aux; // set ref, alt, flt, info, fmt b->ref = b->alt = b->flt = b->info = b->fmt = 0; for (p = b->str, n = 0; p < b->str + b->l_str; ++p) @@ -127,23 +126,17 @@ int bcf_sync(int n_smpl, bcf1_t *b) memset(b->gi + old_m, 0, (b->m_gi - old_m) * sizeof(bcf_ginfo_t)); } b->n_gi = n; - for (p = s = b->fmt, n = 0; *p; ++p) { - if (*p == ':' || *(p+1) == 0) { - char *q = *p == ':'? p : p + 1; - if ((q - s) != 2) return -2; - b->gi[n].fmt = char2int(s); - s = q; - } - } + for (p = kstrtok(b->fmt, ":", &aux), n = 0; p; p = kstrtok(0, 0, &aux)) + b->gi[n++].fmt = bcf_str2int(p, aux.p - p); // set gi[i].len for (i = 0; i < b->n_gi; ++i) { - if (b->gi[i].fmt == char2int("PL")) { + if (b->gi[i].fmt == bcf_str2int("PL", 2)) { b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2; - } else if (b->gi[i].fmt == char2int("DP") || b->gi[i].fmt == char2int("HQ")) { + } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("HQ", 2)) { b->gi[i].len = 2; - } else if (b->gi[i].fmt == char2int("GQ") || b->gi[i].fmt == char2int("GT")) { + } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2)) { b->gi[i].len = 1; - } else if (b->gi[i].fmt == char2int("GL")) { + } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { b->gi[i].len = 4; } b->gi[i].data = realloc(b->gi[i].data, n_smpl * b->gi[i].len); @@ -233,16 +226,16 @@ char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b) kputc('\t', &s); for (i = 0; i < b->n_gi; ++i) { if (i) kputc(':', &s); - if (b->gi[i].fmt == char2int("PL")) { + if (b->gi[i].fmt == bcf_str2int("PL", 2)) { uint8_t *d = (uint8_t*)b->gi[i].data + j * x; int k; for (k = 0; k < x; ++k) { if (k > 0) kputc(',', &s); kputw(d[k], &s); } - } else if (b->gi[i].fmt == char2int("DP")) { + } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { kputw(((uint16_t*)b->gi[i].data)[j], &s); - } else if (b->gi[i].fmt == char2int("GQ")) { + } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { kputw(((uint8_t*)b->gi[i].data)[j], &s); } } diff --git a/bcftools/bcf.h b/bcftools/bcf.h index 0923574..3c0ab2c 100644 --- a/bcftools/bcf.h +++ b/bcftools/bcf.h @@ -6,7 +6,8 @@ #include "bgzf.h" typedef struct { - int fmt, len; // len is the unit length + uint32_t fmt; + int len; // len is the unit length void *data; // derived info: fmt, len } bcf_ginfo_t; @@ -73,4 +74,15 @@ extern "C" { } #endif +static inline uint32_t bcf_str2int(const char *str, int l) +{ + int i; + uint32_t x = 0; + for (i = 0; i < l && i < 4; ++i) { + if (str[i] == 0) return x; + x = x<<8 | str[i]; + } + return x; +} + #endif diff --git a/bcftools/vcfout.c b/bcftools/vcfout.c index 989e831..dfcb25a 100644 --- a/bcftools/vcfout.c +++ b/bcftools/vcfout.c @@ -175,6 +175,7 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: bcftools view [options] [reg]\n\n"); fprintf(stderr, "Options: -c SNP calling\n"); + fprintf(stderr, " -b output BCF instead of VCF\n"); fprintf(stderr, " -G suppress all individual genotype information\n"); fprintf(stderr, " -L discard the PL genotype field\n"); fprintf(stderr, " -v output potential variant sites only\n"); diff --git a/kstring.c b/kstring.c index e0203fa..a09180b 100644 --- a/kstring.c +++ b/kstring.c @@ -24,6 +24,24 @@ int ksprintf(kstring_t *s, const char *fmt, ...) return l; } +char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux) +{ + const char *p, *start; + if (sep) { // set up the table + if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished + aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; + for (p = sep; *p; ++p) + aux->tab[*p/64] |= 1ull<<(*p%64); + } + if (str) aux->p = str - 1, aux->tab[0] &= ~1ull; + else if (aux->tab[0]&1) return 0; + for (p = start = aux->p + 1; *p; ++p) + if (aux->tab[*p/64]>>(*p%64)) break; + aux->p = p; // end of token + if (*p == 0) aux->tab[0] |= 1; // no more tokens + return (char*)start; +} + // s MUST BE a null terminated string; l = strlen(s) int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) { @@ -66,11 +84,13 @@ int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) * Boyer-Moore search * **********************/ +typedef unsigned char ubyte_t; + // reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html -int *ksBM_prep(const uint8_t *pat, int m) +static int *ksBM_prep(const ubyte_t *pat, int m) { int i, *suff, *prep, *bmGs, *bmBc; - prep = calloc(m + 256, 1); + prep = calloc(m + 256, sizeof(int)); bmGs = prep; bmBc = prep + m; { // preBmBc() for (i = 0; i < 256; ++i) bmBc[i] = m; @@ -107,39 +127,49 @@ int *ksBM_prep(const uint8_t *pat, int m) return prep; } -int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches) +void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep) { - int i, j, *prep, *bmGs, *bmBc; - int *matches = 0, mm = 0, nm = 0; - prep = _prep? _prep : ksBM_prep(pat, m); + int i, j, *prep = 0, *bmGs, *bmBc; + const ubyte_t *str, *pat; + str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat; + prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep; + if (_prep && *_prep == 0) *_prep = prep; bmGs = prep; bmBc = prep + m; j = 0; while (j <= n - m) { for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); - if (i < 0) { - if (nm == mm) { - mm = mm? mm<<1 : 1; - matches = realloc(matches, mm * sizeof(int)); - } - matches[nm++] = j; - j += bmGs[0]; - } else { + if (i >= 0) { int max = bmBc[str[i+j]] - m + 1 + i; if (max < bmGs[i]) max = bmGs[i]; j += max; - } + } else return (void*)(str + j); } - *n_matches = nm; if (_prep == 0) free(prep); - return matches; + return 0; } +char *kstrstr(const char *str, const char *pat, int **_prep) +{ + return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep); +} + +char *kstrnstr(const char *str, const char *pat, int n, int **_prep) +{ + return (char*)kmemmem(str, n, pat, strlen(pat), _prep); +} + +/*********************** + * The main() function * + ***********************/ + #ifdef KSTRING_MAIN #include int main() { kstring_t *s; int *fields, n, i; + ks_tokaux_t aux; + char *p; s = (kstring_t*)calloc(1, sizeof(kstring_t)); // test ksprintf() ksprintf(s, " abcdefg: %d ", 100); @@ -148,17 +178,26 @@ int main() fields = ksplit(s, 0, &n); for (i = 0; i < n; ++i) printf("field[%d] = '%s'\n", i, s->s + fields[i]); - free(s); + // test kstrtok() + s->l = 0; + for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) { + kputsn(p, aux.p - p, s); + kputc('\n', s); + } + printf("%s", s->s); + // free + free(s->s); free(s); free(fields); { - static char *str = "abcdefgcdg"; + static char *str = "abcdefgcdgcagtcakcdcd"; static char *pat = "cd"; - int n, *matches; - matches = ksBM_search(str, strlen(str), pat, strlen(pat), 0, &n); - printf("%d: \n", n); - for (i = 0; i < n; ++i) - printf("- %d\n", matches[i]); - free(matches); + char *ret, *s = str; + int *prep = 0; + while ((ret = kstrstr(s, pat, &prep)) != 0) { + printf("match: %s\n", ret); + s = ret + prep[0]; + } + free(prep); } return 0; } diff --git a/kstring.h b/kstring.h index 925117a..c46a62b 100644 --- a/kstring.h +++ b/kstring.h @@ -17,17 +17,31 @@ typedef struct __kstring_t { } kstring_t; #endif -int ksprintf(kstring_t *s, const char *fmt, ...); -int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); +typedef struct { + uint64_t tab[4]; + const char *p; // end of the current token +} ks_tokaux_t; -// calculate the auxiliary array, allocated by calloc() -int *ksBM_prep(const uint8_t *pat, int m); +#ifdef __cplusplus +extern "C" { +#endif + + int ksprintf(kstring_t *s, const char *fmt, ...); + int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + char *kstrstr(const char *str, const char *pat, int **_prep); + char *kstrnstr(const char *str, const char *pat, int n, int **_prep); + void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); -/* Search pat in str and returned the list of matches. The size of the - * list is returned as n_matches. _prep is the array returned by - * ksBM_prep(). If it is a NULL pointer, ksBM_prep() will be called. */ -int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches); + /* kstrtok() is similar to strtok_r() except that str is not + * modified and both str and sep can be NULL. For efficiency, it is + * actually recommended to set both to NULL in the subsequent calls + * if sep is not changed. */ + char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); +#ifdef __cplusplus +} +#endif + static inline int kputsn(const char *p, int l, kstring_t *s) { if (s->l + l + 1 >= s->m) { @@ -35,7 +49,7 @@ static inline int kputsn(const char *p, int l, kstring_t *s) kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); } - strncpy(s->s + s->l, p, l); + memcpy(s->s + s->l, p, l); s->l += l; s->s[s->l] = 0; return l; -- 2.39.2