X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bcftools%2Fbcfutils.c;h=fec06ba2131df14d0a2236c012d696ad233647ca;hb=80658e4d152b53bd55927c432ceece7702ab49d7;hp=e6a132b8d1a11714f1734cf754fe40bd48afa23f;hpb=5f5ed2477797cbd8b0c3cdd16d8abbaa2319d300;p=samtools.git diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c index e6a132b..fec06ba 100644 --- a/bcftools/bcfutils.c +++ b/bcftools/bcfutils.c @@ -5,6 +5,7 @@ #include "khash.h" KHASH_MAP_INIT_STR(str2id, int) +// FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated... void *bcf_build_refhash(bcf_hdr_t *h) { khash_t(str2id) *hash; @@ -29,6 +30,16 @@ void bcf_str2id_destroy(void *_hash) if (hash) kh_destroy(str2id, hash); // Note that strings are not freed. } +void bcf_str2id_thorough_destroy(void *_hash) +{ + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + khint_t k; + if (hash == 0) return; + for (k = 0; k < kh_end(hash); ++k) + if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); + kh_destroy(str2id, hash); +} + int bcf_str2id(void *_hash, const char *str) { khash_t(str2id) *hash = (khash_t(str2id)*)_hash; @@ -53,8 +64,9 @@ int bcf_str2id_add(void *_hash, const char *str) int bcf_shrink_alt(bcf1_t *b, int n) { char *p; - int i, j, k, *z, n_smpl = b->n_smpl; + int i, j, k, n_smpl = b->n_smpl; if (b->n_alleles <= n) return -1; + // update ALT if (n > 1) { for (p = b->alt, k = 1; *p; ++p) if (*p == ',' && ++k == n) break; @@ -63,10 +75,7 @@ int bcf_shrink_alt(bcf1_t *b, int n) ++p; memmove(p, b->flt, b->str + b->l_str - b->flt); b->l_str -= b->flt - p; - z = alloca(sizeof(int) / 2 * n * (n+1)); - for (i = k = 0; i < n; ++i) - for (j = 0; j < n - i; ++j) - z[k++] = i * b->n_alleles + j; + // update PL for (i = 0; i < b->n_gi; ++i) { bcf_ginfo_t *g = b->gi + i; if (g->fmt == bcf_str2int("PL", 2)) { @@ -75,7 +84,7 @@ int bcf_shrink_alt(bcf1_t *b, int n) g->len = n * (n + 1) / 2; for (l = k = 0; l < n_smpl; ++l) { uint8_t *dl = d + l * x; - for (j = 0; j < g->len; ++j) d[k++] = dl[z[j]]; + for (j = 0; j < g->len; ++j) d[k++] = dl[j]; } } // FIXME: to add GL } @@ -133,6 +142,54 @@ int bcf_fix_gt(bcf1_t *b) return 0; } +int bcf_fix_pl(bcf1_t *b) +{ + int i; + uint32_t tmp; + uint8_t *PL, *swap; + bcf_ginfo_t *gi; + // pinpoint PL + tmp = bcf_str2int("PL", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; + // prepare + gi = b->gi + i; + PL = (uint8_t*)gi->data; + swap = alloca(gi->len); + // loop through individuals + for (i = 0; i < b->n_smpl; ++i) { + int k, l, x; + uint8_t *PLi = PL + i * gi->len; + memcpy(swap, PLi, gi->len); + for (k = x = 0; k < b->n_alleles; ++k) + for (l = k; l < b->n_alleles; ++l) + PLi[l*(l+1)/2 + k] = swap[x++]; + } + return 0; +} + +int bcf_smpl_covered(const bcf1_t *b) +{ + int i, j, n = 0; + uint32_t tmp; + bcf_ginfo_t *gi; + // pinpoint PL + tmp = bcf_str2int("PL", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; + // count how many samples having PL!=[0..0] + gi = b->gi + i; + for (i = 0; i < b->n_smpl; ++i) { + uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len; + for (j = 0; j < gi->len; ++j) + if (PLi[j]) break; + if (j < gi->len) ++n; + } + return n; +} + static void *locate_field(const bcf1_t *b, const char *fmt, int l) { int i; @@ -147,7 +204,8 @@ int bcf_anno_max(bcf1_t *b) { int k, max_gq, max_sp, n_het; kstring_t str; - uint8_t *gt, *gq, *sp; + uint8_t *gt, *gq; + int32_t *sp; max_gq = max_sp = n_het = 0; gt = locate_field(b, "GT", 2); if (gt == 0) return -1; @@ -178,3 +236,76 @@ int bcf_anno_max(bcf1_t *b) free(str.s); return 0; } + +// FIXME: only data are shuffled; the header is NOT +int bcf_shuffle(bcf1_t *b, int seed) +{ + int i, j, *a; + if (seed > 0) srand48(seed); + a = malloc(b->n_smpl * sizeof(int)); + for (i = 0; i < b->n_smpl; ++i) a[i] = i; + for (i = b->n_smpl; i > 1; --i) { + int tmp; + j = (int)(drand48() * i); + tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; + } + for (j = 0; j < b->n_gi; ++j) { + bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap, *data = (uint8_t*)gi->data; + swap = malloc(gi->len * b->n_smpl); + for (i = 0; i < b->n_smpl; ++i) + memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len); + free(gi->data); + gi->data = swap; + } + free(a); + return 0; +} + +bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list) +{ + int i, ret, j; + khint_t k; + bcf_hdr_t *h; + khash_t(str2id) *hash; + kstring_t s; + s.l = s.m = 0; s.s = 0; + hash = kh_init(str2id); + for (i = 0; i < h0->n_smpl; ++i) { + k = kh_put(str2id, hash, h0->sns[i], &ret); + kh_val(hash, k) = i; + } + for (i = j = 0; i < n; ++i) { + k = kh_get(str2id, hash, samples[i]); + if (k != kh_end(hash)) { + list[j++] = kh_val(hash, k); + kputs(samples[i], &s); kputc('\0', &s); + } + } + if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); + kh_destroy(str2id, hash); + h = calloc(1, sizeof(bcf_hdr_t)); + *h = *h0; + h->ns = 0; h->sns = 0; + h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm); + h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt); + h->l_smpl = s.l; h->sname = s.s; + bcf_hdr_sync(h); + return h; +} + +int bcf_subsam(int n_smpl, int *list, bcf1_t *b) +{ + int i, j; + for (j = 0; j < b->n_gi; ++j) { + bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap; + swap = malloc(gi->len * b->n_smpl); + for (i = 0; i < n_smpl; ++i) + memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); + free(gi->data); + gi->data = swap; + } + b->n_smpl = n_smpl; + return 0; +}