From: Heng Li Date: Tue, 21 Dec 2010 17:16:33 +0000 (+0000) Subject: * samtools-0.1.12-9 (r891) X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=f0b7a686f0f6581bc524353d7e783abbe9a94316;p=samtools.git * samtools-0.1.12-9 (r891) * allow to call SNPs from a subset of samples --- diff --git a/bamtk.c b/bamtk.c index 64ed4de..576ff7d 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.12-8 (r889)" +#define PACKAGE_VERSION "0.1.12-9 (r891)" #endif int bam_taf2baf(int argc, char *argv[]); diff --git a/bcftools/bcf.h b/bcftools/bcf.h index f87ac1e..84f0c2d 100644 --- a/bcftools/bcf.h +++ b/bcftools/bcf.h @@ -142,6 +142,8 @@ extern "C" { int bcf_gl2pl(bcf1_t *b); // if the site is an indel int bcf_is_indel(const bcf1_t *b); + bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list); + int bcf_subsam(int n_smpl, int *list, bcf1_t *b); // string hash table void *bcf_build_refhash(bcf_hdr_t *h); diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c index e6a132b..fb70d81 100644 --- a/bcftools/bcfutils.c +++ b/bcftools/bcfutils.c @@ -178,3 +178,44 @@ int bcf_anno_max(bcf1_t *b) free(str.s); return 0; } + +bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list) +{ + int i, ret, j; + khint_t k; + bcf_hdr_t *h; + khash_t(str2id) *hash; + kstring_t s; + s.l = s.m = 0; s.s = 0; + hash = kh_init(str2id); + for (i = 0; i < n; ++i) + k = kh_put(str2id, hash, samples[i], &ret); + for (i = j = 0; i < h0->n_smpl; ++i) { + if (kh_get(str2id, hash, h0->sns[i]) != kh_end(hash)) { + list[j++] = i; + kputs(h0->sns[i], &s); kputc('\0', &s); + } + } + if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); + kh_destroy(str2id, hash); + h = calloc(1, sizeof(bcf_hdr_t)); + *h = *h0; + h->ns = 0; h->sns = 0; + h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm); + h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt); + h->l_smpl = s.l; h->sname = s.s; + bcf_hdr_sync(h); + return h; +} + +int bcf_subsam(int n_smpl, int *list, bcf1_t *b) // list MUST BE sorted +{ + int i, j; + for (j = 0; j < b->n_gi; ++j) { + bcf_ginfo_t *gi = b->gi + j; + for (i = 0; i < n_smpl; ++i) + memcpy((uint8_t*)gi->data + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); + } + b->n_smpl = n_smpl; + return 0; +} diff --git a/bcftools/call1.c b/bcftools/call1.c index 2308607..0991fce 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -29,8 +29,8 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_ANNO_MAX 16384 typedef struct { - int flag, prior_type, n1; - char *fn_list, *prior_file; + int flag, prior_type, n1, n_sub, *sublist; + char *fn_list, *prior_file, **subsam; double theta, pref, indel_frac; } viewconf_t; @@ -201,6 +201,30 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p return is_var; } +static char **read_samples(const char *fn, int *_n) +{ + gzFile fp; + kstream_t *ks; + kstring_t s; + int dret, n = 0, max = 0; + char **sam = 0; + s.l = s.m = 0; s.s = 0; + fp = gzopen(fn, "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, &s, &dret) >= 0) { + if (max == n) { + max = max? max<<1 : 4; + sam = realloc(sam, sizeof(void*)); + } + sam[n++] = strdup(s.s); + } + ks_destroy(ks); + gzclose(fp); + free(s.s); + *_n = n; + return sam; +} + static void write_header(bcf_hdr_t *h) { kstring_t str; @@ -251,7 +275,7 @@ int bcfview(int argc, char *argv[]) uint64_t n_processed = 0; viewconf_t vc; bcf_p1aux_t *p1 = 0; - bcf_hdr_t *h; + bcf_hdr_t *hin, *hout; int tid, begin, end; char moder[4], modew[4]; khash_t(set64) *hash = 0; @@ -259,7 +283,8 @@ int bcfview(int argc, char *argv[]) tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; - while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:IM")) >= 0) { + vc.n_sub = 0; vc.subsam = 0; vc.sublist = 0; + while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:IMs:")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; case 'l': vc.fn_list = strdup(optarg); break; @@ -280,6 +305,7 @@ int bcfview(int argc, char *argv[]) case 'i': vc.indel_frac = atof(optarg); break; case 'Q': vc.flag |= VC_QCALL; break; case 'L': vc.flag |= VC_ADJLD; break; + case 's': vc.subsam = read_samples(optarg, &vc.n_sub); break; case 'P': if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL; else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2; @@ -310,6 +336,7 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); fprintf(stderr, " -p FLOAT variant if P(ref|D)n_smpl); + p1 = bcf_p1_init(hout->n_smpl); if (vc.prior_file) { if (bcf_p1_read_prior(p1, vc.prior_file) < 0) { fprintf(stderr, "[%s] fail to read the prior AFS.\n", __func__); @@ -342,9 +373,9 @@ int bcfview(int argc, char *argv[]) } if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac } - if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h); + if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, hin); if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) { - void *str2id = bcf_build_refhash(h); + void *str2id = bcf_build_refhash(hout); if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) { bcf_idx_t *idx; idx = bcf_idx_load(argv[optind]); @@ -360,8 +391,10 @@ int bcfview(int argc, char *argv[]) } } } - while (vcf_read(bp, h, b) > 0) { - int is_indel = bcf_is_indel(b); + while (vcf_read(bp, hin, b) > 0) { + int is_indel; + if (vc.n_sub) bcf_subsam(vc.n_sub, vc.sublist, b); + is_indel = bcf_is_indel(b); if ((vc.flag & VC_NO_INDEL) && is_indel) continue; if ((vc.flag & VC_ACGT_ONLY) && !is_indel) { int x; @@ -384,7 +417,7 @@ int bcfview(int argc, char *argv[]) } ++n_processed; if (vc.flag & VC_QCALL) { // output QCALL format; STOP here - bcf_2qcall(h, b); + bcf_2qcall(hout, b); continue; } if (vc.flag & (VC_CALL|VC_ADJLD)) bcf_gl2pl(b); @@ -397,7 +430,7 @@ int bcfview(int argc, char *argv[]) bcf_p1_dump_afs(p1); } if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue; - update_bcf1(h->n_smpl, b, p1, &pr, vc.pref, vc.flag); + update_bcf1(hout->n_smpl, b, p1, &pr, vc.pref, vc.flag); } if (vc.flag & VC_ADJLD) { // compute LD double f[4], r2; @@ -417,15 +450,21 @@ int bcfview(int argc, char *argv[]) b->fmt[0] = '\0'; b->l_str = b->fmt - b->str + 1; } else bcf_fix_gt(b); - vcf_write(bout, h, b); + vcf_write(bout, hout, b); } if (vc.prior_file) free(vc.prior_file); if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1); - bcf_hdr_destroy(h); + if (hin != hout) bcf_hdr_destroy(hout); + bcf_hdr_destroy(hin); bcf_destroy(b); bcf_destroy(blast); vcf_close(bp); vcf_close(bout); if (hash) kh_destroy(set64, hash); if (vc.fn_list) free(vc.fn_list); + if (vc.n_sub) { + int i; + for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]); + free(vc.subsam); free(vc.sublist); + } if (p1) bcf_p1_destroy(p1); return 0; }