From: Heng Li Date: Fri, 11 Feb 2011 03:59:47 +0000 (+0000) Subject: finished VCF->BCF conversion X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=06021e30abb66d5631a0cd5daadb8f45878e90a3;p=samtools.git finished VCF->BCF conversion --- diff --git a/bcftools/bcf.h b/bcftools/bcf.h index 6932f35..f545a91 100644 --- a/bcftools/bcf.h +++ b/bcftools/bcf.h @@ -129,6 +129,8 @@ extern "C" { int vcf_close(bcf_t *bp); // read the VCF/BCF header bcf_hdr_t *vcf_hdr_read(bcf_t *bp); + // read the sequence dictionary from a separate file; required for VCF->BCF conversion + int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn); // read a VCF/BCF record; return -1 on end-of-file and <-1 for errors int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); // write the VCF header diff --git a/bcftools/call1.c b/bcftools/call1.c index c9df3c0..286c014 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -30,7 +30,7 @@ KSTREAM_INIT(gzFile, gzread, 16384) typedef struct { int flag, prior_type, n1, n_sub, *sublist; - char *fn_list, *prior_file, **subsam; + char *fn_list, *prior_file, **subsam, *fn_dict; double theta, pref, indel_frac; } viewconf_t; @@ -286,10 +286,11 @@ int bcfview(int argc, char *argv[]) memset(&vc, 0, sizeof(viewconf_t)); vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_sub = 0; vc.subsam = 0; vc.sublist = 0; - while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:IMs:")) >= 0) { + while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:IMs:D:")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; case 'l': vc.fn_list = strdup(optarg); break; + case 'D': vc.fn_dict = strdup(optarg); break; case 'N': vc.flag |= VC_ACGT_ONLY; break; case 'G': vc.flag |= VC_NO_GENO; break; case 'A': vc.flag |= VC_KEEPALT; break; @@ -332,6 +333,7 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, " -Q output the QCALL likelihood format\n"); fprintf(stderr, " -L calculate LD for adjacent sites\n"); fprintf(stderr, " -I skip indels\n"); + fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n"); fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); fprintf(stderr, " -l FILE list of sites to output [all sites]\n"); fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta); @@ -343,6 +345,10 @@ int bcfview(int argc, char *argv[]) return 1; } + if ((vc.flag & VC_VCFIN) && (vc.flag & VC_BCFOUT) && vc.fn_dict == 0) { + fprintf(stderr, "[%s] For VCF->BCF conversion please specify the sequence dictionary with -D\n", __func__); + return 1; + } b = calloc(1, sizeof(bcf1_t)); blast = calloc(1, sizeof(bcf1_t)); strcpy(moder, "r"); @@ -352,6 +358,8 @@ int bcfview(int argc, char *argv[]) if (vc.flag & VC_UNCOMP) strcat(modew, "u"); bp = vcf_open(argv[optind], moder); hin = hout = vcf_hdr_read(bp); + if (vc.fn_dict && (vc.flag & VC_VCFIN)) + vcf_dictread(bp, hin, vc.fn_dict); bout = vcf_open("-", modew); if (!(vc.flag & VC_QCALL)) { if (vc.n_sub) { @@ -462,6 +470,7 @@ int bcfview(int argc, char *argv[]) vcf_close(bp); vcf_close(bout); if (hash) kh_destroy(set64, hash); if (vc.fn_list) free(vc.fn_list); + if (vc.fn_dict) free(vc.fn_dict); if (vc.n_sub) { int i; for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]); diff --git a/bcftools/vcf.c b/bcftools/vcf.c index b3832e7..6d3abaa 100644 --- a/bcftools/vcf.c +++ b/bcftools/vcf.c @@ -72,6 +72,33 @@ bcf_t *vcf_open(const char *fn, const char *mode) return bp; } +int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn) +{ + vcf_t *v; + gzFile fp; + kstream_t *ks; + kstring_t s, rn; + int dret; + if (bp == 0) return -1; + if (!bp->is_vcf) return 0; + s.l = s.m = 0; s.s = 0; + rn.m = rn.l = h->l_nm; rn.s = h->name; + v = (vcf_t*)bp->v; + fp = gzopen(fn, "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, &s, &dret) >= 0) { + bcf_str2id_add(v->refhash, strdup(s.s)); + kputs(s.s, &rn); kputc('\0', &rn); + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + } + ks_destroy(ks); + gzclose(fp); + h->l_nm = rn.l; h->name = rn.s; + bcf_hdr_sync(h); + free(s.s); + return 0; +} + int vcf_close(bcf_t *bp) { vcf_t *v; @@ -84,7 +111,7 @@ int vcf_close(bcf_t *bp) } if (v->fpout) fclose(v->fpout); free(v->line.s); - bcf_str2id_destroy(v->refhash); + bcf_str2id_thorough_destroy(v->refhash); free(v); free(bp); return 0; @@ -93,13 +120,12 @@ int vcf_close(bcf_t *bp) int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h) { vcf_t *v = (vcf_t*)bp->v; - int i, has_ref = 0, has_ver = 0; + int i, has_ver = 0; if (!bp->is_vcf) return bcf_hdr_write(bp, h); if (h->l_txt > 0) { if (strstr(h->txt, "##fileformat=")) has_ver = 1; if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n"); fwrite(h->txt, 1, h->l_txt - 1, v->fpout); - if (strstr(h->txt, "##SQ=")) has_ref = 1; } if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n"); fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); @@ -138,7 +164,7 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) if (k == 0) { // ref int tid = bcf_str2id(v->refhash, p); if (tid < 0) { - tid = bcf_str2id_add(v->refhash, p); + tid = bcf_str2id_add(v->refhash, strdup(p)); kputs(p, &rn); kputc('\0', &rn); sync = 1; }