typedef struct {
int flag, prior_type, n1, n_sub, *sublist;
- char *fn_list, *prior_file, **subsam;
+ char *fn_list, *prior_file, **subsam, *fn_dict;
double theta, pref, indel_frac;
} viewconf_t;
memset(&vc, 0, sizeof(viewconf_t));
vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.;
vc.n_sub = 0; vc.subsam = 0; vc.sublist = 0;
- while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:IMs:")) >= 0) {
+ while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:IMs:D:")) >= 0) {
switch (c) {
case '1': vc.n1 = atoi(optarg); break;
case 'l': vc.fn_list = strdup(optarg); break;
+ case 'D': vc.fn_dict = strdup(optarg); break;
case 'N': vc.flag |= VC_ACGT_ONLY; break;
case 'G': vc.flag |= VC_NO_GENO; break;
case 'A': vc.flag |= VC_KEEPALT; break;
fprintf(stderr, " -Q output the QCALL likelihood format\n");
fprintf(stderr, " -L calculate LD for adjacent sites\n");
fprintf(stderr, " -I skip indels\n");
+ fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n");
fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
fprintf(stderr, " -l FILE list of sites to output [all sites]\n");
fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta);
return 1;
}
+ if ((vc.flag & VC_VCFIN) && (vc.flag & VC_BCFOUT) && vc.fn_dict == 0) {
+ fprintf(stderr, "[%s] For VCF->BCF conversion please specify the sequence dictionary with -D\n", __func__);
+ return 1;
+ }
b = calloc(1, sizeof(bcf1_t));
blast = calloc(1, sizeof(bcf1_t));
strcpy(moder, "r");
if (vc.flag & VC_UNCOMP) strcat(modew, "u");
bp = vcf_open(argv[optind], moder);
hin = hout = vcf_hdr_read(bp);
+ if (vc.fn_dict && (vc.flag & VC_VCFIN))
+ vcf_dictread(bp, hin, vc.fn_dict);
bout = vcf_open("-", modew);
if (!(vc.flag & VC_QCALL)) {
if (vc.n_sub) {
vcf_close(bp); vcf_close(bout);
if (hash) kh_destroy(set64, hash);
if (vc.fn_list) free(vc.fn_list);
+ if (vc.fn_dict) free(vc.fn_dict);
if (vc.n_sub) {
int i;
for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]);
return bp;
}
+int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn)
+{
+ vcf_t *v;
+ gzFile fp;
+ kstream_t *ks;
+ kstring_t s, rn;
+ int dret;
+ if (bp == 0) return -1;
+ if (!bp->is_vcf) return 0;
+ s.l = s.m = 0; s.s = 0;
+ rn.m = rn.l = h->l_nm; rn.s = h->name;
+ v = (vcf_t*)bp->v;
+ fp = gzopen(fn, "r");
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
+ bcf_str2id_add(v->refhash, strdup(s.s));
+ kputs(s.s, &rn); kputc('\0', &rn);
+ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ h->l_nm = rn.l; h->name = rn.s;
+ bcf_hdr_sync(h);
+ free(s.s);
+ return 0;
+}
+
int vcf_close(bcf_t *bp)
{
vcf_t *v;
}
if (v->fpout) fclose(v->fpout);
free(v->line.s);
- bcf_str2id_destroy(v->refhash);
+ bcf_str2id_thorough_destroy(v->refhash);
free(v);
free(bp);
return 0;
int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h)
{
vcf_t *v = (vcf_t*)bp->v;
- int i, has_ref = 0, has_ver = 0;
+ int i, has_ver = 0;
if (!bp->is_vcf) return bcf_hdr_write(bp, h);
if (h->l_txt > 0) {
if (strstr(h->txt, "##fileformat=")) has_ver = 1;
if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n");
fwrite(h->txt, 1, h->l_txt - 1, v->fpout);
- if (strstr(h->txt, "##SQ=")) has_ref = 1;
}
if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n");
fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
if (k == 0) { // ref
int tid = bcf_str2id(v->refhash, p);
if (tid < 0) {
- tid = bcf_str2id_add(v->refhash, p);
+ tid = bcf_str2id_add(v->refhash, strdup(p));
kputs(p, &rn); kputc('\0', &rn);
sync = 1;
}