X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bcftools%2Fcall1.c;h=1eacd4f731f1e35866a024d024ffe24cf4d8b49b;hb=dda27af89b70c52e075f6531d56c0bbccbc7246d;hp=75483e921a9ee10c45bfe6e4acf8e2ad3261f42c;hpb=082c52f3cb5517db50987bf1dc43aef845c45fd8;p=samtools.git diff --git a/bcftools/call1.c b/bcftools/call1.c index 75483e9..1eacd4f 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -13,16 +13,20 @@ KHASH_SET_INIT_INT64(set64) #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) -#define VC_NO_PL 1 #define VC_NO_GENO 2 #define VC_BCFOUT 4 #define VC_CALL 8 #define VC_VARONLY 16 #define VC_VCFIN 32 #define VC_UNCOMP 64 +#define VC_HWE 128 +#define VC_KEEPALT 256 +#define VC_ACGT_ONLY 512 +#define VC_QCALL 1024 +#define VC_CALL_GT 2048 typedef struct { - int flag, prior_type; + int flag, prior_type, n1; char *fn_list, *prior_file; double theta, pref; } viewconf_t; @@ -129,7 +133,7 @@ static int test16(bcf1_t *b, anno16_t *a) return test16_core(anno, a); } -static void rm_info(int n_smpl, bcf1_t *b, const char *key) +static void rm_info(bcf1_t *b, const char *key) { char *p, *q; if ((p = strstr(b->info, key)) == 0) return; @@ -137,7 +141,7 @@ static void rm_info(int n_smpl, bcf1_t *b, const char *key) if (p > b->info && *(p-1) == ';') --p; memmove(p, q, b->l_str - (q - b->str)); b->l_str -= q - p; - bcf_sync(n_smpl, b); + bcf_sync(b); } static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag) @@ -147,34 +151,51 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p double p_hwe, r = is_var? pr->p_ref : 1. - pr->p_ref; anno16_t a; - p_hwe = test_hwe(pr->g); + p_hwe = pr->g[0] >= 0.? test_hwe(pr->g) : 1.0; // only do HWE g[] is calculated test16(b, &a); - rm_info(n_smpl, b, "I16="); + rm_info(b, "I16="); memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); - if (is_var) { - kputs(b->alt, &s); - } - kputc('\0', &s); kputc('\0', &s); + kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); kputs(b->info, &s); if (b->info[0]) kputc(';', &s); ksprintf(&s, "AF1=%.3lf;AFE=%.3lf", 1.-pr->f_em, 1.-pr->f_exp); ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); - if (a.is_tested) ksprintf(&s, ";PV4=%.2lg,%.2lg,%.2lg,%.2lg", a.p[0], a.p[1], a.p[2], a.p[3]); - if (p_hwe <= .2) ksprintf(&s, ";GC=%.2lf,%.2lf,%.2lf;HWE=%.3lf", pr->g[2], pr->g[1], pr->g[0], p_hwe); + if (a.is_tested) { + if (pr->pc[0] >= 0.) ksprintf(&s, ";PC4=%lg,%lg,%lg,%lg", pr->pc[0], pr->pc[1], pr->pc[2], pr->pc[3]); + ksprintf(&s, ";PV4=%.2lg,%.2lg,%.2lg,%.2lg", a.p[0], a.p[1], a.p[2], a.p[3]); + } + if (pr->g[0] >= 0. && p_hwe <= .2) + ksprintf(&s, ";GC=%.2lf,%.2lf,%.2lf;HWE=%.3lf", pr->g[2], pr->g[1], pr->g[0], p_hwe); kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); free(b->str); b->m_str = s.m; b->l_str = s.l; b->str = s.s; - b->qual = r < 1e-100? 99 : -3.434 * log(r); + b->qual = r < 1e-100? 99 : -4.343 * log(r); if (b->qual > 99) b->qual = 99; - bcf_sync(n_smpl, b); + bcf_sync(b); + if (!is_var) bcf_shrink_alt(b, 1); + else if (!(flag&VC_KEEPALT)) + bcf_shrink_alt(b, pr->rank0 < 2? 2 : pr->rank0+1); + if (is_var && (flag&VC_CALL_GT)) { // call individual genotype + int i, x, old_n_gi = b->n_gi; + s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; + kputs(":GT:GQ", &s); kputc('\0', &s); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + for (i = 0; i < b->n_smpl; ++i) { + x = bcf_p1_call_gt(pa, pr->f_em, i); + ((uint8_t*)b->gi[old_n_gi].data)[i] = (x&3) == 0? 1<<3|1 : (x&3) == 1? 1 : 0; + ((uint8_t*)b->gi[old_n_gi+1].data)[i] = x>>2; + } + } return is_var; } int bcfview(int argc, char *argv[]) { + extern int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b); bcf_t *bp, *bout = 0; bcf1_t *b; int c; @@ -188,19 +209,24 @@ int bcfview(int argc, char *argv[]) tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); - vc.prior_type = -1; vc.theta = 1e-3; vc.pref = 0.9; - while ((c = getopt(argc, argv, "l:cGvLbSuP:t:p:")) >= 0) { + vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; + while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:Qg")) >= 0) { switch (c) { + case '1': vc.n1 = atoi(optarg); break; case 'l': vc.fn_list = strdup(optarg); break; + case 'N': vc.flag |= VC_ACGT_ONLY; break; case 'G': vc.flag |= VC_NO_GENO; break; - case 'L': vc.flag |= VC_NO_PL; break; + case 'A': vc.flag |= VC_KEEPALT; break; case 'b': vc.flag |= VC_BCFOUT; break; case 'S': vc.flag |= VC_VCFIN; break; case 'c': vc.flag |= VC_CALL; break; - case 'v': vc.flag |= VC_VARONLY; break; + case 'v': vc.flag |= VC_VARONLY | VC_CALL; break; case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break; + case 'H': vc.flag |= VC_HWE; break; + case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break; case 't': vc.theta = atof(optarg); break; case 'p': vc.pref = atof(optarg); break; + case 'Q': vc.flag |= VC_QCALL; break; case 'P': if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL; else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2; @@ -213,12 +239,17 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: bcftools view [options] [reg]\n\n"); fprintf(stderr, "Options: -c SNP calling\n"); + fprintf(stderr, " -v output potential variant sites only (force -c)\n"); + fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); fprintf(stderr, " -b output BCF instead of VCF\n"); - fprintf(stderr, " -u uncompressed BCF output\n"); + fprintf(stderr, " -u uncompressed BCF output (force -b)\n"); fprintf(stderr, " -S input is VCF\n"); + fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); fprintf(stderr, " -G suppress all individual genotype information\n"); - fprintf(stderr, " -L discard the PL genotype field\n"); - fprintf(stderr, " -v output potential variant sites only\n"); + fprintf(stderr, " -H perform Hardy-Weinberg test (slower)\n"); + fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); + fprintf(stderr, " -Q output the QCALL likelihood format\n"); + fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); fprintf(stderr, " -l FILE list of sites to output [all sites]\n"); fprintf(stderr, " -t FLOAT scaled mutation rate [%.4lg]\n", vc.theta); fprintf(stderr, " -p FLOAT variant if P(ref|D)n_smpl); if (vc.prior_file) { @@ -245,6 +276,10 @@ int bcfview(int argc, char *argv[]) return 1; } } else bcf_p1_init_prior(p1, vc.prior_type, vc.theta); + if (vc.n1 > 0) { + bcf_p1_set_n1(p1, vc.n1); + bcf_p1_init_subprior(p1, vc.prior_type, vc.theta); + } } if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h); if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) { @@ -254,17 +289,29 @@ int bcfview(int argc, char *argv[]) idx = bcf_idx_load(argv[optind]); if (idx) { uint64_t off; - off = bcf_idx_query(idx, tid, begin, end); + off = bcf_idx_query(idx, tid, begin); + if (off == 0) { + fprintf(stderr, "[%s] no records in the query region.\n", __func__); + return 1; // FIXME: a lot of memory leaks... + } bgzf_seek(bp->fp, off, SEEK_SET); bcf_idx_destroy(idx); } } } while (vcf_read(bp, h, b) > 0) { + if (vc.flag & VC_ACGT_ONLY) { + int x; + if (b->ref[0] == 0 || b->ref[1] != 0) continue; + x = toupper(b->ref[0]); + if (x != 'A' && x != 'C' && x != 'G' && x != 'T') continue; + } if (hash) { uint64_t x = (uint64_t)b->tid<<32 | b->pos; khint_t k = kh_get(set64, hash, x); + if (kh_size(hash) == 0) break; if (k == kh_end(hash)) continue; + kh_del(set64, hash, k); } if (tid >= 0) { int l = strlen(b->ref); @@ -272,19 +319,28 @@ int bcfview(int argc, char *argv[]) if (b->tid != tid || b->pos >= end) break; if (!(l > begin && end > b->pos)) continue; } - if (vc.flag & VC_CALL) { + ++n_processed; + if (vc.flag & VC_QCALL) { // output QCALL format; STOP here + bcf_2qcall(h, b); + continue; + } + if (vc.flag & VC_CALL) { // call variants bcf_p1rst_t pr; - bcf_p1_cal(b, p1, &pr); - if ((n_processed + 1) % 50000 == 0) bcf_p1_dump_afs(p1); + bcf_gl2pl(b); + bcf_p1_cal(b, p1, &pr); // pr.g[3] is not calculated here + if (vc.flag&VC_HWE) bcf_p1_cal_g3(p1, pr.g); + if (n_processed % 100000 == 0) { + fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed); + bcf_p1_dump_afs(p1); + } if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue; update_bcf1(h->n_smpl, b, p1, &pr, vc.pref, vc.flag); } - if (vc.flag & VC_NO_GENO) { + if (vc.flag & VC_NO_GENO) { // do not output GENO fields b->n_gi = 0; b->fmt[0] = '\0'; } vcf_write(bout, h, b); - ++n_processed; } if (vc.prior_file) free(vc.prior_file); if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1);