X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bcftools%2Fcall1.c;h=5a5c1cd1aba3e1c7e15a00638ee468186d052a52;hb=e6091454768385d722644774042e822658d20713;hp=c722b355afd8752031044ef63f98e9eef7c2876e;hpb=3f13738c659d0b49fe7ac99145cdc9fa2f507af4;p=samtools.git diff --git a/bcftools/call1.c b/bcftools/call1.c index c722b35..5a5c1cd 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -13,7 +13,6 @@ KHASH_SET_INIT_INT64(set64) #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) -#define VC_NO_PL 1 #define VC_NO_GENO 2 #define VC_BCFOUT 4 #define VC_CALL 8 @@ -21,6 +20,11 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_VCFIN 32 #define VC_UNCOMP 64 #define VC_HWE 128 +#define VC_KEEPALT 256 +#define VC_ACGT_ONLY 512 +#define VC_QCALL 1024 +#define VC_CALL_GT 2048 +#define VC_ADJLD 4096 typedef struct { int flag, prior_type, n1; @@ -130,7 +134,7 @@ static int test16(bcf1_t *b, anno16_t *a) return test16_core(anno, a); } -static void rm_info(int n_smpl, bcf1_t *b, const char *key) +static void rm_info(bcf1_t *b, const char *key) { char *p, *q; if ((p = strstr(b->info, key)) == 0) return; @@ -138,10 +142,10 @@ static void rm_info(int n_smpl, bcf1_t *b, const char *key) if (p > b->info && *(p-1) == ';') --p; memmove(p, q, b->l_str - (q - b->str)); b->l_str -= q - p; - bcf_sync(n_smpl, b); + bcf_sync(b); } -static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref) +static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag) { kstring_t s; int is_var = (pr->p_ref < pref); @@ -150,35 +154,54 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p p_hwe = pr->g[0] >= 0.? test_hwe(pr->g) : 1.0; // only do HWE g[] is calculated test16(b, &a); - rm_info(n_smpl, b, "I16="); + rm_info(b, "I16="); memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); - if (is_var) { - kputs(b->alt, &s); - } - kputc('\0', &s); kputc('\0', &s); + kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); kputs(b->info, &s); if (b->info[0]) kputc(';', &s); - ksprintf(&s, "AF1=%.3lf;AFE=%.3lf", 1.-pr->f_em, 1.-pr->f_exp); +// ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih); + ksprintf(&s, "AF1=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, pr->cil, pr->cih); ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); - if (a.is_tested) ksprintf(&s, ";PV4=%.2lg,%.2lg,%.2lg,%.2lg", a.p[0], a.p[1], a.p[2], a.p[3]); + if (a.is_tested) { + if (pr->pc[0] >= 0.) ksprintf(&s, ";PC4=%lg,%lg,%lg,%lg", pr->pc[0], pr->pc[1], pr->pc[2], pr->pc[3]); + ksprintf(&s, ";PV4=%.2lg,%.2lg,%.2lg,%.2lg", a.p[0], a.p[1], a.p[2], a.p[3]); + } if (pr->g[0] >= 0. && p_hwe <= .2) ksprintf(&s, ";GC=%.2lf,%.2lf,%.2lf;HWE=%.3lf", pr->g[2], pr->g[1], pr->g[0], p_hwe); kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); free(b->str); b->m_str = s.m; b->l_str = s.l; b->str = s.s; - b->qual = r < 1e-100? 99 : -3.434 * log(r); + b->qual = r < 1e-100? 99 : -4.343 * log(r); if (b->qual > 99) b->qual = 99; - bcf_sync(n_smpl, b); + bcf_sync(b); + if (!is_var) bcf_shrink_alt(b, 1); + else if (!(flag&VC_KEEPALT)) + bcf_shrink_alt(b, pr->rank0 < 2? 2 : pr->rank0+1); + if (is_var && (flag&VC_CALL_GT)) { // call individual genotype + int i, x, old_n_gi = b->n_gi; + s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; + kputs(":GT:GQ", &s); kputc('\0', &s); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + for (i = 0; i < b->n_smpl; ++i) { + x = bcf_p1_call_gt(pa, pr->f_em, i); + ((uint8_t*)b->gi[old_n_gi].data)[i] = (x&3) == 0? 1<<3|1 : (x&3) == 1? 1 : 0; + ((uint8_t*)b->gi[old_n_gi+1].data)[i] = x>>2; + } + } return is_var; } +double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); + int bcfview(int argc, char *argv[]) { + extern int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b); bcf_t *bp, *bout = 0; - bcf1_t *b; + bcf1_t *b, *blast; int c; uint64_t n_processed = 0; viewconf_t vc; @@ -190,21 +213,25 @@ int bcfview(int argc, char *argv[]) tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); - vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.9; - while ((c = getopt(argc, argv, "1:l:cHGvLbSuP:t:p:")) >= 0) { + vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; + while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgL")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; case 'l': vc.fn_list = strdup(optarg); break; + case 'N': vc.flag |= VC_ACGT_ONLY; break; case 'G': vc.flag |= VC_NO_GENO; break; - case 'L': vc.flag |= VC_NO_PL; break; + case 'A': vc.flag |= VC_KEEPALT; break; case 'b': vc.flag |= VC_BCFOUT; break; case 'S': vc.flag |= VC_VCFIN; break; case 'c': vc.flag |= VC_CALL; break; - case 'v': vc.flag |= VC_VARONLY; break; + case 'v': vc.flag |= VC_VARONLY | VC_CALL; break; case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break; case 'H': vc.flag |= VC_HWE; break; + case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break; case 't': vc.theta = atof(optarg); break; case 'p': vc.pref = atof(optarg); break; + case 'Q': vc.flag |= VC_QCALL; break; + case 'L': vc.flag |= VC_ADJLD; break; case 'P': if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL; else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2; @@ -217,13 +244,18 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: bcftools view [options] [reg]\n\n"); fprintf(stderr, "Options: -c SNP calling\n"); + fprintf(stderr, " -v output potential variant sites only (force -c)\n"); + fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); fprintf(stderr, " -b output BCF instead of VCF\n"); - fprintf(stderr, " -u uncompressed BCF output\n"); + fprintf(stderr, " -u uncompressed BCF output (force -b)\n"); fprintf(stderr, " -S input is VCF\n"); + fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); fprintf(stderr, " -G suppress all individual genotype information\n"); - fprintf(stderr, " -L discard the PL genotype field\n"); fprintf(stderr, " -H perform Hardy-Weinberg test (slower)\n"); - fprintf(stderr, " -v output potential variant sites only\n"); + fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); + fprintf(stderr, " -Q output the QCALL likelihood format\n"); + fprintf(stderr, " -L calculate LD for adjacent sites\n"); + fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); fprintf(stderr, " -l FILE list of sites to output [all sites]\n"); fprintf(stderr, " -t FLOAT scaled mutation rate [%.4lg]\n", vc.theta); fprintf(stderr, " -p FLOAT variant if P(ref|D)n_smpl); if (vc.prior_file) { @@ -250,7 +283,10 @@ int bcfview(int argc, char *argv[]) return 1; } } else bcf_p1_init_prior(p1, vc.prior_type, vc.theta); - if (vc.n1 > 0) bcf_p1_set_n1(p1, vc.n1); + if (vc.n1 > 0) { + bcf_p1_set_n1(p1, vc.n1); + bcf_p1_init_subprior(p1, vc.prior_type, vc.theta); + } } if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h); if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) { @@ -260,17 +296,29 @@ int bcfview(int argc, char *argv[]) idx = bcf_idx_load(argv[optind]); if (idx) { uint64_t off; - off = bcf_idx_query(idx, tid, begin, end); + off = bcf_idx_query(idx, tid, begin); + if (off == 0) { + fprintf(stderr, "[%s] no records in the query region.\n", __func__); + return 1; // FIXME: a lot of memory leaks... + } bgzf_seek(bp->fp, off, SEEK_SET); bcf_idx_destroy(idx); } } } while (vcf_read(bp, h, b) > 0) { + if (vc.flag & VC_ACGT_ONLY) { + int x; + if (b->ref[0] == 0 || b->ref[1] != 0) continue; + x = toupper(b->ref[0]); + if (x != 'A' && x != 'C' && x != 'G' && x != 'T') continue; + } if (hash) { uint64_t x = (uint64_t)b->tid<<32 | b->pos; khint_t k = kh_get(set64, hash, x); + if (kh_size(hash) == 0) break; if (k == kh_end(hash)) continue; + kh_del(set64, hash, k); } if (tid >= 0) { int l = strlen(b->ref); @@ -278,29 +326,48 @@ int bcfview(int argc, char *argv[]) if (b->tid != tid || b->pos >= end) break; if (!(l > begin && end > b->pos)) continue; } - if (vc.flag & VC_CALL) { + ++n_processed; + if (vc.flag & VC_QCALL) { // output QCALL format; STOP here + bcf_2qcall(h, b); + continue; + } + if (vc.flag & (VC_CALL|VC_ADJLD)) bcf_gl2pl(b); + if (vc.flag & VC_CALL) { // call variants bcf_p1rst_t pr; bcf_p1_cal(b, p1, &pr); // pr.g[3] is not calculated here if (vc.flag&VC_HWE) bcf_p1_cal_g3(p1, pr.g); - if ((n_processed + 1) % 50000 == 0) bcf_p1_dump_afs(p1); + if (n_processed % 100000 == 0) { + fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed); + bcf_p1_dump_afs(p1); + } if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue; - update_bcf1(h->n_smpl, b, p1, &pr, vc.pref); + update_bcf1(h->n_smpl, b, p1, &pr, vc.pref, vc.flag); } - if (vc.flag & VC_NO_GENO) { + if (vc.flag & VC_ADJLD) { // compute LD + double f[4], r2; + if ((r2 = bcf_ld_freq(blast, b, f)) >= 0) { + kstring_t s; + s.m = s.l = 0; s.s = 0; + if (*b->info) kputc(';', &s); + ksprintf(&s, "NEIR=%.3lf;NEIF=%.3lf,%.3lf", r2, f[0]+f[2], f[0]+f[1]); + bcf_append_info(b, s.s, s.l); + free(s.s); + } + bcf_cpy(blast, b); + } + if (vc.flag & VC_NO_GENO) { // do not output GENO fields b->n_gi = 0; b->fmt[0] = '\0'; } vcf_write(bout, h, b); - ++n_processed; } if (vc.prior_file) free(vc.prior_file); if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1); bcf_hdr_destroy(h); - bcf_destroy(b); + bcf_destroy(b); bcf_destroy(blast); vcf_close(bp); vcf_close(bout); if (hash) kh_destroy(set64, hash); if (vc.fn_list) free(vc.fn_list); if (p1) bcf_p1_destroy(p1); return 0; } -