From 5f5ed2477797cbd8b0c3cdd16d8abbaa2319d300 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 12 Dec 2010 01:05:42 +0000 Subject: [PATCH] compute max SP and max GQ from sample genotypes --- bcftools/bcfutils.c | 47 ++++++++++++++++++++++++++++++++++++++++++++ bcftools/call1.c | 10 +++++++--- bcftools/vcfutils.pl | 7 ++++--- 3 files changed, 58 insertions(+), 6 deletions(-) diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c index 3c6ed73..e6a132b 100644 --- a/bcftools/bcfutils.c +++ b/bcftools/bcfutils.c @@ -1,4 +1,5 @@ #include +#include #include "bcf.h" #include "kstring.h" #include "khash.h" @@ -131,3 +132,49 @@ int bcf_fix_gt(bcf1_t *b) b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':'; return 0; } + +static void *locate_field(const bcf1_t *b, const char *fmt, int l) +{ + int i; + uint32_t tmp; + tmp = bcf_str2int(fmt, l); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + return i == b->n_gi? 0 : b->gi[i].data; +} + +int bcf_anno_max(bcf1_t *b) +{ + int k, max_gq, max_sp, n_het; + kstring_t str; + uint8_t *gt, *gq, *sp; + max_gq = max_sp = n_het = 0; + gt = locate_field(b, "GT", 2); + if (gt == 0) return -1; + gq = locate_field(b, "GQ", 2); + sp = locate_field(b, "SP", 2); + if (sp) + for (k = 0; k < b->n_smpl; ++k) + if (gt[k]&0x3f) + max_sp = max_sp > (int)sp[k]? max_sp : sp[k]; + if (gq) + for (k = 0; k < b->n_smpl; ++k) + if (gt[k]&0x3f) + max_gq = max_gq > (int)gq[k]? max_gq : gq[k]; + for (k = 0; k < b->n_smpl; ++k) { + int a1, a2; + a1 = gt[k]&7; a2 = gt[k]>>3&7; + if ((!a1 && a2) || (!a2 && a1)) { // a het + if (gq == 0) ++n_het; + else if (gq[k] >= 20) ++n_het; + } + } + if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499); + if (max_sp < 0) max_sp = 0; + memset(&str, 0, sizeof(kstring_t)); + if (*b->info) kputc(';', &str); + ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq); + bcf_append_info(b, str.s, str.l); + free(str.s); + return 0; +} diff --git a/bcftools/call1.c b/bcftools/call1.c index adf7a52..76eabf5 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -26,6 +26,7 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_CALL_GT 2048 #define VC_ADJLD 4096 #define VC_NO_INDEL 8192 +#define VC_ANNO_MAX 16384 typedef struct { int flag, prior_type, n1; @@ -235,6 +236,7 @@ int bcfview(int argc, char *argv[]) extern int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b); extern void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x); extern int bcf_fix_gt(bcf1_t *b); + extern int bcf_anno_max(bcf1_t *b); bcf_t *bp, *bout = 0; bcf1_t *b, *blast; int c; @@ -249,7 +251,7 @@ int bcfview(int argc, char *argv[]) tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; - while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:I")) >= 0) { + while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:IM")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; case 'l': vc.fn_list = strdup(optarg); break; @@ -264,6 +266,7 @@ int bcfview(int argc, char *argv[]) case 'H': vc.flag |= VC_HWE; break; case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break; case 'I': vc.flag |= VC_NO_INDEL; break; + case 'M': vc.flag |= VC_ANNO_MAX; break; case 't': vc.theta = atof(optarg); break; case 'p': vc.pref = atof(optarg); break; case 'i': vc.indel_frac = atof(optarg); break; @@ -400,11 +403,12 @@ int bcfview(int argc, char *argv[]) } bcf_cpy(blast, b); } + if (vc.flag & VC_ANNO_MAX) bcf_anno_max(b); if (vc.flag & VC_NO_GENO) { // do not output GENO fields b->n_gi = 0; b->fmt[0] = '\0'; - } - bcf_fix_gt(b); + b->l_str = b->fmt - b->str + 1; + } else bcf_fix_gt(b); vcf_write(bout, h, b); } if (vc.prior_file) free(vc.prior_file); diff --git a/bcftools/vcfutils.pl b/bcftools/vcfutils.pl index 63509e8..5df8b0c 100755 --- a/bcftools/vcfutils.pl +++ b/bcftools/vcfutils.pl @@ -215,8 +215,8 @@ Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions # } sub varFilter { - my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4); - getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts); + my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000); + getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:', \%opts); die(qq/ Usage: vcfutils.pl varFilter [options] @@ -289,6 +289,7 @@ Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q}); $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); + $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 >= $opts{G}) || (/MXSP=(\d+)/ && $1 < $opts{S}))); my $score = $t[5] * 100 + $dp_alt; my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs @@ -338,7 +339,7 @@ sub varFilter_aux { if ($first->[1] == 0) { print join("\t", @$first[3 .. @$first-1]), "\n"; } elsif ($is_print) { - print STDERR join("\t", substr("UQdDaGgPM", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; + print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; } } -- 2.39.2