X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bcftools%2Fvcfutils.pl;h=c1177eacb9603cc2cb9f93ab27d54b5add1e0779;hb=165f1d8cb88f04a48b7da3f4e5e5fd2a4f074972;hp=0cf73a30d640f6e0747e2cc4b9af8e4bbfb815a7;hpb=e6091454768385d722644774042e822658d20713;p=samtools.git diff --git a/bcftools/vcfutils.pl b/bcftools/vcfutils.pl index 0cf73a3..c1177ea 100755 --- a/bcftools/vcfutils.pl +++ b/bcftools/vcfutils.pl @@ -10,11 +10,10 @@ use Getopt::Std; exit; sub main { - my $version = '0.1.0'; &usage if (@ARGV < 1); my $command = shift(@ARGV); my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter, - hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&filter4vcf, ldstats=>\&ldstats); + hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; } @@ -199,8 +198,8 @@ Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions # } sub varFilter { - my %opts = (d=>1, D=>10000, a=>2, l=>10, Q=>10, s=>100, w=>10, p=>undef, F=>.001); - getopts('pd:D:l:Q:w:G:F:a:', \%opts); + my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4); + getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts); die(qq/ Usage: vcfutils.pl varFilter [options] @@ -209,19 +208,28 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] -D INT maximum read depth [$opts{D}] -a INT minimum number of alternate bases [$opts{a}] -w INT SNP within INT bp around a gap to be filtered [$opts{w}] - -l INT window size for filtering adjacent gaps [$opts{l}] + -W INT window size for filtering adjacent gaps [$opts{W}] + -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}] + -2 FLOAT min P-value for baseQ bias [$opts{2}] + -3 FLOAT min P-value for mapQ bias [$opts{3}] + -4 FLOAT min P-value for end distance bias [$opts{4}] -p print filtered variants + +Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. \n/) if (@ARGV == 0 && -t STDIN); # calculate the window size - my ($ol, $ow) = ($opts{l}, $opts{w}); + my ($ol, $ow) = ($opts{W}, $opts{w}); my $max_dist = $ol > $ow? $ol : $ow; # the core loop my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...) while (<>) { my @t = split; - next if (/^#/); + if (/^#/) { + print; next; + } next if ($t[4] eq '.'); # skip non-var sites + # check if the site is a SNP my $is_snp = 1; if (length($t[3]) > 1) { $is_snp = 0; @@ -238,7 +246,6 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much } my $flt = 0; - # parse annotations my ($dp, $mq, $dp_alt) = (-1, -1, -1); if ($t[7] =~ /DP=(\d+)/i) { @@ -258,6 +265,8 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] } $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a}); $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q}); + $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ + && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); # site dependent filters my ($rlen, $indel_score) = (0, -1); # $indel_score<0 for SNPs @@ -300,46 +309,7 @@ sub varFilter_aux { if ($first->[1] == 0) { print join("\t", @$first[3 .. @$first-1]), "\n"; } elsif ($is_print) { - print STDERR join("\t", substr("UQdDaGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; - } -} - -sub filter4vcf { - my %opts = (d=>3, D=>2000, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, Q=>10, q=>3); - getopts('d:D:1:2:3:4:Q:q:', \%opts); - die(qq/ -Usage: vcfutils.pl filter4vcf [options] - -Options: -d INT min total depth (given DP or DP4) [$opts{d}] - -D INT max total depth [$opts{D}] - -q INT min SNP quality [$opts{q}] - -Q INT min RMS mapQ (given MQ) [$opts{Q}] - -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}] - -2 FLOAT min P-value for baseQ bias [$opts{2}] - -3 FLOAT min P-value for mapQ bias [$opts{3}] - -4 FLOAT min P-value for end distance bias [$opts{4}]\n -/) if (@ARGV == 0 && -t STDIN); - - my %ts = (AG=>1, GA=>1, CT=>1, TC=>1); - - my @n = (0, 0); - while (<>) { - if (/^#/) { - print; - next; - } - next if (/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); - my $depth = -1; - $depth = $1 if (/DP=(\d+)/); - $depth = $1+$2+$3+$4 if (/DP4=(\d+),(\d+),(\d+),(\d+)/); - next if ($depth > 0 && ($depth < $opts{d} || $depth > $opts{D})); - next if (/MQ=(\d+)/ && $1 < $opts{Q}); - my @t = split; - next if ($t[5] >= 0 && $t[5] < $opts{q}); - ++$n[0]; - my @s = split(',', $t[4]); - ++$n[1] if ($ts{$t[3].$s[0]}); - print; + print STDERR join("\t", substr("UQdDaGgP", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; } } @@ -436,7 +406,6 @@ Command: subsam get a subset of samples fillac fill the allele count field qstats SNP stats stratified by QUAL varFilter filtering short variants - filter4vcf filtering VCFs produced by samtools+bcftools hapmap2vcf convert the hapmap format to VCF ucscsnp2vcf convert UCSC SNP SQL dump to VCF \n/);