exit;
sub main {
- my $version = '0.1.0';
&usage if (@ARGV < 1);
my $command = shift(@ARGV);
my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
- hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&filter4vcf, ldstats=>\&ldstats);
+ hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats);
die("Unknown command \"$command\".\n") if (!defined($func{$command}));
&{$func{$command}};
}
}
sub varFilter {
- my %opts = (d=>1, D=>10000, a=>2, l=>10, Q=>10, s=>100, w=>10, p=>undef, F=>.001);
- getopts('pd:D:l:Q:w:G:F:a:', \%opts);
+ my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4);
+ getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts);
die(qq/
Usage: vcfutils.pl varFilter [options] <in.vcf>
-D INT maximum read depth [$opts{D}]
-a INT minimum number of alternate bases [$opts{a}]
-w INT SNP within INT bp around a gap to be filtered [$opts{w}]
- -l INT window size for filtering adjacent gaps [$opts{l}]
+ -W INT window size for filtering adjacent gaps [$opts{W}]
+ -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
+ -2 FLOAT min P-value for baseQ bias [$opts{2}]
+ -3 FLOAT min P-value for mapQ bias [$opts{3}]
+ -4 FLOAT min P-value for end distance bias [$opts{4}]
-p print filtered variants
+
+Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
\n/) if (@ARGV == 0 && -t STDIN);
# calculate the window size
- my ($ol, $ow) = ($opts{l}, $opts{w});
+ my ($ol, $ow) = ($opts{W}, $opts{w});
my $max_dist = $ol > $ow? $ol : $ow;
# the core loop
my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
while (<>) {
my @t = split;
- next if (/^#/);
+ if (/^#/) {
+ print; next;
+ }
next if ($t[4] eq '.'); # skip non-var sites
+ # check if the site is a SNP
my $is_snp = 1;
if (length($t[3]) > 1) {
$is_snp = 0;
varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
}
my $flt = 0;
-
# parse annotations
my ($dp, $mq, $dp_alt) = (-1, -1, -1);
if ($t[7] =~ /DP=(\d+)/i) {
}
$flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
$flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
+ $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
+ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
# site dependent filters
my ($rlen, $indel_score) = (0, -1); # $indel_score<0 for SNPs
if ($first->[1] == 0) {
print join("\t", @$first[3 .. @$first-1]), "\n";
} elsif ($is_print) {
- print STDERR join("\t", substr("UQdDaGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
- }
-}
-
-sub filter4vcf {
- my %opts = (d=>3, D=>2000, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, Q=>10, q=>3);
- getopts('d:D:1:2:3:4:Q:q:', \%opts);
- die(qq/
-Usage: vcfutils.pl filter4vcf [options] <in.vcf>
-
-Options: -d INT min total depth (given DP or DP4) [$opts{d}]
- -D INT max total depth [$opts{D}]
- -q INT min SNP quality [$opts{q}]
- -Q INT min RMS mapQ (given MQ) [$opts{Q}]
- -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
- -2 FLOAT min P-value for baseQ bias [$opts{2}]
- -3 FLOAT min P-value for mapQ bias [$opts{3}]
- -4 FLOAT min P-value for end distance bias [$opts{4}]\n
-/) if (@ARGV == 0 && -t STDIN);
-
- my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
-
- my @n = (0, 0);
- while (<>) {
- if (/^#/) {
- print;
- next;
- }
- next if (/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
- my $depth = -1;
- $depth = $1 if (/DP=(\d+)/);
- $depth = $1+$2+$3+$4 if (/DP4=(\d+),(\d+),(\d+),(\d+)/);
- next if ($depth > 0 && ($depth < $opts{d} || $depth > $opts{D}));
- next if (/MQ=(\d+)/ && $1 < $opts{Q});
- my @t = split;
- next if ($t[5] >= 0 && $t[5] < $opts{q});
- ++$n[0];
- my @s = split(',', $t[4]);
- ++$n[1] if ($ts{$t[3].$s[0]});
- print;
+ print STDERR join("\t", substr("UQdDaGgP", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
}
}
fillac fill the allele count field
qstats SNP stats stratified by QUAL
varFilter filtering short variants
- filter4vcf filtering VCFs produced by samtools+bcftools
hapmap2vcf convert the hapmap format to VCF
ucscsnp2vcf convert UCSC SNP SQL dump to VCF
\n/);