# Entry point: require a sub-command name, then dispatch to its handler.
&usage if (@ARGV < 1);
my $command = shift(@ARGV); # what remains in @ARGV is left for the sub-command to parse
# Dispatch table: sub-command name => reference to the sub implementing it.
my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
- hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf);
+ hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&filter4vcf, ldstats=>\&ldstats);
die("Unknown command \"$command\".\n") if (!defined($func{$command})); # names missing from the table are rejected
&{$func{$command}}; # '&' call without parentheses: the handler sees the current @_
}
print;
} else {
my @t = split;
- my @c;
+ my @c = (0);
my $n = 0;
- $c[1] = 0;
+ my $s = -1;
+ @_ = split(":", $t[8]);
+ for (0 .. $#_) {
+ if ($_[$_] eq 'GT') { $s = $_; last; }
+ }
+ if ($s < 0) {
+ print join("\t", @t), "\n";
+ next;
+ }
for (9 .. $#t) {
- if ($t[$_] =~ /^(\d+).(\d+)/) {
- ++$c[$1]; ++$c[$2];
+ if ($t[$_] =~ /^0,0,0/) {
+ } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
+ ++$c[$2]; ++$c[$3];
$n += 2;
}
}
}
}
+# ldstats: summarise how the NEIR value of a record varies with its distance
+# to the preceding record on the same chromosome.
+#
+# Options (read from @ARGV by getopts):
+#   -s FLOAT  step between output rows, as a fraction of the collected pairs [0.01]
+#   -p        also print every sorted (distance, NEIR, position) triple to STDERR
+#
+# Input:  VCF text from the files named in @ARGV, or from STDIN; lines
+#         starting with '#' are skipped.
+# Output: one row per bin on STDOUT: number of pairs in the bin, their mean
+#         distance and their mean NEIR.
+# Dies with a usage message when no file is given and STDIN is a terminal.
+sub ldstats {
+    my %opts = (s=>0.01);
+    getopts('ps:', \%opts);
+    # -p is accepted by getopts above, so the usage line advertises it as well.
+    die("Usage: vcfutils.pl ldstats [-p] [-s $opts{s}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
+    my ($lastchr, $lastpos) = ('', 0);
+    my @a; # one element per pair: [distance to previous record, NEIR, position]
+    my $is_print = defined($opts{p})? 1 : 0;
+    while (<>) {
+        next if (/^#/);
+        my @t = split;
+        if ($t[0] ne $lastchr) {
+            # First record seen on this chromosome: nothing to measure against.
+            $lastchr = $t[0];
+        } elsif (/NEIR=([\d\.]+)/) {
+            push(@a, [$t[1] - $lastpos, $1, $t[1]]);
+        }
+        $lastpos = $t[1]; # updated for every record, with or without NEIR
+    }
+    my $max = 1000000000;
+    push(@a, [$max, 0, 0]); # end marker
+    @a = sort {$a->[0]<=>$b->[0]} @a;
+    my $next = $opts{s};
+    # $a[0] is an array reference; the value compared below is its distance field.
+    my $last = $a[0][0];
+    # $c[0]: pairs consumed so far; $c[1..3]: count, distance sum and NEIR sum
+    # of the bin currently being filled.
+    my @c = (0, 0, 0, 0);
+    for my $p (@a) {
+        print STDERR "$p->[0]\t$p->[1]\t$p->[2]\n" if ($is_print);
+        # Close the bin at the end marker, or once the consumed fraction passes
+        # the next threshold; never between two pairs with the same distance.
+        if ($p->[0] == $max || ($p->[0] != $last && $c[0]/@a > $next)) {
+            # An empty bin has nothing to average (e.g. the input had no NEIR
+            # values, so the end marker is the only element); dividing by its
+            # zero count would abort the program.
+            if ($c[1] > 0) {
+                printf("%d\t%.2f\t%.4f\n", $c[1], $c[2]/$c[1], $c[3]/$c[1]);
+            }
+            $c[1] = $c[2] = $c[3] = 0;
+            $next = $c[0]/@a + $opts{s};
+        }
+        ++$c[0]; ++$c[1]; $c[2] += $p->[0]; $c[3] += $p->[1];
+        $last = $p->[0];
+    }
+}
+
sub qstats {
my %opts = (r=>'', s=>0.02, v=>undef);
getopts('r:s:v', \%opts);
}
}
+# filter4vcf: copy a VCF stream to STDOUT, dropping records that fail any of
+# the bias, depth, mapping-quality or QUAL thresholds below.
+#
+# Options (read from @ARGV by getopts; defaults are in %opts):
+#   -d/-D  minimum / maximum total depth, taken from DP or, if present, DP4
+#   -q     minimum value of the QUAL column
+#   -Q     minimum MQ
+#   -1..-4 minimum value for each of the four PV4 fields
+#
+# Input:  VCF text from the files named in @ARGV, or from STDIN.
+# Output: header lines (starting with '#') unchanged, then every record that
+#         passes all filters, unchanged.
+# Dies with a usage message when no file is given and STDIN is a terminal.
+sub filter4vcf {
+    my %opts = (d=>3, D=>2000, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, Q=>10, q=>3);
+    getopts('d:D:1:2:3:4:Q:q:', \%opts);
+    die(qq/
+Usage: vcfutils.pl filter4vcf [options] <in.vcf>
+
+Options: -d INT min total depth (given DP or DP4) [$opts{d}]
+ -D INT max total depth [$opts{D}]
+ -q INT min SNP quality [$opts{q}]
+ -Q INT min RMS mapQ (given MQ) [$opts{Q}]
+ -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
+ -2 FLOAT min P-value for baseQ bias [$opts{2}]
+ -3 FLOAT min P-value for mapQ bias [$opts{3}]
+ -4 FLOAT min P-value for end distance bias [$opts{4}]\n
+/) if (@ARGV == 0 && -t STDIN);
+
+    while (<>) {
+        if (/^#/) {
+            print;
+            next;
+        }
+        # \b keeps each key from matching the tail of a longer key name.
+        next if (/\bPV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
+        my $depth = -1; # -1: the record carries no depth information
+        $depth = $1 if (/\bDP=(\d+)/);
+        $depth = $1+$2+$3+$4 if (/\bDP4=(\d+),(\d+),(\d+),(\d+)/); # DP4 wins over DP when both are present
+        # Any reported depth is checked, including 0; only the -1 sentinel
+        # skips the depth filter.
+        next if ($depth >= 0 && ($depth < $opts{d} || $depth > $opts{D}));
+        next if (/\bMQ=(\d+)/ && $1 < $opts{Q});
+        my @t = split;
+        next if ($t[5] >= 0 && $t[5] < $opts{q}); # column 6 is QUAL; negative values are not filtered
+        print;
+    }
+}
+
sub ucscsnp2vcf {
die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
print "##fileformat=VCFv4.0\n";
fillac fill the allele count field
qstats SNP stats stratified by QUAL
varFilter filtering short variants
+ filter4vcf filtering VCFs produced by samtools+bcftools
hapmap2vcf convert the hapmap format to VCF
ucscsnp2vcf convert UCSC SNP SQL dump to VCF
\n/);