exit;
sub main {
- my $version = '0.1.0';
&usage if (@ARGV < 1);
my $command = shift(@ARGV);
my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
- hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&filter4vcf, ldstats=>\&ldstats);
+ hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
+ gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq);
die("Unknown command \"$command\".\n") if (!defined($func{$command}));
&{$func{$command}};
}
+sub splitchr {
+ my %opts = (l=>5000000);
+ getopts('l:', \%opts);
+ my $l = $opts{l};
+ die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN);
+ while (<>) {
+ my @t = split;
+ my $last = 0;
+ for (my $i = 0; $i < $t[1];) {
+ my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l;
+ print "$t[0]:".($i+1)."-$e\n";
+ $i = $e;
+ }
+ }
+}
+
sub subsam {
die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
my ($fh, %h);
print;
} else {
my @t = split;
- my @c = (0);
+ my @c = (0, 0);
my $n = 0;
my $s = -1;
@_ = split(":", $t[8]);
next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
$t[3] = uc($t[3]); $t[4] = uc($t[4]);
my @s = split(',', $t[4]);
- $t[5] = 3 if ($t[5] < 0);
+ $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
next if (length($s[0]) != 1);
my $hit;
if ($is_vcf) {
}
sub varFilter {
- my %opts = (d=>1, D=>10000, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, F=>.001);
- getopts('pq:d:D:l:Q:w:W:N:G:F:', \%opts);
+ my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4);
+ getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts);
die(qq/
Usage: vcfutils.pl varFilter [options] <in.vcf>
Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
- -q INT minimum RMS mapping quality for gaps [$opts{q}]
-d INT minimum read depth [$opts{d}]
-D INT maximum read depth [$opts{D}]
-
- -G INT min indel score for nearby SNP filtering [$opts{G}]
+ -a INT minimum number of alternate bases [$opts{a}]
-w INT SNP within INT bp around a gap to be filtered [$opts{w}]
-
- -W INT window size for filtering dense SNPs [$opts{W}]
- -N INT max number of SNPs in a window [$opts{N}]
-
- -l INT window size for filtering adjacent gaps [$opts{l}]
-
+ -W INT window size for filtering adjacent gaps [$opts{W}]
+ -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
+ -2 FLOAT min P-value for baseQ bias [$opts{2}]
+ -3 FLOAT min P-value for mapQ bias [$opts{3}]
+ -4 FLOAT min P-value for end distance bias [$opts{4}]
+ -e FLOAT min P-value for HWE (plus F<0) [$opts{e}]
-p print filtered variants
+
+Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
\n/) if (@ARGV == 0 && -t STDIN);
# calculate the window size
- my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W});
+ my ($ol, $ow) = ($opts{W}, $opts{w});
my $max_dist = $ol > $ow? $ol : $ow;
- $max_dist = $oW if ($max_dist < $oW);
# the core loop
- my @staging; # (indel_filtering_score, flt_tag)
+ my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
while (<>) {
my @t = split;
- next if (/^#/);
+ if (/^#/) {
+ print; next;
+ }
next if ($t[4] eq '.'); # skip non-var sites
- my $is_snp = 1;
+ next if ($t[3] eq 'N'); # skip sites with unknown ref ('N')
+ # check if the site is a SNP
+ my $type = 1; # SNP
if (length($t[3]) > 1) {
- $is_snp = 0;
+ $type = 2; # MNP
+ my @s = split(',', $t[4]);
+ for (@s) {
+ $type = 3 if (length != length($t[3]));
+ }
} else {
my @s = split(',', $t[4]);
for (@s) {
- $is_snp = 0 if (length > 1);
+ $type = 3 if (length > 1);
}
}
# clear the out-of-range elements
last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
}
- my ($flt, $score) = (0, -1);
-
- # collect key annotations
- my ($dp, $mq, $af) = (-1, -1, 1);
- if ($t[7] =~ /DP=(\d+)/i) {
- $dp = $1;
- } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
+ my $flt = 0;
+ # parse annotations
+ my ($dp, $mq, $dp_alt) = (-1, -1, -1);
+ if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
$dp = $1 + $2 + $3 + $4;
+ $dp_alt = $3 + $4;
}
- if ($t[7] =~ /MQ=(\d+)/i) {
- $mq = $1;
- }
- if ($t[7] =~ /AF=([^\s;=]+)/i) {
- $af = $1;
- } elsif ($t[7] =~ /AF1=([^\s;=]+)/i) {
- $af = $1;
+ if ($t[7] =~ /DP=(\d+)/i) {
+ $dp = $1;
}
- # the depth filter
+ $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
+ # the depth and mapQ filter
if ($dp >= 0) {
if ($dp < $opts{d}) {
$flt = 2;
$flt = 3;
}
}
+ $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
+ $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
+ $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
+ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
+ $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S})));
+ # HWE filter
+ if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) {
+ my $p = 2*$1 + $2;
+ my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0;
+ $flt = 9 if ($f < 0);
+ }
- # site dependent filters
- my $dlen = 0;
+ my $score = $t[5] * 100 + $dp_alt;
+ my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
if ($flt == 0) {
- if (!$is_snp) { # an indel
- # If deletion, remember the length of the deletion
- $dlen = length($t[3]) - 1;
- $flt = 1 if ($mq < $opts{q});
- # filtering SNPs
- if ($t[5] >= $opts{G}) {
- for my $x (@staging) {
- # Is it a SNP and is it outside the SNP filter window?
- next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]);
- $x->[1] = 5 if ($x->[1] == 0);
- }
+ if ($type == 3) { # an indel
+ # filtering SNPs and MNPs
+ for my $x (@staging) {
+ next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
+ $x->[1] = 5;
}
- # the indel filtering score
- $score = $t[5];
# check the staging list for indel filtering
for my $x (@staging) {
- # Is it a SNP and is it outside the gap filter window
- next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]);
- if ($x->[0] < $score) {
+ next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
+ if ($x->[0]>>2 < $score) {
$x->[1] = 6;
} else {
$flt = 6; last;
}
}
- } else { # a SNP
- $flt = 1 if ($mq < $opts{Q});
- # check adjacent SNPs
- my $k = 1;
+ } else { # SNP or MNP
for my $x (@staging) {
- ++$k if ($x->[0] < 0 && -($x->[0] + 1) > $opts{F} && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
+ next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
+ if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1)
+ && length($x->[7]) - length($x->[6]) == 1) {
+ $x->[1] = 5;
+ } else { $flt = 5; }
+ last;
}
- # filtering is necessary
- if ($k > $opts{N}) {
- $flt = 4;
- for my $x (@staging) {
- $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0);
- }
- } else { # then check gap filter
- for my $x (@staging) {
- next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]);
- if ($x->[0] >= $opts{G}) {
- $flt = 5; last;
- }
+ # check MNP
+ for my $x (@staging) {
+ next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
+ if ($x->[0]>>2 < $score) {
+ $x->[1] = 8;
+ } else {
+ $flt = 8; last;
}
}
}
}
- push(@staging, [$score < 0? -$af-1 : $score, $flt, $dlen, @t]);
+ push(@staging, [$score<<2|$type, $flt, $rlen, @t]);
}
# output the last few elements in the staging list
while (@staging) {
if ($first->[1] == 0) {
print join("\t", @$first[3 .. @$first-1]), "\n";
} elsif ($is_print) {
- print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
+ print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
}
}
-sub filter4vcf {
- my %opts = (d=>3, D=>2000, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, Q=>10, q=>3);
- getopts('d:D:1:2:3:4:Q:q:', \%opts);
- die(qq/
-Usage: vcfutils.pl filter4vcf [options] <in.vcf>
-
-Options: -d INT min total depth (given DP or DP4) [$opts{d}]
- -D INT max total depth [$opts{D}]
- -q INT min SNP quality [$opts{q}]
- -Q INT min RMS mapQ (given MQ) [$opts{Q}]
- -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
- -2 FLOAT min P-value for baseQ bias [$opts{2}]
- -3 FLOAT min P-value for mapQ bias [$opts{3}]
- -4 FLOAT min P-value for end distance bias [$opts{4}]\n
-/) if (@ARGV == 0 && -t STDIN);
-
- my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
-
- my @n = (0, 0);
+sub gapstats {
+ my (@c0, @c1);
+ $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
while (<>) {
- if (/^#/) {
- print;
- next;
- }
- next if (/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
- my $depth = -1;
- $depth = $1 if (/DP=(\d+)/);
- $depth = $1+$2+$3+$4 if (/DP4=(\d+),(\d+),(\d+),(\d+)/);
- next if ($depth > 0 && ($depth < $opts{d} || $depth > $opts{D}));
- next if (/MQ=(\d+)/ && $1 < $opts{Q});
+ next if (/^#/);
my @t = split;
- next if ($t[5] >= 0 && $t[5] < $opts{q});
- ++$n[0];
+ next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
my @s = split(',', $t[4]);
- ++$n[1] if ($ts{$t[3].$s[0]});
- print;
+ for my $x (@s) {
+ my $l = length($x) - length($t[3]) + 5000;
+ if ($x =~ /^-/) {
+ $l = -(length($x) - 1) + 5000;
+ } elsif ($x =~ /^\+/) {
+ $l = length($x) - 1 + 5000;
+ }
+ $c0[$l] += 1 / @s;
+ }
+ }
+ for (my $i = 0; $i < 10000; ++$i) {
+ next if ($c0[$i] == 0);
+ $c1[0] += $c0[$i];
+ $c1[1] += $c0[$i] if (($i-5000)%3 == 0);
+ printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
}
+ printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]);
}
sub ucscsnp2vcf {
}
}
+sub vcf2fq {
+ my %opts = (d=>3, D=>100000, Q=>10, l=>5);
+ getopts('d:D:Q:l:', \%opts);
+ die(qq/
+Usage: vcfutils.pl vcf2fq [options] <all-site.vcf>
+
+Options: -d INT minimum depth [$opts{d}]
+ -D INT maximum depth [$opts{D}]
+ -Q INT min RMS mapQ [$opts{Q}]
+ -l INT INDEL filtering window [$opts{l}]
+\n/) if (@ARGV == 0 && -t STDIN);
+
+ my ($last_chr, $seq, $qual, $last_pos, @gaps);
+ my $_Q = $opts{Q};
+ my $_d = $opts{d};
+ my $_D = $opts{D};
+
+ my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
+ GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K');
+
+ $last_chr = '';
+ while (<>) {
+ next if (/^#/);
+ my @t = split;
+ if ($last_chr ne $t[0]) {
+ &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
+ ($last_chr, $last_pos) = ($t[0], 0);
+ $seq = $qual = '';
+ @gaps = ();
+ }
+ die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);
+ if ($t[1] - $last_pos > 1) {
+ $seq .= 'n' x ($t[1] - $last_pos - 1);
+ $qual .= '!' x ($t[1] - $last_pos - 1);
+ }
+ if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
+ my ($ref, $alt) = ($t[3], $1);
+ my ($b, $q);
+ $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/);
+ if ($q < 0) {
+ $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0;
+ $b = ($_ < .5 || $alt eq '.')? $ref : $alt;
+ $q = -$q;
+ } else {
+ $b = $het{"$ref$alt"};
+ $b ||= 'N';
+ }
+ $b = lc($b);
+ $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D));
+ $q = int($q + 33 + .499);
+ $q = chr($q <= 126? $q : 126);
+ $seq .= $b;
+ $qual .= $q;
+ } elsif ($t[4] ne '.') { # an INDEL
+ push(@gaps, [$t[1], length($t[3])]);
+ }
+ $last_pos = $t[1];
+ }
+ &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l});
+}
+
+sub v2q_post_process {
+ my ($chr, $seq, $qual, $gaps, $l) = @_;
+ for my $g (@$gaps) {
+ my $beg = $g->[0] > $l? $g->[0] - $l : 0;
+ my $end = $g->[0] + $g->[1] + $l;
+ $end = length($$seq) if ($end > length($$seq));
+ substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg));
+ }
+ print "\@$chr\n"; &v2q_print_str($seq);
+ print "+\n"; &v2q_print_str($qual);
+}
+
+sub v2q_print_str {
+ my ($s) = @_;
+ my $l = length($$s);
+ for (my $i = 0; $i < $l; $i += 60) {
+ print substr($$s, $i, 60), "\n";
+ }
+}
+
sub usage {
die(qq/
Usage: vcfutils.pl <command> [<arguments>]\n
listsam list the samples
fillac fill the allele count field
qstats SNP stats stratified by QUAL
- varFilter filtering short variants
- filter4vcf filtering VCFs produced by samtools+bcftools
+
hapmap2vcf convert the hapmap format to VCF
ucscsnp2vcf convert UCSC SNP SQL dump to VCF
+
+ varFilter filtering short variants (*)
+ vcf2fq VCF->fastq (**)
+
+Notes: Commands with description endting with (*) may need bcftools
+ specific annotations.
\n/);
}