my $command = shift(@ARGV);
my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
- gapstats=>\&gapstats, splitchr=>\&splitchr);
+ gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq);
die("Unknown command \"$command\".\n") if (!defined($func{$command}));
&{$func{$command}};
}
print;
} else {
my @t = split;
- my @c = (0);
+ my @c = (0, 0);
my $n = 0;
my $s = -1;
@_ = split(":", $t[8]);
}
sub varFilter {
- my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4);
- getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts);
+ my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4);
+ getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts);
die(qq/
Usage: vcfutils.pl varFilter [options] <in.vcf>
-2 FLOAT min P-value for baseQ bias [$opts{2}]
-3 FLOAT min P-value for mapQ bias [$opts{3}]
-4 FLOAT min P-value for end distance bias [$opts{4}]
+ -e FLOAT min P-value for HWE (plus F<0) [$opts{e}]
-p print filtered variants
Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
print; next;
}
next if ($t[4] eq '.'); # skip non-var sites
+ next if ($t[3] eq 'N'); # skip sites with unknown ref ('N')
# check if the site is a SNP
my $type = 1; # SNP
if (length($t[3]) > 1) {
$flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
$flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
&& ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
+ $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S})));
+ # HWE filter
+ if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) {
+ my $p = 2*$1 + $2;
+ my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0;
+ $flt = 9 if ($f < 0);
+ }
my $score = $t[5] * 100 + $dp_alt;
my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
} else { # SNP or MNP
for my $x (@staging) {
next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
- $flt = 5;
+ if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1)
+ && length($x->[7]) - length($x->[6]) == 1) {
+ $x->[1] = 5;
+ } else { $flt = 5; }
last;
}
# check MNP
if ($first->[1] == 0) {
print join("\t", @$first[3 .. @$first-1]), "\n";
} elsif ($is_print) {
- print STDERR join("\t", substr("UQdDaGgPM", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
+ print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
}
}
}
}
+sub vcf2fq {
+ my %opts = (d=>3, D=>100000, Q=>10, l=>5);
+ getopts('d:D:Q:l:', \%opts);
+ die(qq/
+Usage: vcfutils.pl vcf2fq [options] <all-site.vcf>
+
+Options: -d INT minimum depth [$opts{d}]
+ -D INT maximum depth [$opts{D}]
+ -Q INT min RMS mapQ [$opts{Q}]
+ -l INT INDEL filtering window [$opts{l}]
+\n/) if (@ARGV == 0 && -t STDIN);
+
+ my ($last_chr, $seq, $qual, $last_pos, @gaps);
+ my $_Q = $opts{Q};
+ my $_d = $opts{d};
+ my $_D = $opts{D};
+
+ my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
+ GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K');
+
+ $last_chr = '';
+ while (<>) {
+ next if (/^#/);
+ my @t = split;
+ if ($last_chr ne $t[0]) {
+ &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
+ ($last_chr, $last_pos) = ($t[0], 0);
+ $seq = $qual = '';
+ @gaps = ();
+ }
+ die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);
+ if ($t[1] - $last_pos > 1) {
+ $seq .= 'n' x ($t[1] - $last_pos - 1);
+ $qual .= '!' x ($t[1] - $last_pos - 1);
+ }
+ if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
+ my ($ref, $alt) = ($t[3], $1);
+ my ($b, $q);
+ $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/);
+ if ($q < 0) {
+ $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0;
+ $b = ($_ < .5 || $alt eq '.')? $ref : $alt;
+ $q = -$q;
+ } else {
+ $b = $het{"$ref$alt"};
+ $b ||= 'N';
+ }
+ $b = lc($b);
+ $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D));
+ $q = int($q + 33 + .499);
+ $q = chr($q <= 126? $q : 126);
+ $seq .= $b;
+ $qual .= $q;
+ } elsif ($t[4] ne '.') { # an INDEL
+ push(@gaps, [$t[1], length($t[3])]);
+ }
+ $last_pos = $t[1];
+ }
+ &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l});
+}
+
+sub v2q_post_process {
+ my ($chr, $seq, $qual, $gaps, $l) = @_;
+ for my $g (@$gaps) {
+ my $beg = $g->[0] > $l? $g->[0] - $l : 0;
+ my $end = $g->[0] + $g->[1] + $l;
+ $end = length($$seq) if ($end > length($$seq));
+ substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg));
+ }
+ print "\@$chr\n"; &v2q_print_str($seq);
+ print "+\n"; &v2q_print_str($qual);
+}
+
+sub v2q_print_str {
+ my ($s) = @_;
+ my $l = length($$s);
+ for (my $i = 0; $i < $l; $i += 60) {
+ print substr($$s, $i, 60), "\n";
+ }
+}
+
sub usage {
die(qq/
Usage: vcfutils.pl <command> [<arguments>]\n
listsam list the samples
fillac fill the allele count field
qstats SNP stats stratified by QUAL
- varFilter filtering short variants
+
hapmap2vcf convert the hapmap format to VCF
ucscsnp2vcf convert UCSC SNP SQL dump to VCF
+
+ varFilter filtering short variants (*)
+ vcf2fq VCF->fastq (**)
+
+Notes: Commands with description endting with (*) may need bcftools
+ specific annotations.
\n/);
}