13 my $version = '0.1.0';
14 &usage if (@ARGV < 1);
15 my $command = shift(@ARGV);
16 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
17 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&filter4vcf);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
25 my $fn = shift(@ARGV);
27 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
28 $h{$_} = 1 for (@ARGV);
34 my @s = @t[0..8]; # all fixed fields + FORMAT
41 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
42 print join("\t", @s), "\n";
46 print join("\t", @t[0..7]), "\n";
48 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
56 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
60 print join("\n", @t[9..$#t]), "\n";
67 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
76 @_ = split(":", $t[8]);
78 if ($_[$_] eq 'GT') { $s = $_; last; }
81 print join("\t", @t), "\n";
85 if ($t[$_] =~ /^0,0,0/) {
86 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
91 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
93 $info =~ s/(;?)AC=(\d+)//;
94 $info =~ s/(;?)AN=(\d+)//;
101 print join("\t", @t), "\n";
107 my %opts = (r=>'', s=>0.02, v=>undef);
108 getopts('r:s:v', \%opts);
109 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
110 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
111 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
113 my $is_vcf = defined($opts{v})? 1 : 0;
114 if ($opts{r}) { # read the reference positions
116 open($fh, $opts{r}) || die;
121 $h{$t[0],$t[1]} = $t[4];
123 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
128 my $hsize = scalar(keys %h);
133 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
134 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
135 my @s = split(',', $t[4]);
136 $t[5] = 3 if ($t[5] < 0);
137 next if (length($s[0]) != 1);
141 my $aa = $h{$t[0],$t[1]};
143 my @aaa = split(",", $aa);
145 $hit = 1 if ($_ eq $s[0]);
149 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
151 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
153 push(@a, [-1, 0, 0, 0]); # end marker
154 die("[qstats] No SNP data!\n") if (@a == 0);
155 @a = sort {$b->[0]<=>$a->[0]} @a;
158 my @c = (0, 0, 0, 0);
162 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
164 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
165 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
166 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
167 my $a = $c[1] - $lc[1];
168 my $b = $c[2] - $lc[2];
169 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
170 print join("\t", $last, @c, @x), "\n";
171 $next = $c[0]/@a + $opts{s};
172 $lc[1] = $c[1]; $lc[2] = $c[2];
174 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
180 my %opts = (d=>1, D=>10000, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, F=>.001);
181 getopts('pq:d:D:l:Q:w:W:N:G:F:', \%opts);
183 Usage: vcfutils.pl varFilter [options] <in.vcf>
185 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
186 -q INT minimum RMS mapping quality for gaps [$opts{q}]
187 -d INT minimum read depth [$opts{d}]
188 -D INT maximum read depth [$opts{D}]
190 -G INT min indel score for nearby SNP filtering [$opts{G}]
191 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
193 -W INT window size for filtering dense SNPs [$opts{W}]
194 -N INT max number of SNPs in a window [$opts{N}]
196 -l INT window size for filtering adjacent gaps [$opts{l}]
198 -p print filtered variants
199 \n/) if (@ARGV == 0 && -t STDIN);
201 # calculate the window size
202 my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W});
203 my $max_dist = $ol > $ow? $ol : $ow;
204 $max_dist = $oW if ($max_dist < $oW);
206 my @staging; # (indel_filtering_score, flt_tag)
210 next if ($t[4] eq '.'); # skip non-var sites
212 if (length($t[3]) > 1) {
215 my @s = split(',', $t[4]);
217 $is_snp = 0 if (length > 1);
220 # clear the out-of-range elements
222 # Still on the same chromosome and the first element's window still affects this position?
223 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
224 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
226 my ($flt, $score) = (0, -1);
228 # collect key annotations
229 my ($dp, $mq, $af) = (-1, -1, 1);
230 if ($t[7] =~ /DP=(\d+)/i) {
232 } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
233 $dp = $1 + $2 + $3 + $4;
235 if ($t[7] =~ /MQ=(\d+)/i) {
238 if ($t[7] =~ /AF=([^\s;=]+)/i) {
240 } elsif ($t[7] =~ /AF1=([^\s;=]+)/i) {
245 if ($dp < $opts{d}) {
247 } elsif ($dp > $opts{D}) {
252 # site dependent filters
255 if (!$is_snp) { # an indel
256 # If deletion, remember the length of the deletion
257 $dlen = length($t[3]) - 1;
258 $flt = 1 if ($mq < $opts{q});
260 if ($t[5] >= $opts{G}) {
261 for my $x (@staging) {
262 # Is it a SNP and is it outside the SNP filter window?
263 next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]);
264 $x->[1] = 5 if ($x->[1] == 0);
267 # the indel filtering score
269 # check the staging list for indel filtering
270 for my $x (@staging) {
271 # Is it a SNP and is it outside the gap filter window
272 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]);
273 if ($x->[0] < $score) {
280 $flt = 1 if ($mq < $opts{Q});
281 # check adjacent SNPs
283 for my $x (@staging) {
284 ++$k if ($x->[0] < 0 && -($x->[0] + 1) > $opts{F} && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
286 # filtering is necessary
289 for my $x (@staging) {
290 $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0);
292 } else { # then check gap filter
293 for my $x (@staging) {
294 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]);
295 if ($x->[0] >= $opts{G}) {
302 push(@staging, [$score < 0? -$af-1 : $score, $flt, $dlen, @t]);
304 # output the last few elements in the staging list
306 varFilter_aux(shift @staging, $opts{p});
311 my ($first, $is_print) = @_;
312 if ($first->[1] == 0) {
313 print join("\t", @$first[3 .. @$first-1]), "\n";
314 } elsif ($is_print) {
315 print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
320 my %opts = (d=>3, D=>2000, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, Q=>10, q=>3);
321 getopts('d:D:1:2:3:4:Q:q:', \%opts);
323 Usage: vcfutils.pl filter4vcf [options] <in.vcf>
325 Options: -d INT min total depth (given DP or DP4) [$opts{d}]
326 -D INT max total depth [$opts{D}]
327 -q INT min SNP quality [$opts{q}]
328 -Q INT min RMS mapQ (given MQ) [$opts{Q}]
329 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
330 -2 FLOAT min P-value for baseQ bias [$opts{2}]
331 -3 FLOAT min P-value for mapQ bias [$opts{3}]
332 -4 FLOAT min P-value for end distance bias [$opts{4}]\n
333 /) if (@ARGV == 0 && -t STDIN);
335 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
340 next if (/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
342 $depth = $1 if (/DP=(\d+)/);
343 $depth = $1+$2+$3+$4 if (/DP4=(\d+),(\d+),(\d+),(\d+)/);
344 next if ($depth > 0 && ($depth < $opts{d} || $depth > $opts{D}));
345 next if (/MQ=(\d+)/ && $1 < $opts{Q});
347 next if ($t[5] >= 0 && $t[5] < $opts{q});
349 my @s = split(',', $t[4]);
350 ++$n[1] if ($ts{$t[3].$s[0]});
356 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
357 print "##fileformat=VCFv4.0\n";
358 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
361 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
366 $t[9] = reverse($t[9]);
367 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
369 my @a = split("/", $t[9]);
371 push(@alt, $_) if ($_ ne $alt[0]);
377 $alt[$_] = "N$alt[$_]";
380 my $ref = shift(@alt);
381 my $af = $t[13] > 0? ";AF=$t[13]" : '';
382 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
383 my $info = "molType=$t[10];class=$t[11]$valid$af";
384 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
389 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
390 my $fn = shift(@ARGV);
392 warn("Parsing UCSC SNPs...\n");
394 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
397 next if ($t[3] - $t[2] != 1); # not SNP
398 @{$map{$t[4]}} = @t[1,3,7];
402 warn("Writing VCF...\n");
403 print "##fileformat=VCFv4.0\n";
406 if ($t[0] eq 'rs#') { # the first line
407 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
409 next unless ($map{$t[0]});
410 next if (length($t[1]) != 3); # skip non-SNPs
411 my $a = \@{$map{$t[0]}};
413 my @u = split('/', $t[1]);
415 $u[1] = $u[0]; $u[0] = $ref;
416 } elsif ($u[0] ne $ref) { next; }
419 $w{$u[0]} = 0; $w{$u[1]} = 1;
420 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
426 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
427 if (!defined($a[0]) || !defined($a[1])) {
431 push(@s, "$a[0]/$a[1]");
435 print join("\t", @s), "\n";
442 Usage: vcfutils.pl <command> [<arguments>]\n
443 Command: subsam get a subset of samples
444 listsam list the samples
445 fillac fill the allele count field
446 qstats SNP stats stratified by QUAL
447 varFilter filtering short variants
448 filter4vcf filtering VCFs produced by samtools+bcftools
449 hapmap2vcf convert the hapmap format to VCF
450 ucscsnp2vcf convert UCSC SNP SQL dump to VCF