13 my $version = '0.1.0';
14 &usage if (@ARGV < 1);
15 my $command = shift(@ARGV);
16 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
17 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&filter4vcf, ldstats=>\&ldstats);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
25 my $fn = shift(@ARGV);
27 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
28 $h{$_} = 1 for (@ARGV);
34 my @s = @t[0..8]; # all fixed fields + FORMAT
41 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
42 print join("\t", @s), "\n";
46 print join("\t", @t[0..7]), "\n";
48 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
56 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
60 print join("\n", @t[9..$#t]), "\n";
67 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
76 @_ = split(":", $t[8]);
78 if ($_[$_] eq 'GT') { $s = $_; last; }
81 print join("\t", @t), "\n";
85 if ($t[$_] =~ /^0,0,0/) {
86 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
91 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
93 $info =~ s/(;?)AC=(\d+)//;
94 $info =~ s/(;?)AN=(\d+)//;
101 print join("\t", @t), "\n";
107 my %opts = (s=>0.01);
108 getopts('ps:', \%opts);
109 die("Usage: vcfutils.pl ldstats [-s $opts{s}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
110 my ($lastchr, $lastpos) = ('', 0);
112 my $is_print = defined($opts{p})? 1 : 0;
116 if ($t[0] ne $lastchr) {
118 } elsif (/NEIR=([\d\.]+)/) {
119 push(@a, [$t[1] - $lastpos, $1, $t[1]]);
123 my $max = 1000000000;
124 push(@a, [$max, 0, 0]); # end marker
125 @a = sort {$a->[0]<=>$b->[0]} @a;
128 my @c = (0, 0, 0, 0);
130 print STDERR "$p->[0]\t$p->[1]\t$p->[2]\n" if ($is_print);
131 if ($p->[0] == $max || ($p->[0] != $last && $c[0]/@a > $next)) {
132 printf("%d\t%.2f\t%.4f\n", $c[1], $c[2]/$c[1], $c[3]/$c[1]);
133 $c[1] = $c[2] = $c[3] = 0;
134 $next = $c[0]/@a + $opts{s};
136 ++$c[0]; ++$c[1]; $c[2] += $p->[0]; $c[3] += $p->[1];
142 my %opts = (r=>'', s=>0.02, v=>undef);
143 getopts('r:s:v', \%opts);
144 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
145 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
146 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
148 my $is_vcf = defined($opts{v})? 1 : 0;
149 if ($opts{r}) { # read the reference positions
151 open($fh, $opts{r}) || die;
156 $h{$t[0],$t[1]} = $t[4];
158 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
163 my $hsize = scalar(keys %h);
168 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
169 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
170 my @s = split(',', $t[4]);
171 $t[5] = 3 if ($t[5] < 0);
172 next if (length($s[0]) != 1);
176 my $aa = $h{$t[0],$t[1]};
178 my @aaa = split(",", $aa);
180 $hit = 1 if ($_ eq $s[0]);
184 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
186 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
188 push(@a, [-1, 0, 0, 0]); # end marker
189 die("[qstats] No SNP data!\n") if (@a == 0);
190 @a = sort {$b->[0]<=>$a->[0]} @a;
193 my @c = (0, 0, 0, 0);
197 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
199 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
200 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
201 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
202 my $a = $c[1] - $lc[1];
203 my $b = $c[2] - $lc[2];
204 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
205 print join("\t", $last, @c, @x), "\n";
206 $next = $c[0]/@a + $opts{s};
207 $lc[1] = $c[1]; $lc[2] = $c[2];
209 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
215 my %opts = (d=>1, D=>10000, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, F=>.001);
216 getopts('pq:d:D:l:Q:w:W:N:G:F:', \%opts);
218 Usage: vcfutils.pl varFilter [options] <in.vcf>
220 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
221 -q INT minimum RMS mapping quality for gaps [$opts{q}]
222 -d INT minimum read depth [$opts{d}]
223 -D INT maximum read depth [$opts{D}]
225 -G INT min indel score for nearby SNP filtering [$opts{G}]
226 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
228 -W INT window size for filtering dense SNPs [$opts{W}]
229 -N INT max number of SNPs in a window [$opts{N}]
231 -l INT window size for filtering adjacent gaps [$opts{l}]
233 -p print filtered variants
234 \n/) if (@ARGV == 0 && -t STDIN);
236 # calculate the window size
237 my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W});
238 my $max_dist = $ol > $ow? $ol : $ow;
239 $max_dist = $oW if ($max_dist < $oW);
241 my @staging; # (indel_filtering_score, flt_tag)
245 next if ($t[4] eq '.'); # skip non-var sites
247 if (length($t[3]) > 1) {
250 my @s = split(',', $t[4]);
252 $is_snp = 0 if (length > 1);
255 # clear the out-of-range elements
257 # Still on the same chromosome and the first element's window still affects this position?
258 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
259 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
261 my ($flt, $score) = (0, -1);
263 # collect key annotations
264 my ($dp, $mq, $af) = (-1, -1, 1);
265 if ($t[7] =~ /DP=(\d+)/i) {
267 } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
268 $dp = $1 + $2 + $3 + $4;
270 if ($t[7] =~ /MQ=(\d+)/i) {
273 if ($t[7] =~ /AF=([^\s;=]+)/i) {
275 } elsif ($t[7] =~ /AF1=([^\s;=]+)/i) {
280 if ($dp < $opts{d}) {
282 } elsif ($dp > $opts{D}) {
287 # site dependent filters
290 if (!$is_snp) { # an indel
291 # If deletion, remember the length of the deletion
292 $dlen = length($t[3]) - 1;
293 $flt = 1 if ($mq < $opts{q});
295 if ($t[5] >= $opts{G}) {
296 for my $x (@staging) {
297 # Is it a SNP and is it outside the SNP filter window?
298 next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]);
299 $x->[1] = 5 if ($x->[1] == 0);
302 # the indel filtering score
304 # check the staging list for indel filtering
305 for my $x (@staging) {
306 # Is it a SNP and is it outside the gap filter window
307 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]);
308 if ($x->[0] < $score) {
315 $flt = 1 if ($mq < $opts{Q});
316 # check adjacent SNPs
318 for my $x (@staging) {
319 ++$k if ($x->[0] < 0 && -($x->[0] + 1) > $opts{F} && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
321 # filtering is necessary
324 for my $x (@staging) {
325 $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0);
327 } else { # then check gap filter
328 for my $x (@staging) {
329 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]);
330 if ($x->[0] >= $opts{G}) {
337 push(@staging, [$score < 0? -$af-1 : $score, $flt, $dlen, @t]);
339 # output the last few elements in the staging list
341 varFilter_aux(shift @staging, $opts{p});
346 my ($first, $is_print) = @_;
347 if ($first->[1] == 0) {
348 print join("\t", @$first[3 .. @$first-1]), "\n";
349 } elsif ($is_print) {
350 print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
355 my %opts = (d=>3, D=>2000, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, Q=>10, q=>3);
356 getopts('d:D:1:2:3:4:Q:q:', \%opts);
358 Usage: vcfutils.pl filter4vcf [options] <in.vcf>
360 Options: -d INT min total depth (given DP or DP4) [$opts{d}]
361 -D INT max total depth [$opts{D}]
362 -q INT min SNP quality [$opts{q}]
363 -Q INT min RMS mapQ (given MQ) [$opts{Q}]
364 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
365 -2 FLOAT min P-value for baseQ bias [$opts{2}]
366 -3 FLOAT min P-value for mapQ bias [$opts{3}]
367 -4 FLOAT min P-value for end distance bias [$opts{4}]\n
368 /) if (@ARGV == 0 && -t STDIN);
370 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
378 next if (/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
380 $depth = $1 if (/DP=(\d+)/);
381 $depth = $1+$2+$3+$4 if (/DP4=(\d+),(\d+),(\d+),(\d+)/);
382 next if ($depth > 0 && ($depth < $opts{d} || $depth > $opts{D}));
383 next if (/MQ=(\d+)/ && $1 < $opts{Q});
385 next if ($t[5] >= 0 && $t[5] < $opts{q});
387 my @s = split(',', $t[4]);
388 ++$n[1] if ($ts{$t[3].$s[0]});
394 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
395 print "##fileformat=VCFv4.0\n";
396 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
399 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
404 $t[9] = reverse($t[9]);
405 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
407 my @a = split("/", $t[9]);
409 push(@alt, $_) if ($_ ne $alt[0]);
415 $alt[$_] = "N$alt[$_]";
418 my $ref = shift(@alt);
419 my $af = $t[13] > 0? ";AF=$t[13]" : '';
420 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
421 my $info = "molType=$t[10];class=$t[11]$valid$af";
422 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
427 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
428 my $fn = shift(@ARGV);
430 warn("Parsing UCSC SNPs...\n");
432 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
435 next if ($t[3] - $t[2] != 1); # not SNP
436 @{$map{$t[4]}} = @t[1,3,7];
440 warn("Writing VCF...\n");
441 print "##fileformat=VCFv4.0\n";
444 if ($t[0] eq 'rs#') { # the first line
445 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
447 next unless ($map{$t[0]});
448 next if (length($t[1]) != 3); # skip non-SNPs
449 my $a = \@{$map{$t[0]}};
451 my @u = split('/', $t[1]);
453 $u[1] = $u[0]; $u[0] = $ref;
454 } elsif ($u[0] ne $ref) { next; }
457 $w{$u[0]} = 0; $w{$u[1]} = 1;
458 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
464 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
465 if (!defined($a[0]) || !defined($a[1])) {
469 push(@s, "$a[0]/$a[1]");
473 print join("\t", @s), "\n";
480 Usage: vcfutils.pl <command> [<arguments>]\n
481 Command: subsam get a subset of samples
482 listsam list the samples
483 fillac fill the allele count field
484 qstats SNP stats stratified by QUAL
485 varFilter filtering short variants
486 filter4vcf filtering VCFs produced by samtools+bcftools
487 hapmap2vcf convert the hapmap format to VCF
488 ucscsnp2vcf convert UCSC SNP SQL dump to VCF