13 my $version = '0.1.0';
14 &usage if (@ARGV < 1);
15 my $command = shift(@ARGV);
16 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
17 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&filter4vcf, ldstats=>\&ldstats);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
25 my $fn = shift(@ARGV);
27 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
28 $h{$_} = 1 for (@ARGV);
34 my @s = @t[0..8]; # all fixed fields + FORMAT
41 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
42 print join("\t", @s), "\n";
46 print join("\t", @t[0..7]), "\n";
48 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
56 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
60 print join("\n", @t[9..$#t]), "\n";
67 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
76 @_ = split(":", $t[8]);
78 if ($_[$_] eq 'GT') { $s = $_; last; }
81 print join("\t", @t), "\n";
85 if ($t[$_] =~ /^0,0,0/) {
86 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
91 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
93 $info =~ s/(;?)AC=(\d+)//;
94 $info =~ s/(;?)AN=(\d+)//;
101 print join("\t", @t), "\n";
108 getopts('t:', \%opts);
109 die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
110 my $cutoff = $opts{t};
111 my ($last, $lastchr) = (0x7fffffff, '');
112 my ($x, $y, $n) = (0, 0, 0);
114 if (/^([^#\s]+)\s(\d+)/) {
115 my ($chr, $pos) = ($1, $2);
116 if (/NEIR=([\d\.]+)/) {
118 ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
120 $last = $pos; $lastchr = $chr;
123 print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
124 print "Fraction: ", $y/$n, "\n";
125 print "Length: $x\n";
129 my %opts = (r=>'', s=>0.02, v=>undef);
130 getopts('r:s:v', \%opts);
131 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
132 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
133 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
135 my $is_vcf = defined($opts{v})? 1 : 0;
136 if ($opts{r}) { # read the reference positions
138 open($fh, $opts{r}) || die;
143 $h{$t[0],$t[1]} = $t[4];
145 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
150 my $hsize = scalar(keys %h);
155 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
156 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
157 my @s = split(',', $t[4]);
158 $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
159 next if (length($s[0]) != 1);
163 my $aa = $h{$t[0],$t[1]};
165 my @aaa = split(",", $aa);
167 $hit = 1 if ($_ eq $s[0]);
171 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
173 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
175 push(@a, [-1, 0, 0, 0]); # end marker
176 die("[qstats] No SNP data!\n") if (@a == 0);
177 @a = sort {$b->[0]<=>$a->[0]} @a;
180 my @c = (0, 0, 0, 0);
184 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
186 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
187 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
188 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
189 my $a = $c[1] - $lc[1];
190 my $b = $c[2] - $lc[2];
191 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
192 print join("\t", $last, @c, @x), "\n";
193 $next = $c[0]/@a + $opts{s};
194 $lc[1] = $c[1]; $lc[2] = $c[2];
196 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
202 my %opts = (d=>1, D=>10000, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, F=>.001);
203 getopts('pq:d:D:l:Q:w:W:N:G:F:', \%opts);
205 Usage: vcfutils.pl varFilter [options] <in.vcf>
207 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
208 -q INT minimum RMS mapping quality for gaps [$opts{q}]
209 -d INT minimum read depth [$opts{d}]
210 -D INT maximum read depth [$opts{D}]
212 -G INT min indel score for nearby SNP filtering [$opts{G}]
213 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
215 -W INT window size for filtering dense SNPs [$opts{W}]
216 -N INT max number of SNPs in a window [$opts{N}]
218 -l INT window size for filtering adjacent gaps [$opts{l}]
220 -p print filtered variants
221 \n/) if (@ARGV == 0 && -t STDIN);
223 # calculate the window size
224 my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W});
225 my $max_dist = $ol > $ow? $ol : $ow;
226 $max_dist = $oW if ($max_dist < $oW);
228 my @staging; # (indel_filtering_score, flt_tag)
232 next if ($t[4] eq '.'); # skip non-var sites
234 if (length($t[3]) > 1) {
237 my @s = split(',', $t[4]);
239 $is_snp = 0 if (length > 1);
242 # clear the out-of-range elements
244 # Still on the same chromosome and the first element's window still affects this position?
245 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
246 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
248 my ($flt, $score) = (0, -1);
250 # collect key annotations
251 my ($dp, $mq, $af) = (-1, -1, 1);
252 if ($t[7] =~ /DP=(\d+)/i) {
254 } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
255 $dp = $1 + $2 + $3 + $4;
257 if ($t[7] =~ /MQ=(\d+)/i) {
260 if ($t[7] =~ /AF=([^\s;=]+)/i) {
262 } elsif ($t[7] =~ /AF1=([^\s;=]+)/i) {
267 if ($dp < $opts{d}) {
269 } elsif ($dp > $opts{D}) {
274 # site dependent filters
277 if (!$is_snp) { # an indel
278 # If deletion, remember the length of the deletion
279 $dlen = length($t[3]) - 1;
280 $flt = 1 if ($mq < $opts{q});
282 if ($t[5] >= $opts{G}) {
283 for my $x (@staging) {
284 # Is it a SNP and is it outside the SNP filter window?
285 next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]);
286 $x->[1] = 5 if ($x->[1] == 0);
289 # the indel filtering score
291 # check the staging list for indel filtering
292 for my $x (@staging) {
293 # Is it a SNP and is it outside the gap filter window
294 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]);
295 if ($x->[0] < $score) {
302 $flt = 1 if ($mq < $opts{Q});
303 # check adjacent SNPs
305 for my $x (@staging) {
306 ++$k if ($x->[0] < 0 && -($x->[0] + 1) > $opts{F} && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
308 # filtering is necessary
311 for my $x (@staging) {
312 $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0);
314 } else { # then check gap filter
315 for my $x (@staging) {
316 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]);
317 if ($x->[0] >= $opts{G}) {
324 push(@staging, [$score < 0? -$af-1 : $score, $flt, $dlen, @t]);
326 # output the last few elements in the staging list
328 varFilter_aux(shift @staging, $opts{p});
333 my ($first, $is_print) = @_;
334 if ($first->[1] == 0) {
335 print join("\t", @$first[3 .. @$first-1]), "\n";
336 } elsif ($is_print) {
337 print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
342 my %opts = (d=>3, D=>2000, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, Q=>10, q=>3);
343 getopts('d:D:1:2:3:4:Q:q:', \%opts);
345 Usage: vcfutils.pl filter4vcf [options] <in.vcf>
347 Options: -d INT min total depth (given DP or DP4) [$opts{d}]
348 -D INT max total depth [$opts{D}]
349 -q INT min SNP quality [$opts{q}]
350 -Q INT min RMS mapQ (given MQ) [$opts{Q}]
351 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
352 -2 FLOAT min P-value for baseQ bias [$opts{2}]
353 -3 FLOAT min P-value for mapQ bias [$opts{3}]
354 -4 FLOAT min P-value for end distance bias [$opts{4}]\n
355 /) if (@ARGV == 0 && -t STDIN);
357 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
365 next if (/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
367 $depth = $1 if (/DP=(\d+)/);
368 $depth = $1+$2+$3+$4 if (/DP4=(\d+),(\d+),(\d+),(\d+)/);
369 next if ($depth > 0 && ($depth < $opts{d} || $depth > $opts{D}));
370 next if (/MQ=(\d+)/ && $1 < $opts{Q});
372 next if ($t[5] >= 0 && $t[5] < $opts{q});
374 my @s = split(',', $t[4]);
375 ++$n[1] if ($ts{$t[3].$s[0]});
381 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
382 print "##fileformat=VCFv4.0\n";
383 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
386 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
391 $t[9] = reverse($t[9]);
392 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
394 my @a = split("/", $t[9]);
396 push(@alt, $_) if ($_ ne $alt[0]);
402 $alt[$_] = "N$alt[$_]";
405 my $ref = shift(@alt);
406 my $af = $t[13] > 0? ";AF=$t[13]" : '';
407 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
408 my $info = "molType=$t[10];class=$t[11]$valid$af";
409 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
414 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
415 my $fn = shift(@ARGV);
417 warn("Parsing UCSC SNPs...\n");
419 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
422 next if ($t[3] - $t[2] != 1); # not SNP
423 @{$map{$t[4]}} = @t[1,3,7];
427 warn("Writing VCF...\n");
428 print "##fileformat=VCFv4.0\n";
431 if ($t[0] eq 'rs#') { # the first line
432 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
434 next unless ($map{$t[0]});
435 next if (length($t[1]) != 3); # skip non-SNPs
436 my $a = \@{$map{$t[0]}};
438 my @u = split('/', $t[1]);
440 $u[1] = $u[0]; $u[0] = $ref;
441 } elsif ($u[0] ne $ref) { next; }
444 $w{$u[0]} = 0; $w{$u[1]} = 1;
445 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
451 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
452 if (!defined($a[0]) || !defined($a[1])) {
456 push(@s, "$a[0]/$a[1]");
460 print join("\t", @s), "\n";
467 Usage: vcfutils.pl <command> [<arguments>]\n
468 Command: subsam get a subset of samples
469 listsam list the samples
470 fillac fill the allele count field
471 qstats SNP stats stratified by QUAL
472 varFilter filtering short variants
473 filter4vcf filtering VCFs produced by samtools+bcftools
474 hapmap2vcf convert the hapmap format to VCF
475 ucscsnp2vcf convert UCSC SNP SQL dump to VCF