13 my $version = '0.1.0';
14 &usage if (@ARGV < 1);
15 my $command = shift(@ARGV);
16 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
17 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
25 my $fn = shift(@ARGV);
27 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
28 $h{$_} = 1 for (@ARGV);
34 my @s = @t[0..8]; # all fixed fields + FORMAT
41 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
42 print join("\t", @s), "\n";
46 print join("\t", @t[0..7]), "\n";
48 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
56 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
60 print join("\n", @t[9..$#t]), "\n";
67 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
77 if ($t[$_] =~ /^(\d+).(\d+)/) {
82 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
84 $info =~ s/(;?)AC=(\d+)//;
85 $info =~ s/(;?)AN=(\d+)//;
92 print join("\t", @t), "\n";
98 my %opts = (r=>'', s=>0.01);
99 getopts('r:s:', \%opts);
100 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
101 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
102 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
104 if ($opts{r}) { # read the reference positions
106 open($fh, $opts{r}) || die;
109 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
113 my $hsize = scalar(keys %h);
118 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
119 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
120 my @s = split(',', $t[4]);
121 $t[5] = 3 if ($t[5] < 0);
122 next if (length($s[0]) != 1);
123 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $h{$t[0],$t[1]}? 1 : 0]);
125 push(@a, [-1, 0, 0, 0]); # end marker
126 die("[qstats] No SNP data!\n") if (@a == 0);
127 @a = sort {$b->[0]<=>$a->[0]} @a;
130 my @c = (0, 0, 0, 0);
132 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
134 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
135 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
136 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
137 print join("\t", $last, @c, @x), "\n";
138 $next = $c[0]/@a + $opts{s};
140 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
146 my %opts = (d=>1, D=>10000, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, F=>.001);
147 getopts('pq:d:D:l:Q:w:W:N:G:F:', \%opts);
149 Usage: vcfutils.pl varFilter [options] <in.vcf>
151 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
152 -q INT minimum RMS mapping quality for gaps [$opts{q}]
153 -d INT minimum read depth [$opts{d}]
154 -D INT maximum read depth [$opts{D}]
156 -G INT min indel score for nearby SNP filtering [$opts{G}]
157 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
159 -W INT window size for filtering dense SNPs [$opts{W}]
160 -N INT max number of SNPs in a window [$opts{N}]
162 -l INT window size for filtering adjacent gaps [$opts{l}]
164 -p print filtered variants
165 \n/) if (@ARGV == 0 && -t STDIN);
167 # calculate the window size
168 my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W});
169 my $max_dist = $ol > $ow? $ol : $ow;
170 $max_dist = $oW if ($max_dist < $oW);
172 my @staging; # (indel_filtering_score, flt_tag)
176 next if ($t[4] eq '.'); # skip non-var sites
178 if (length($t[3]) > 1) {
181 my @s = split(',', $t[4]);
183 $is_snp = 0 if (length > 1);
186 # clear the out-of-range elements
188 # Still on the same chromosome and the first element's window still affects this position?
189 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
190 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
192 my ($flt, $score) = (0, -1);
194 # collect key annotations
195 my ($dp, $mq, $af) = (-1, -1, 1);
196 if ($t[7] =~ /DP=(\d+)/i) {
198 } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
199 $dp = $1 + $2 + $3 + $4;
201 if ($t[7] =~ /MQ=(\d+)/i) {
204 if ($t[7] =~ /AF=([^\s;=]+)/i) {
206 } elsif ($t[7] =~ /AF1=([^\s;=]+)/i) {
211 if ($dp < $opts{d}) {
213 } elsif ($dp > $opts{D}) {
218 # site dependent filters
221 if (!$is_snp) { # an indel
222 # If deletion, remember the length of the deletion
223 $dlen = length($t[3]) - 1;
224 $flt = 1 if ($mq < $opts{q});
226 if ($t[5] >= $opts{G}) {
227 for my $x (@staging) {
228 # Is it a SNP and is it outside the SNP filter window?
229 next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]);
230 $x->[1] = 5 if ($x->[1] == 0);
233 # the indel filtering score
235 # check the staging list for indel filtering
236 for my $x (@staging) {
237 # Is it a SNP and is it outside the gap filter window
238 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]);
239 if ($x->[0] < $score) {
246 $flt = 1 if ($mq < $opts{Q});
247 # check adjacent SNPs
249 for my $x (@staging) {
250 ++$k if ($x->[0] < 0 && -($x->[0] + 1) > $opts{F} && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
252 # filtering is necessary
255 for my $x (@staging) {
256 $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0);
258 } else { # then check gap filter
259 for my $x (@staging) {
260 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]);
261 if ($x->[0] >= $opts{G}) {
268 push(@staging, [$score < 0? -$af-1 : $score, $flt, $dlen, @t]);
270 # output the last few elements in the staging list
272 varFilter_aux(shift @staging, $opts{p});
277 my ($first, $is_print) = @_;
278 if ($first->[1] == 0) {
279 print join("\t", @$first[3 .. @$first-1]), "\n";
280 } elsif ($is_print) {
281 print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
286 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
287 print "##fileformat=VCFv4.0\n";
288 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
291 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
296 $t[9] = reverse($t[9]);
297 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
299 my @a = split("/", $t[9]);
301 push(@alt, $_) if ($_ ne $alt[0]);
307 $alt[$_] = "N$alt[$_]";
310 my $ref = shift(@alt);
311 my $af = $t[13] > 0? ";AF=$t[13]" : '';
312 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
313 my $info = "molType=$t[10];class=$t[11]$valid$af";
314 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
319 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
320 my $fn = shift(@ARGV);
322 warn("Parsing UCSC SNPs...\n");
324 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
327 next if ($t[3] - $t[2] != 1); # not SNP
328 @{$map{$t[4]}} = @t[1,3,7];
332 warn("Writing VCF...\n");
333 print "##fileformat=VCFv4.0\n";
336 if ($t[0] eq 'rs#') { # the first line
337 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
339 next unless ($map{$t[0]});
340 next if (length($t[1]) != 3); # skip non-SNPs
341 my $a = \@{$map{$t[0]}};
343 my @u = split('/', $t[1]);
345 $u[1] = $u[0]; $u[0] = $ref;
346 } elsif ($u[0] ne $ref) { next; }
349 $w{$u[0]} = 0; $w{$u[1]} = 1;
350 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
356 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
357 if (!defined($a[0]) || !defined($a[1])) {
361 push(@s, "$a[0]/$a[1]");
365 print join("\t", @s), "\n";
372 Usage: vcfutils.pl <command> [<arguments>]\n
373 Command: subsam get a subset of samples
374 listsam list the samples
375 fillac fill the allele count field
376 qstats SNP stats stratified by QUAL
377 varFilter filtering short variants
378 hapmap2vcf convert the hapmap format to VCF
379 ucscsnp2vcf convert UCSC SNP SQL dump to VCF