13 my $version = '0.1.0';
14 &usage if (@ARGV < 1);
15 my $command = shift(@ARGV);
16 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
17 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
25 my $fn = shift(@ARGV);
27 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
28 $h{$_} = 1 for (@ARGV);
34 my @s = @t[0..8]; # all fixed fields + FORMAT
41 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
42 print join("\t", @s), "\n";
46 print join("\t", @t[0..7]), "\n";
48 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
56 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
60 print join("\n", @t[9..$#t]), "\n";
67 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
77 if ($t[$_] =~ /^(\d+).(\d+)/) {
82 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
84 $info =~ s/(;?)AC=(\d+)//;
85 $info =~ s/(;?)AN=(\d+)//;
92 print join("\t", @t), "\n";
98 my %opts = (r=>'', s=>0.01, v=>undef);
99 getopts('r:s:v', \%opts);
100 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
101 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
102 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
104 my $is_vcf = defined($opts{v})? 1 : 0;
105 if ($opts{r}) { # read the reference positions
107 open($fh, $opts{r}) || die;
112 $h{$t[0],$t[1]} = $t[4];
114 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
119 my $hsize = scalar(keys %h);
124 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
125 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
126 my @s = split(',', $t[4]);
127 $t[5] = 3 if ($t[5] < 0);
128 next if (length($s[0]) != 1);
132 my $aa = $h{$t[0],$t[1]};
134 my @aaa = split(",", $aa);
136 $hit = 1 if ($_ eq $s[0]);
140 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
142 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
144 push(@a, [-1, 0, 0, 0]); # end marker
145 die("[qstats] No SNP data!\n") if (@a == 0);
146 @a = sort {$b->[0]<=>$a->[0]} @a;
149 my @c = (0, 0, 0, 0);
151 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
153 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
154 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
155 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
156 print join("\t", $last, @c, @x), "\n";
157 $next = $c[0]/@a + $opts{s};
159 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
165 my %opts = (d=>1, D=>10000, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, F=>.001);
166 getopts('pq:d:D:l:Q:w:W:N:G:F:', \%opts);
168 Usage: vcfutils.pl varFilter [options] <in.vcf>
170 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
171 -q INT minimum RMS mapping quality for gaps [$opts{q}]
172 -d INT minimum read depth [$opts{d}]
173 -D INT maximum read depth [$opts{D}]
175 -G INT min indel score for nearby SNP filtering [$opts{G}]
176 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
178 -W INT window size for filtering dense SNPs [$opts{W}]
179 -N INT max number of SNPs in a window [$opts{N}]
181 -l INT window size for filtering adjacent gaps [$opts{l}]
183 -p print filtered variants
184 \n/) if (@ARGV == 0 && -t STDIN);
186 # calculate the window size
187 my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W});
188 my $max_dist = $ol > $ow? $ol : $ow;
189 $max_dist = $oW if ($max_dist < $oW);
191 my @staging; # (indel_filtering_score, flt_tag)
195 next if ($t[4] eq '.'); # skip non-var sites
197 if (length($t[3]) > 1) {
200 my @s = split(',', $t[4]);
202 $is_snp = 0 if (length > 1);
205 # clear the out-of-range elements
207 # Still on the same chromosome and the first element's window still affects this position?
208 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
209 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
211 my ($flt, $score) = (0, -1);
213 # collect key annotations
214 my ($dp, $mq, $af) = (-1, -1, 1);
215 if ($t[7] =~ /DP=(\d+)/i) {
217 } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
218 $dp = $1 + $2 + $3 + $4;
220 if ($t[7] =~ /MQ=(\d+)/i) {
223 if ($t[7] =~ /AF=([^\s;=]+)/i) {
225 } elsif ($t[7] =~ /AF1=([^\s;=]+)/i) {
230 if ($dp < $opts{d}) {
232 } elsif ($dp > $opts{D}) {
237 # site dependent filters
240 if (!$is_snp) { # an indel
241 # If deletion, remember the length of the deletion
242 $dlen = length($t[3]) - 1;
243 $flt = 1 if ($mq < $opts{q});
245 if ($t[5] >= $opts{G}) {
246 for my $x (@staging) {
247 # Is it a SNP and is it outside the SNP filter window?
248 next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]);
249 $x->[1] = 5 if ($x->[1] == 0);
252 # the indel filtering score
254 # check the staging list for indel filtering
255 for my $x (@staging) {
256 # Is it a SNP and is it outside the gap filter window
257 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]);
258 if ($x->[0] < $score) {
265 $flt = 1 if ($mq < $opts{Q});
266 # check adjacent SNPs
268 for my $x (@staging) {
269 ++$k if ($x->[0] < 0 && -($x->[0] + 1) > $opts{F} && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
271 # filtering is necessary
274 for my $x (@staging) {
275 $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0);
277 } else { # then check gap filter
278 for my $x (@staging) {
279 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]);
280 if ($x->[0] >= $opts{G}) {
287 push(@staging, [$score < 0? -$af-1 : $score, $flt, $dlen, @t]);
289 # output the last few elements in the staging list
291 varFilter_aux(shift @staging, $opts{p});
296 my ($first, $is_print) = @_;
297 if ($first->[1] == 0) {
298 print join("\t", @$first[3 .. @$first-1]), "\n";
299 } elsif ($is_print) {
300 print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
305 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
306 print "##fileformat=VCFv4.0\n";
307 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
310 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
315 $t[9] = reverse($t[9]);
316 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
318 my @a = split("/", $t[9]);
320 push(@alt, $_) if ($_ ne $alt[0]);
326 $alt[$_] = "N$alt[$_]";
329 my $ref = shift(@alt);
330 my $af = $t[13] > 0? ";AF=$t[13]" : '';
331 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
332 my $info = "molType=$t[10];class=$t[11]$valid$af";
333 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
338 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
339 my $fn = shift(@ARGV);
341 warn("Parsing UCSC SNPs...\n");
343 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
346 next if ($t[3] - $t[2] != 1); # not SNP
347 @{$map{$t[4]}} = @t[1,3,7];
351 warn("Writing VCF...\n");
352 print "##fileformat=VCFv4.0\n";
355 if ($t[0] eq 'rs#') { # the first line
356 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
358 next unless ($map{$t[0]});
359 next if (length($t[1]) != 3); # skip non-SNPs
360 my $a = \@{$map{$t[0]}};
362 my @u = split('/', $t[1]);
364 $u[1] = $u[0]; $u[0] = $ref;
365 } elsif ($u[0] ne $ref) { next; }
368 $w{$u[0]} = 0; $w{$u[1]} = 1;
369 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
375 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
376 if (!defined($a[0]) || !defined($a[1])) {
380 push(@s, "$a[0]/$a[1]");
384 print join("\t", @s), "\n";
391 Usage: vcfutils.pl <command> [<arguments>]\n
392 Command: subsam get a subset of samples
393 listsam list the samples
394 fillac fill the allele count field
395 qstats SNP stats stratified by QUAL
396 varFilter filtering short variants
397 hapmap2vcf convert the hapmap format to VCF
398 ucscsnp2vcf convert UCSC SNP SQL dump to VCF