13 my $version = '0.1.0';
14 &usage if (@ARGV < 1);
15 my $command = shift(@ARGV);
16 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
17 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
25 my $fn = shift(@ARGV);
27 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
28 $h{$_} = 1 for (@ARGV);
34 my @s = @t[0..8]; # all fixed fields + FORMAT
41 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
42 print join("\t", @s), "\n";
46 print join("\t", @t[0..7]), "\n";
48 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
56 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
60 print join("\n", @t[9..$#t]), "\n";
67 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
76 @_ = split(":", $t[8]);
78 if ($_[$_] eq 'GT') { $s = $_; last; }
81 print join("\t", @t), "\n";
85 if ($t[$_] =~ /^0,0,0/) {
86 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
91 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
93 $info =~ s/(;?)AC=(\d+)//;
94 $info =~ s/(;?)AN=(\d+)//;
101 print join("\t", @t), "\n";
107 my %opts = (r=>'', s=>0.02, v=>undef);
108 getopts('r:s:v', \%opts);
109 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
110 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
111 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
113 my $is_vcf = defined($opts{v})? 1 : 0;
114 if ($opts{r}) { # read the reference positions
116 open($fh, $opts{r}) || die;
121 $h{$t[0],$t[1]} = $t[4];
123 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
128 my $hsize = scalar(keys %h);
133 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
134 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
135 my @s = split(',', $t[4]);
136 $t[5] = 3 if ($t[5] < 0);
137 next if (length($s[0]) != 1);
141 my $aa = $h{$t[0],$t[1]};
143 my @aaa = split(",", $aa);
145 $hit = 1 if ($_ eq $s[0]);
149 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
151 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
153 push(@a, [-1, 0, 0, 0]); # end marker
154 die("[qstats] No SNP data!\n") if (@a == 0);
155 @a = sort {$b->[0]<=>$a->[0]} @a;
158 my @c = (0, 0, 0, 0);
162 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
164 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
165 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
166 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
167 my $a = $c[1] - $lc[1];
168 my $b = $c[2] - $lc[2];
169 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
170 print join("\t", $last, @c, @x), "\n";
171 $next = $c[0]/@a + $opts{s};
172 $lc[1] = $c[1]; $lc[2] = $c[2];
174 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
180 my %opts = (d=>1, D=>10000, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, F=>.001);
181 getopts('pq:d:D:l:Q:w:W:N:G:F:', \%opts);
183 Usage: vcfutils.pl varFilter [options] <in.vcf>
185 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
186 -q INT minimum RMS mapping quality for gaps [$opts{q}]
187 -d INT minimum read depth [$opts{d}]
188 -D INT maximum read depth [$opts{D}]
190 -G INT min indel score for nearby SNP filtering [$opts{G}]
191 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
193 -W INT window size for filtering dense SNPs [$opts{W}]
194 -N INT max number of SNPs in a window [$opts{N}]
196 -l INT window size for filtering adjacent gaps [$opts{l}]
198 -p print filtered variants
199 \n/) if (@ARGV == 0 && -t STDIN);
201 # calculate the window size
202 my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W});
203 my $max_dist = $ol > $ow? $ol : $ow;
204 $max_dist = $oW if ($max_dist < $oW);
206 my @staging; # (indel_filtering_score, flt_tag)
210 next if ($t[4] eq '.'); # skip non-var sites
212 if (length($t[3]) > 1) {
215 my @s = split(',', $t[4]);
217 $is_snp = 0 if (length > 1);
220 # clear the out-of-range elements
222 # Still on the same chromosome and the first element's window still affects this position?
223 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
224 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
226 my ($flt, $score) = (0, -1);
228 # collect key annotations
229 my ($dp, $mq, $af) = (-1, -1, 1);
230 if ($t[7] =~ /DP=(\d+)/i) {
232 } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
233 $dp = $1 + $2 + $3 + $4;
235 if ($t[7] =~ /MQ=(\d+)/i) {
238 if ($t[7] =~ /AF=([^\s;=]+)/i) {
240 } elsif ($t[7] =~ /AF1=([^\s;=]+)/i) {
245 if ($dp < $opts{d}) {
247 } elsif ($dp > $opts{D}) {
252 # site dependent filters
255 if (!$is_snp) { # an indel
256 # If deletion, remember the length of the deletion
257 $dlen = length($t[3]) - 1;
258 $flt = 1 if ($mq < $opts{q});
260 if ($t[5] >= $opts{G}) {
261 for my $x (@staging) {
262 # Is it a SNP and is it outside the SNP filter window?
263 next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]);
264 $x->[1] = 5 if ($x->[1] == 0);
267 # the indel filtering score
269 # check the staging list for indel filtering
270 for my $x (@staging) {
271 # Is it a SNP and is it outside the gap filter window
272 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]);
273 if ($x->[0] < $score) {
280 $flt = 1 if ($mq < $opts{Q});
281 # check adjacent SNPs
283 for my $x (@staging) {
284 ++$k if ($x->[0] < 0 && -($x->[0] + 1) > $opts{F} && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
286 # filtering is necessary
289 for my $x (@staging) {
290 $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0);
292 } else { # then check gap filter
293 for my $x (@staging) {
294 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]);
295 if ($x->[0] >= $opts{G}) {
302 push(@staging, [$score < 0? -$af-1 : $score, $flt, $dlen, @t]);
304 # output the last few elements in the staging list
306 varFilter_aux(shift @staging, $opts{p});
311 my ($first, $is_print) = @_;
312 if ($first->[1] == 0) {
313 print join("\t", @$first[3 .. @$first-1]), "\n";
314 } elsif ($is_print) {
315 print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
320 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
321 print "##fileformat=VCFv4.0\n";
322 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
325 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
330 $t[9] = reverse($t[9]);
331 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
333 my @a = split("/", $t[9]);
335 push(@alt, $_) if ($_ ne $alt[0]);
341 $alt[$_] = "N$alt[$_]";
344 my $ref = shift(@alt);
345 my $af = $t[13] > 0? ";AF=$t[13]" : '';
346 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
347 my $info = "molType=$t[10];class=$t[11]$valid$af";
348 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
353 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
354 my $fn = shift(@ARGV);
356 warn("Parsing UCSC SNPs...\n");
358 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
361 next if ($t[3] - $t[2] != 1); # not SNP
362 @{$map{$t[4]}} = @t[1,3,7];
366 warn("Writing VCF...\n");
367 print "##fileformat=VCFv4.0\n";
370 if ($t[0] eq 'rs#') { # the first line
371 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
373 next unless ($map{$t[0]});
374 next if (length($t[1]) != 3); # skip non-SNPs
375 my $a = \@{$map{$t[0]}};
377 my @u = split('/', $t[1]);
379 $u[1] = $u[0]; $u[0] = $ref;
380 } elsif ($u[0] ne $ref) { next; }
383 $w{$u[0]} = 0; $w{$u[1]} = 1;
384 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
390 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
391 if (!defined($a[0]) || !defined($a[1])) {
395 push(@s, "$a[0]/$a[1]");
399 print join("\t", @s), "\n";
406 Usage: vcfutils.pl <command> [<arguments>]\n
407 Command: subsam get a subset of samples
408 listsam list the samples
409 fillac fill the allele count field
410 qstats SNP stats stratified by QUAL
411 varFilter filtering short variants
412 hapmap2vcf convert the hapmap format to VCF
413 ucscsnp2vcf convert UCSC SNP SQL dump to VCF