13 my $version = '0.1.0';
14 &usage if (@ARGV < 1);
15 my $command = shift(@ARGV);
16 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
17 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
25 my $fn = shift(@ARGV);
27 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
28 $h{$_} = 1 for (@ARGV);
34 my @s = @t[0..8]; # all fixed fields + FORMAT
41 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
42 print join("\t", @s), "\n";
46 print join("\t", @t[0..7]), "\n";
48 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
56 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
60 print join("\n", @t[9..$#t]), "\n";
67 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
77 if ($t[$_] =~ /^(\d+).(\d+)/) {
82 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
84 $info =~ s/(;?)AC=(\d+)//;
85 $info =~ s/(;?)AN=(\d+)//;
92 print join("\t", @t), "\n";
98 my %opts = (r=>'', s=>0.02, v=>undef);
99 getopts('r:s:v', \%opts);
100 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
101 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
102 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
104 my $is_vcf = defined($opts{v})? 1 : 0;
105 if ($opts{r}) { # read the reference positions
107 open($fh, $opts{r}) || die;
112 $h{$t[0],$t[1]} = $t[4];
114 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
119 my $hsize = scalar(keys %h);
124 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
125 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
126 my @s = split(',', $t[4]);
127 $t[5] = 3 if ($t[5] < 0);
128 next if (length($s[0]) != 1);
132 my $aa = $h{$t[0],$t[1]};
134 my @aaa = split(",", $aa);
136 $hit = 1 if ($_ eq $s[0]);
140 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
142 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
144 push(@a, [-1, 0, 0, 0]); # end marker
145 die("[qstats] No SNP data!\n") if (@a == 0);
146 @a = sort {$b->[0]<=>$a->[0]} @a;
149 my @c = (0, 0, 0, 0);
153 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
155 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
156 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
157 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
158 my $a = $c[1] - $lc[1];
159 my $b = $c[2] - $lc[2];
160 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
161 print join("\t", $last, @c, @x), "\n";
162 $next = $c[0]/@a + $opts{s};
163 $lc[1] = $c[1]; $lc[2] = $c[2];
165 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
171 my %opts = (d=>1, D=>10000, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, F=>.001);
172 getopts('pq:d:D:l:Q:w:W:N:G:F:', \%opts);
174 Usage: vcfutils.pl varFilter [options] <in.vcf>
176 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
177 -q INT minimum RMS mapping quality for gaps [$opts{q}]
178 -d INT minimum read depth [$opts{d}]
179 -D INT maximum read depth [$opts{D}]
181 -G INT min indel score for nearby SNP filtering [$opts{G}]
182 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
184 -W INT window size for filtering dense SNPs [$opts{W}]
185 -N INT max number of SNPs in a window [$opts{N}]
187 -l INT window size for filtering adjacent gaps [$opts{l}]
189 -p print filtered variants
190 \n/) if (@ARGV == 0 && -t STDIN);
192 # calculate the window size
193 my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W});
194 my $max_dist = $ol > $ow? $ol : $ow;
195 $max_dist = $oW if ($max_dist < $oW);
197 my @staging; # (indel_filtering_score, flt_tag)
201 next if ($t[4] eq '.'); # skip non-var sites
203 if (length($t[3]) > 1) {
206 my @s = split(',', $t[4]);
208 $is_snp = 0 if (length > 1);
211 # clear the out-of-range elements
213 # Still on the same chromosome and the first element's window still affects this position?
214 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
215 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
217 my ($flt, $score) = (0, -1);
219 # collect key annotations
220 my ($dp, $mq, $af) = (-1, -1, 1);
221 if ($t[7] =~ /DP=(\d+)/i) {
223 } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
224 $dp = $1 + $2 + $3 + $4;
226 if ($t[7] =~ /MQ=(\d+)/i) {
229 if ($t[7] =~ /AF=([^\s;=]+)/i) {
231 } elsif ($t[7] =~ /AF1=([^\s;=]+)/i) {
236 if ($dp < $opts{d}) {
238 } elsif ($dp > $opts{D}) {
243 # site dependent filters
246 if (!$is_snp) { # an indel
247 # If deletion, remember the length of the deletion
248 $dlen = length($t[3]) - 1;
249 $flt = 1 if ($mq < $opts{q});
251 if ($t[5] >= $opts{G}) {
252 for my $x (@staging) {
253 # Is it a SNP and is it outside the SNP filter window?
254 next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]);
255 $x->[1] = 5 if ($x->[1] == 0);
258 # the indel filtering score
260 # check the staging list for indel filtering
261 for my $x (@staging) {
262 # Is it a SNP and is it outside the gap filter window
263 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]);
264 if ($x->[0] < $score) {
271 $flt = 1 if ($mq < $opts{Q});
272 # check adjacent SNPs
274 for my $x (@staging) {
275 ++$k if ($x->[0] < 0 && -($x->[0] + 1) > $opts{F} && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
277 # filtering is necessary
280 for my $x (@staging) {
281 $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0);
283 } else { # then check gap filter
284 for my $x (@staging) {
285 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]);
286 if ($x->[0] >= $opts{G}) {
293 push(@staging, [$score < 0? -$af-1 : $score, $flt, $dlen, @t]);
295 # output the last few elements in the staging list
297 varFilter_aux(shift @staging, $opts{p});
302 my ($first, $is_print) = @_;
303 if ($first->[1] == 0) {
304 print join("\t", @$first[3 .. @$first-1]), "\n";
305 } elsif ($is_print) {
306 print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
311 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
312 print "##fileformat=VCFv4.0\n";
313 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
316 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
321 $t[9] = reverse($t[9]);
322 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
324 my @a = split("/", $t[9]);
326 push(@alt, $_) if ($_ ne $alt[0]);
332 $alt[$_] = "N$alt[$_]";
335 my $ref = shift(@alt);
336 my $af = $t[13] > 0? ";AF=$t[13]" : '';
337 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
338 my $info = "molType=$t[10];class=$t[11]$valid$af";
339 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
344 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
345 my $fn = shift(@ARGV);
347 warn("Parsing UCSC SNPs...\n");
349 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
352 next if ($t[3] - $t[2] != 1); # not SNP
353 @{$map{$t[4]}} = @t[1,3,7];
357 warn("Writing VCF...\n");
358 print "##fileformat=VCFv4.0\n";
361 if ($t[0] eq 'rs#') { # the first line
362 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
364 next unless ($map{$t[0]});
365 next if (length($t[1]) != 3); # skip non-SNPs
366 my $a = \@{$map{$t[0]}};
368 my @u = split('/', $t[1]);
370 $u[1] = $u[0]; $u[0] = $ref;
371 } elsif ($u[0] ne $ref) { next; }
374 $w{$u[0]} = 0; $w{$u[1]} = 1;
375 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
381 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
382 if (!defined($a[0]) || !defined($a[1])) {
386 push(@s, "$a[0]/$a[1]");
390 print join("\t", @s), "\n";
397 Usage: vcfutils.pl <command> [<arguments>]\n
398 Command: subsam get a subset of samples
399 listsam list the samples
400 fillac fill the allele count field
401 qstats SNP stats stratified by QUAL
402 varFilter filtering short variants
403 hapmap2vcf convert the hapmap format to VCF
404 ucscsnp2vcf convert UCSC SNP SQL dump to VCF