13 &usage if (@ARGV < 1);
14 my $command = shift(@ARGV);
15 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
16 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats);
17 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
22 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
24 my $fn = shift(@ARGV);
26 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
27 $h{$_} = 1 for (@ARGV);
33 my @s = @t[0..8]; # all fixed fields + FORMAT
40 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
41 print join("\t", @s), "\n";
45 print join("\t", @t[0..7]), "\n";
47 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
55 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
59 print join("\n", @t[9..$#t]), "\n";
66 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
75 @_ = split(":", $t[8]);
77 if ($_[$_] eq 'GT') { $s = $_; last; }
80 print join("\t", @t), "\n";
84 if ($t[$_] =~ /^0,0,0/) {
85 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
90 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
92 $info =~ s/(;?)AC=(\d+)//;
93 $info =~ s/(;?)AN=(\d+)//;
100 print join("\t", @t), "\n";
107 getopts('t:', \%opts);
108 die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
109 my $cutoff = $opts{t};
110 my ($last, $lastchr) = (0x7fffffff, '');
111 my ($x, $y, $n) = (0, 0, 0);
113 if (/^([^#\s]+)\s(\d+)/) {
114 my ($chr, $pos) = ($1, $2);
115 if (/NEIR=([\d\.]+)/) {
117 ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
119 $last = $pos; $lastchr = $chr;
122 print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
123 print "Fraction: ", $y/$n, "\n";
124 print "Length: $x\n";
128 my %opts = (r=>'', s=>0.02, v=>undef);
129 getopts('r:s:v', \%opts);
130 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
131 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
132 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
134 my $is_vcf = defined($opts{v})? 1 : 0;
135 if ($opts{r}) { # read the reference positions
137 open($fh, $opts{r}) || die;
142 $h{$t[0],$t[1]} = $t[4];
144 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
149 my $hsize = scalar(keys %h);
154 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
155 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
156 my @s = split(',', $t[4]);
157 $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
158 next if (length($s[0]) != 1);
162 my $aa = $h{$t[0],$t[1]};
164 my @aaa = split(",", $aa);
166 $hit = 1 if ($_ eq $s[0]);
170 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
172 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
174 push(@a, [-1, 0, 0, 0]); # end marker
175 die("[qstats] No SNP data!\n") if (@a == 0);
176 @a = sort {$b->[0]<=>$a->[0]} @a;
179 my @c = (0, 0, 0, 0);
183 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
185 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
186 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
187 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
188 my $a = $c[1] - $lc[1];
189 my $b = $c[2] - $lc[2];
190 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
191 print join("\t", $last, @c, @x), "\n";
192 $next = $c[0]/@a + $opts{s};
193 $lc[1] = $c[1]; $lc[2] = $c[2];
195 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
201 my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4);
202 getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts);
204 Usage: vcfutils.pl varFilter [options] <in.vcf>
206 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
207 -d INT minimum read depth [$opts{d}]
208 -D INT maximum read depth [$opts{D}]
209 -a INT minimum number of alternate bases [$opts{a}]
210 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
211 -W INT window size for filtering adjacent gaps [$opts{W}]
212 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
213 -2 FLOAT min P-value for baseQ bias [$opts{2}]
214 -3 FLOAT min P-value for mapQ bias [$opts{3}]
215 -4 FLOAT min P-value for end distance bias [$opts{4}]
216 -p print filtered variants
218 Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
219 \n/) if (@ARGV == 0 && -t STDIN);
221 # calculate the window size
222 my ($ol, $ow) = ($opts{W}, $opts{w});
223 my $max_dist = $ol > $ow? $ol : $ow;
225 my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
231 next if ($t[4] eq '.'); # skip non-var sites
232 # check if the site is a SNP
234 if (length($t[3]) > 1) {
237 my @s = split(',', $t[4]);
239 $is_snp = 0 if (length > 1);
242 # clear the out-of-range elements
244 # Still on the same chromosome and the first element's window still affects this position?
245 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
246 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
250 my ($dp, $mq, $dp_alt) = (-1, -1, -1);
251 if ($t[7] =~ /DP=(\d+)/i) {
253 } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
254 $dp = $1 + $2 + $3 + $4;
257 $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
258 # the depth and mapQ filter
260 if ($dp < $opts{d}) {
262 } elsif ($dp > $opts{D}) {
266 $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
267 $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
268 $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
269 && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
271 # site dependent filters
272 my ($rlen, $indel_score) = (0, -1); # $indel_score<0 for SNPs
274 if (!$is_snp) { # an indel
275 $rlen = length($t[3]) - 1;
276 $indel_score = $t[5] * 100 + $dp_alt;
278 for my $x (@staging) {
279 next if ($x->[0] >= 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
282 # check the staging list for indel filtering
283 for my $x (@staging) {
284 next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
285 if ($x->[0] < $indel_score) {
292 for my $x (@staging) {
293 next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
299 push(@staging, [$indel_score, $flt, $rlen, @t]);
301 # output the last few elements in the staging list
303 varFilter_aux(shift @staging, $opts{p});
308 my ($first, $is_print) = @_;
309 if ($first->[1] == 0) {
310 print join("\t", @$first[3 .. @$first-1]), "\n";
311 } elsif ($is_print) {
312 print STDERR join("\t", substr("UQdDaGgP", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
317 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
318 print "##fileformat=VCFv4.0\n";
319 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
322 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
327 $t[9] = reverse($t[9]);
328 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
330 my @a = split("/", $t[9]);
332 push(@alt, $_) if ($_ ne $alt[0]);
338 $alt[$_] = "N$alt[$_]";
341 my $ref = shift(@alt);
342 my $af = $t[13] > 0? ";AF=$t[13]" : '';
343 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
344 my $info = "molType=$t[10];class=$t[11]$valid$af";
345 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
350 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
351 my $fn = shift(@ARGV);
353 warn("Parsing UCSC SNPs...\n");
355 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
358 next if ($t[3] - $t[2] != 1); # not SNP
359 @{$map{$t[4]}} = @t[1,3,7];
363 warn("Writing VCF...\n");
364 print "##fileformat=VCFv4.0\n";
367 if ($t[0] eq 'rs#') { # the first line
368 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
370 next unless ($map{$t[0]});
371 next if (length($t[1]) != 3); # skip non-SNPs
372 my $a = \@{$map{$t[0]}};
374 my @u = split('/', $t[1]);
376 $u[1] = $u[0]; $u[0] = $ref;
377 } elsif ($u[0] ne $ref) { next; }
380 $w{$u[0]} = 0; $w{$u[1]} = 1;
381 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
387 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
388 if (!defined($a[0]) || !defined($a[1])) {
392 push(@s, "$a[0]/$a[1]");
396 print join("\t", @s), "\n";
403 Usage: vcfutils.pl <command> [<arguments>]\n
404 Command: subsam get a subset of samples
405 listsam list the samples
406 fillac fill the allele count field
407 qstats SNP stats stratified by QUAL
408 varFilter filtering short variants
409 hapmap2vcf convert the hapmap format to VCF
410 ucscsnp2vcf convert UCSC SNP SQL dump to VCF