13 &usage if (@ARGV < 1);
14 my $command = shift(@ARGV);
15 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
16 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
17 gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 my %opts = (l=>5000000);
24 getopts('l:', \%opts);
26 die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN);
30 for (my $i = 0; $i < $t[1];) {
31 my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l;
32 print "$t[0]:".($i+1)."-$e\n";
39 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
41 my $fn = shift(@ARGV);
43 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
44 $h{$_} = 1 for (@ARGV);
50 my @s = @t[0..8]; # all fixed fields + FORMAT
57 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
58 print join("\t", @s), "\n";
62 print join("\t", @t[0..7]), "\n";
64 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
72 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
76 print join("\n", @t[9..$#t]), "\n";
83 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
92 @_ = split(":", $t[8]);
94 if ($_[$_] eq 'GT') { $s = $_; last; }
97 print join("\t", @t), "\n";
101 if ($t[$_] =~ /^0,0,0/) {
102 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
107 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
109 $info =~ s/(;?)AC=(\d+)//;
110 $info =~ s/(;?)AN=(\d+)//;
117 print join("\t", @t), "\n";
124 getopts('t:', \%opts);
125 die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
126 my $cutoff = $opts{t};
127 my ($last, $lastchr) = (0x7fffffff, '');
128 my ($x, $y, $n) = (0, 0, 0);
130 if (/^([^#\s]+)\s(\d+)/) {
131 my ($chr, $pos) = ($1, $2);
132 if (/NEIR=([\d\.]+)/) {
134 ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
136 $last = $pos; $lastchr = $chr;
139 print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
140 print "Fraction: ", $y/$n, "\n";
141 print "Length: $x\n";
145 my %opts = (r=>'', s=>0.02, v=>undef);
146 getopts('r:s:v', \%opts);
147 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
148 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
149 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
151 my $is_vcf = defined($opts{v})? 1 : 0;
152 if ($opts{r}) { # read the reference positions
154 open($fh, $opts{r}) || die;
159 $h{$t[0],$t[1]} = $t[4];
161 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
166 my $hsize = scalar(keys %h);
171 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
172 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
173 my @s = split(',', $t[4]);
174 $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
175 next if (length($s[0]) != 1);
179 my $aa = $h{$t[0],$t[1]};
181 my @aaa = split(",", $aa);
183 $hit = 1 if ($_ eq $s[0]);
187 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
189 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
191 push(@a, [-1, 0, 0, 0]); # end marker
192 die("[qstats] No SNP data!\n") if (@a == 0);
193 @a = sort {$b->[0]<=>$a->[0]} @a;
196 my @c = (0, 0, 0, 0);
200 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
202 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
203 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
204 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
205 my $a = $c[1] - $lc[1];
206 my $b = $c[2] - $lc[2];
207 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
208 print join("\t", $last, @c, @x), "\n";
209 $next = $c[0]/@a + $opts{s};
210 $lc[1] = $c[1]; $lc[2] = $c[2];
212 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
218 my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4);
219 getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts);
221 Usage: vcfutils.pl varFilter [options] <in.vcf>
223 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
224 -d INT minimum read depth [$opts{d}]
225 -D INT maximum read depth [$opts{D}]
226 -a INT minimum number of alternate bases [$opts{a}]
227 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
228 -W INT window size for filtering adjacent gaps [$opts{W}]
229 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
230 -2 FLOAT min P-value for baseQ bias [$opts{2}]
231 -3 FLOAT min P-value for mapQ bias [$opts{3}]
232 -4 FLOAT min P-value for end distance bias [$opts{4}]
233 -p print filtered variants
235 Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
236 \n/) if (@ARGV == 0 && -t STDIN);
238 # calculate the window size
239 my ($ol, $ow) = ($opts{W}, $opts{w});
240 my $max_dist = $ol > $ow? $ol : $ow;
242 my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
248 next if ($t[4] eq '.'); # skip non-var sites
249 # check if the site is a SNP
251 if (length($t[3]) > 1) {
253 my @s = split(',', $t[4]);
255 $type = 3 if (length != length($t[3]));
258 my @s = split(',', $t[4]);
260 $type = 3 if (length > 1);
263 # clear the out-of-range elements
265 # Still on the same chromosome and the first element's window still affects this position?
266 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
267 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
271 my ($dp, $mq, $dp_alt) = (-1, -1, -1);
272 if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
273 $dp = $1 + $2 + $3 + $4;
276 if ($t[7] =~ /DP=(\d+)/i) {
279 $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
280 # the depth and mapQ filter
282 if ($dp < $opts{d}) {
284 } elsif ($dp > $opts{D}) {
288 $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
289 $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
290 $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
291 && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
293 my $score = $t[5] * 100 + $dp_alt;
294 my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
296 if ($type == 3) { # an indel
297 # filtering SNPs and MNPs
298 for my $x (@staging) {
299 next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
302 # check the staging list for indel filtering
303 for my $x (@staging) {
304 next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
305 if ($x->[0]>>2 < $score) {
311 } else { # SNP or MNP
312 for my $x (@staging) {
313 next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
318 for my $x (@staging) {
319 next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
320 if ($x->[0]>>2 < $score) {
328 push(@staging, [$score<<2|$type, $flt, $rlen, @t]);
330 # output the last few elements in the staging list
332 varFilter_aux(shift @staging, $opts{p});
337 my ($first, $is_print) = @_;
338 if ($first->[1] == 0) {
339 print join("\t", @$first[3 .. @$first-1]), "\n";
340 } elsif ($is_print) {
341 print STDERR join("\t", substr("UQdDaGgPM", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
347 $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
351 next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
352 my @s = split(',', $t[4]);
354 my $l = length($x) - length($t[3]) + 5000;
356 $l = -(length($x) - 1) + 5000;
357 } elsif ($x =~ /^\+/) {
358 $l = length($x) - 1 + 5000;
363 for (my $i = 0; $i < 10000; ++$i) {
364 next if ($c0[$i] == 0);
366 $c1[1] += $c0[$i] if (($i-5000)%3 == 0);
367 printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
369 printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]);
373 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
374 print "##fileformat=VCFv4.0\n";
375 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
378 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
383 $t[9] = reverse($t[9]);
384 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
386 my @a = split("/", $t[9]);
388 push(@alt, $_) if ($_ ne $alt[0]);
394 $alt[$_] = "N$alt[$_]";
397 my $ref = shift(@alt);
398 my $af = $t[13] > 0? ";AF=$t[13]" : '';
399 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
400 my $info = "molType=$t[10];class=$t[11]$valid$af";
401 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
406 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
407 my $fn = shift(@ARGV);
409 warn("Parsing UCSC SNPs...\n");
411 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
414 next if ($t[3] - $t[2] != 1); # not SNP
415 @{$map{$t[4]}} = @t[1,3,7];
419 warn("Writing VCF...\n");
420 print "##fileformat=VCFv4.0\n";
423 if ($t[0] eq 'rs#') { # the first line
424 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
426 next unless ($map{$t[0]});
427 next if (length($t[1]) != 3); # skip non-SNPs
428 my $a = \@{$map{$t[0]}};
430 my @u = split('/', $t[1]);
432 $u[1] = $u[0]; $u[0] = $ref;
433 } elsif ($u[0] ne $ref) { next; }
436 $w{$u[0]} = 0; $w{$u[1]} = 1;
437 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
443 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
444 if (!defined($a[0]) || !defined($a[1])) {
448 push(@s, "$a[0]/$a[1]");
452 print join("\t", @s), "\n";
458 my %opts = (d=>3, D=>100000, Q=>10, l=>5);
459 getopts('d:D:Q:l:', \%opts);
461 Usage: vcfutils.pl vcf2fq [options] <all-site.vcf>
463 Options: -d INT minimum depth [$opts{d}]
464 -D INT maximum depth [$opts{D}]
465 -Q INT min RMS mapQ [$opts{Q}]
466 -l INT INDEL filtering window [$opts{l}]
467 \n/) if (@ARGV == 0 && -t STDIN);
469 my ($last_chr, $seq, $qual, $last_pos, @gaps);
474 my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
475 GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K');
481 if ($last_chr ne $t[0]) {
482 &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
483 ($last_chr, $last_pos) = ($t[0], 0);
487 die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);
488 if ($t[1] - $last_pos > 1) {
489 $seq .= 'n' x ($t[1] - $last_pos - 1);
490 $qual .= '!' x ($t[1] - $last_pos - 1);
492 if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
493 my ($ref, $alt) = ($t[3], $1);
495 $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/);
497 $_ = $1 if ($t[7] =~ /AF1=([\d\.]+)/);
498 $b = ($_ < .5 || $alt eq '.')? $ref : $alt;
501 $b = $het{"$ref$alt"};
505 $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D));
506 $q = int($q + 33 + .499);
507 $q = chr($q <= 126? $q : 126);
511 push(@gaps, [$t[1], length($t[3])]);
515 &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l});
518 sub v2q_post_process {
519 my ($chr, $seq, $qual, $gaps, $l) = @_;
521 my $beg = $g->[0] > $l? $g->[0] - $l : 0;
522 my $end = $g->[0] + $g->[1] + $l;
523 $end = length($$seq) if ($end > length($$seq));
524 substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg));
526 print "\@$chr\n"; &v2q_print_str($seq);
527 print "+\n"; &v2q_print_str($qual);
533 for (my $i = 0; $i < $l; $i += 60) {
534 print substr($$s, $i, 60), "\n";
540 Usage: vcfutils.pl <command> [<arguments>]\n
541 Command: subsam get a subset of samples
542 listsam list the samples
543 fillac fill the allele count field
544 qstats SNP stats stratified by QUAL
546 hapmap2vcf convert the hapmap format to VCF
547 ucscsnp2vcf convert UCSC SNP SQL dump to VCF
549 varFilter filtering short variants (*)
550 vcf2fq VCF->fastq (**)
552 Notes: Commands with description endting with (*) may need bcftools
553 specific annotations.