13 &usage if (@ARGV < 1);
14 my $command = shift(@ARGV);
15 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
16 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
17 gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 my %opts = (l=>5000000);
24 getopts('l:', \%opts);
26 die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN);
30 for (my $i = 0; $i < $t[1];) {
31 my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l;
32 print "$t[0]:".($i+1)."-$e\n";
39 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
41 my $fn = shift(@ARGV);
43 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
44 $h{$_} = 1 for (@ARGV);
50 my @s = @t[0..8]; # all fixed fields + FORMAT
57 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
58 print join("\t", @s), "\n";
62 print join("\t", @t[0..7]), "\n";
64 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
72 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
76 print join("\n", @t[9..$#t]), "\n";
83 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
92 @_ = split(":", $t[8]);
94 if ($_[$_] eq 'GT') { $s = $_; last; }
97 print join("\t", @t), "\n";
101 if ($t[$_] =~ /^0,0,0/) {
102 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
107 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
109 $info =~ s/(;?)AC=(\d+)//;
110 $info =~ s/(;?)AN=(\d+)//;
117 print join("\t", @t), "\n";
124 getopts('t:', \%opts);
125 die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
126 my $cutoff = $opts{t};
127 my ($last, $lastchr) = (0x7fffffff, '');
128 my ($x, $y, $n) = (0, 0, 0);
130 if (/^([^#\s]+)\s(\d+)/) {
131 my ($chr, $pos) = ($1, $2);
132 if (/NEIR=([\d\.]+)/) {
134 ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
136 $last = $pos; $lastchr = $chr;
139 print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
140 print "Fraction: ", $y/$n, "\n";
141 print "Length: $x\n";
145 my %opts = (r=>'', s=>0.02, v=>undef);
146 getopts('r:s:v', \%opts);
147 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
148 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
149 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
151 my $is_vcf = defined($opts{v})? 1 : 0;
152 if ($opts{r}) { # read the reference positions
154 open($fh, $opts{r}) || die;
159 $h{$t[0],$t[1]} = $t[4];
161 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
166 my $hsize = scalar(keys %h);
171 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
172 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
173 my @s = split(',', $t[4]);
174 $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
175 next if (length($s[0]) != 1);
179 my $aa = $h{$t[0],$t[1]};
181 my @aaa = split(",", $aa);
183 $hit = 1 if ($_ eq $s[0]);
187 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
189 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
191 push(@a, [-1, 0, 0, 0]); # end marker
192 die("[qstats] No SNP data!\n") if (@a == 0);
193 @a = sort {$b->[0]<=>$a->[0]} @a;
196 my @c = (0, 0, 0, 0);
200 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
202 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
203 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
204 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
205 my $a = $c[1] - $lc[1];
206 my $b = $c[2] - $lc[2];
207 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
208 print join("\t", $last, @c, @x), "\n";
209 $next = $c[0]/@a + $opts{s};
210 $lc[1] = $c[1]; $lc[2] = $c[2];
212 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
218 my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000);
219 getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:', \%opts);
221 Usage: vcfutils.pl varFilter [options] <in.vcf>
223 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
224 -d INT minimum read depth [$opts{d}]
225 -D INT maximum read depth [$opts{D}]
226 -a INT minimum number of alternate bases [$opts{a}]
227 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
228 -W INT window size for filtering adjacent gaps [$opts{W}]
229 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
230 -2 FLOAT min P-value for baseQ bias [$opts{2}]
231 -3 FLOAT min P-value for mapQ bias [$opts{3}]
232 -4 FLOAT min P-value for end distance bias [$opts{4}]
233 -p print filtered variants
235 Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
236 \n/) if (@ARGV == 0 && -t STDIN);
238 # calculate the window size
239 my ($ol, $ow) = ($opts{W}, $opts{w});
240 my $max_dist = $ol > $ow? $ol : $ow;
242 my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
248 next if ($t[4] eq '.'); # skip non-var sites
249 # check if the site is a SNP
251 if (length($t[3]) > 1) {
253 my @s = split(',', $t[4]);
255 $type = 3 if (length != length($t[3]));
258 my @s = split(',', $t[4]);
260 $type = 3 if (length > 1);
263 # clear the out-of-range elements
265 # Still on the same chromosome and the first element's window still affects this position?
266 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
267 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
271 my ($dp, $mq, $dp_alt) = (-1, -1, -1);
272 if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
273 $dp = $1 + $2 + $3 + $4;
276 if ($t[7] =~ /DP=(\d+)/i) {
279 $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
280 # the depth and mapQ filter
282 if ($dp < $opts{d}) {
284 } elsif ($dp > $opts{D}) {
288 $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
289 $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
290 $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
291 && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
292 $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S})));
294 my $score = $t[5] * 100 + $dp_alt;
295 my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
297 if ($type == 3) { # an indel
298 # filtering SNPs and MNPs
299 for my $x (@staging) {
300 next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
303 # check the staging list for indel filtering
304 for my $x (@staging) {
305 next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
306 if ($x->[0]>>2 < $score) {
312 } else { # SNP or MNP
313 for my $x (@staging) {
314 next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
315 if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1)
316 && length($x->[7]) - length($x->[6]) == 1) {
322 for my $x (@staging) {
323 next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
324 if ($x->[0]>>2 < $score) {
332 push(@staging, [$score<<2|$type, $flt, $rlen, @t]);
334 # output the last few elements in the staging list
336 varFilter_aux(shift @staging, $opts{p});
341 my ($first, $is_print) = @_;
342 if ($first->[1] == 0) {
343 print join("\t", @$first[3 .. @$first-1]), "\n";
344 } elsif ($is_print) {
345 print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
351 $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
355 next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
356 my @s = split(',', $t[4]);
358 my $l = length($x) - length($t[3]) + 5000;
360 $l = -(length($x) - 1) + 5000;
361 } elsif ($x =~ /^\+/) {
362 $l = length($x) - 1 + 5000;
367 for (my $i = 0; $i < 10000; ++$i) {
368 next if ($c0[$i] == 0);
370 $c1[1] += $c0[$i] if (($i-5000)%3 == 0);
371 printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
373 printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]);
377 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
378 print "##fileformat=VCFv4.0\n";
379 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
382 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
387 $t[9] = reverse($t[9]);
388 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
390 my @a = split("/", $t[9]);
392 push(@alt, $_) if ($_ ne $alt[0]);
398 $alt[$_] = "N$alt[$_]";
401 my $ref = shift(@alt);
402 my $af = $t[13] > 0? ";AF=$t[13]" : '';
403 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
404 my $info = "molType=$t[10];class=$t[11]$valid$af";
405 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
410 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
411 my $fn = shift(@ARGV);
413 warn("Parsing UCSC SNPs...\n");
415 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
418 next if ($t[3] - $t[2] != 1); # not SNP
419 @{$map{$t[4]}} = @t[1,3,7];
423 warn("Writing VCF...\n");
424 print "##fileformat=VCFv4.0\n";
427 if ($t[0] eq 'rs#') { # the first line
428 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
430 next unless ($map{$t[0]});
431 next if (length($t[1]) != 3); # skip non-SNPs
432 my $a = \@{$map{$t[0]}};
434 my @u = split('/', $t[1]);
436 $u[1] = $u[0]; $u[0] = $ref;
437 } elsif ($u[0] ne $ref) { next; }
440 $w{$u[0]} = 0; $w{$u[1]} = 1;
441 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
447 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
448 if (!defined($a[0]) || !defined($a[1])) {
452 push(@s, "$a[0]/$a[1]");
456 print join("\t", @s), "\n";
462 my %opts = (d=>3, D=>100000, Q=>10, l=>5);
463 getopts('d:D:Q:l:', \%opts);
465 Usage: vcfutils.pl vcf2fq [options] <all-site.vcf>
467 Options: -d INT minimum depth [$opts{d}]
468 -D INT maximum depth [$opts{D}]
469 -Q INT min RMS mapQ [$opts{Q}]
470 -l INT INDEL filtering window [$opts{l}]
471 \n/) if (@ARGV == 0 && -t STDIN);
473 my ($last_chr, $seq, $qual, $last_pos, @gaps);
478 my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
479 GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K');
485 if ($last_chr ne $t[0]) {
486 &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
487 ($last_chr, $last_pos) = ($t[0], 0);
491 die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);
492 if ($t[1] - $last_pos > 1) {
493 $seq .= 'n' x ($t[1] - $last_pos - 1);
494 $qual .= '!' x ($t[1] - $last_pos - 1);
496 if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
497 my ($ref, $alt) = ($t[3], $1);
499 $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/);
501 $_ = $1 if ($t[7] =~ /AF1=([\d\.]+)/);
502 $b = ($_ < .5 || $alt eq '.')? $ref : $alt;
505 $b = $het{"$ref$alt"};
509 $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D));
510 $q = int($q + 33 + .499);
511 $q = chr($q <= 126? $q : 126);
515 push(@gaps, [$t[1], length($t[3])]);
519 &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l});
522 sub v2q_post_process {
523 my ($chr, $seq, $qual, $gaps, $l) = @_;
525 my $beg = $g->[0] > $l? $g->[0] - $l : 0;
526 my $end = $g->[0] + $g->[1] + $l;
527 $end = length($$seq) if ($end > length($$seq));
528 substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg));
530 print "\@$chr\n"; &v2q_print_str($seq);
531 print "+\n"; &v2q_print_str($qual);
537 for (my $i = 0; $i < $l; $i += 60) {
538 print substr($$s, $i, 60), "\n";
544 Usage: vcfutils.pl <command> [<arguments>]\n
545 Command: subsam get a subset of samples
546 listsam list the samples
547 fillac fill the allele count field
548 qstats SNP stats stratified by QUAL
550 hapmap2vcf convert the hapmap format to VCF
551 ucscsnp2vcf convert UCSC SNP SQL dump to VCF
553 varFilter filtering short variants (*)
554 vcf2fq VCF->fastq (**)
556 Notes: Commands with description endting with (*) may need bcftools
557 specific annotations.