13 &usage if (@ARGV < 1);
14 my $command = shift(@ARGV);
15 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
16 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
17 gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 my %opts = (l=>5000000);
24 getopts('l:', \%opts);
26 die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN);
30 for (my $i = 0; $i < $t[1];) {
31 my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l;
32 print "$t[0]:".($i+1)."-$e\n";
39 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
41 my $fn = shift(@ARGV);
43 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
44 $h{$_} = 1 for (@ARGV);
50 my @s = @t[0..8]; # all fixed fields + FORMAT
57 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
58 print join("\t", @s), "\n";
62 print join("\t", @t[0..7]), "\n";
64 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
72 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
76 print join("\n", @t[9..$#t]), "\n";
83 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
92 @_ = split(":", $t[8]);
94 if ($_[$_] eq 'GT') { $s = $_; last; }
97 print join("\t", @t), "\n";
101 if ($t[$_] =~ /^0,0,0/) {
102 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
107 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
109 $info =~ s/(;?)AC=(\d+)//;
110 $info =~ s/(;?)AN=(\d+)//;
117 print join("\t", @t), "\n";
124 getopts('t:', \%opts);
125 die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
126 my $cutoff = $opts{t};
127 my ($last, $lastchr) = (0x7fffffff, '');
128 my ($x, $y, $n) = (0, 0, 0);
130 if (/^([^#\s]+)\s(\d+)/) {
131 my ($chr, $pos) = ($1, $2);
132 if (/NEIR=([\d\.]+)/) {
134 ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
136 $last = $pos; $lastchr = $chr;
139 print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
140 print "Fraction: ", $y/$n, "\n";
141 print "Length: $x\n";
145 my %opts = (r=>'', s=>0.02, v=>undef);
146 getopts('r:s:v', \%opts);
147 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
148 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
149 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
151 my $is_vcf = defined($opts{v})? 1 : 0;
152 if ($opts{r}) { # read the reference positions
154 open($fh, $opts{r}) || die;
159 $h{$t[0],$t[1]} = $t[4];
161 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
166 my $hsize = scalar(keys %h);
171 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
172 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
173 my @s = split(',', $t[4]);
174 $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
175 next if (length($s[0]) != 1);
179 my $aa = $h{$t[0],$t[1]};
181 my @aaa = split(",", $aa);
183 $hit = 1 if ($_ eq $s[0]);
187 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
189 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
191 push(@a, [-1, 0, 0, 0]); # end marker
192 die("[qstats] No SNP data!\n") if (@a == 0);
193 @a = sort {$b->[0]<=>$a->[0]} @a;
196 my @c = (0, 0, 0, 0);
200 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
202 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
203 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
204 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
205 my $a = $c[1] - $lc[1];
206 my $b = $c[2] - $lc[2];
207 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
208 print join("\t", $last, @c, @x), "\n";
209 $next = $c[0]/@a + $opts{s};
210 $lc[1] = $c[1]; $lc[2] = $c[2];
212 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
218 my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000);
219 getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:', \%opts);
221 Usage: vcfutils.pl varFilter [options] <in.vcf>
223 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
224 -d INT minimum read depth [$opts{d}]
225 -D INT maximum read depth [$opts{D}]
226 -a INT minimum number of alternate bases [$opts{a}]
227 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
228 -W INT window size for filtering adjacent gaps [$opts{W}]
229 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
230 -2 FLOAT min P-value for baseQ bias [$opts{2}]
231 -3 FLOAT min P-value for mapQ bias [$opts{3}]
232 -4 FLOAT min P-value for end distance bias [$opts{4}]
233 -p print filtered variants
235 Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
236 \n/) if (@ARGV == 0 && -t STDIN);
238 # calculate the window size
239 my ($ol, $ow) = ($opts{W}, $opts{w});
240 my $max_dist = $ol > $ow? $ol : $ow;
242 my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
248 next if ($t[4] eq '.'); # skip non-var sites
249 next if ($t[3] eq 'N'); # skip sites with unknown ref ('N')
250 # check if the site is a SNP
252 if (length($t[3]) > 1) {
254 my @s = split(',', $t[4]);
256 $type = 3 if (length != length($t[3]));
259 my @s = split(',', $t[4]);
261 $type = 3 if (length > 1);
264 # clear the out-of-range elements
266 # Still on the same chromosome and the first element's window still affects this position?
267 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
268 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
272 my ($dp, $mq, $dp_alt) = (-1, -1, -1);
273 if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
274 $dp = $1 + $2 + $3 + $4;
277 if ($t[7] =~ /DP=(\d+)/i) {
280 $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
281 # the depth and mapQ filter
283 if ($dp < $opts{d}) {
285 } elsif ($dp > $opts{D}) {
289 $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
290 $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
291 $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
292 && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
293 $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S})));
295 my $score = $t[5] * 100 + $dp_alt;
296 my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
298 if ($type == 3) { # an indel
299 # filtering SNPs and MNPs
300 for my $x (@staging) {
301 next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
304 # check the staging list for indel filtering
305 for my $x (@staging) {
306 next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
307 if ($x->[0]>>2 < $score) {
313 } else { # SNP or MNP
314 for my $x (@staging) {
315 next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
316 if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1)
317 && length($x->[7]) - length($x->[6]) == 1) {
323 for my $x (@staging) {
324 next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
325 if ($x->[0]>>2 < $score) {
333 push(@staging, [$score<<2|$type, $flt, $rlen, @t]);
335 # output the last few elements in the staging list
337 varFilter_aux(shift @staging, $opts{p});
342 my ($first, $is_print) = @_;
343 if ($first->[1] == 0) {
344 print join("\t", @$first[3 .. @$first-1]), "\n";
345 } elsif ($is_print) {
346 print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
352 $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
356 next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
357 my @s = split(',', $t[4]);
359 my $l = length($x) - length($t[3]) + 5000;
361 $l = -(length($x) - 1) + 5000;
362 } elsif ($x =~ /^\+/) {
363 $l = length($x) - 1 + 5000;
368 for (my $i = 0; $i < 10000; ++$i) {
369 next if ($c0[$i] == 0);
371 $c1[1] += $c0[$i] if (($i-5000)%3 == 0);
372 printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
374 printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]);
378 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
379 print "##fileformat=VCFv4.0\n";
380 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
383 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
388 $t[9] = reverse($t[9]);
389 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
391 my @a = split("/", $t[9]);
393 push(@alt, $_) if ($_ ne $alt[0]);
399 $alt[$_] = "N$alt[$_]";
402 my $ref = shift(@alt);
403 my $af = $t[13] > 0? ";AF=$t[13]" : '';
404 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
405 my $info = "molType=$t[10];class=$t[11]$valid$af";
406 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
411 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
412 my $fn = shift(@ARGV);
414 warn("Parsing UCSC SNPs...\n");
416 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
419 next if ($t[3] - $t[2] != 1); # not SNP
420 @{$map{$t[4]}} = @t[1,3,7];
424 warn("Writing VCF...\n");
425 print "##fileformat=VCFv4.0\n";
428 if ($t[0] eq 'rs#') { # the first line
429 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
431 next unless ($map{$t[0]});
432 next if (length($t[1]) != 3); # skip non-SNPs
433 my $a = \@{$map{$t[0]}};
435 my @u = split('/', $t[1]);
437 $u[1] = $u[0]; $u[0] = $ref;
438 } elsif ($u[0] ne $ref) { next; }
441 $w{$u[0]} = 0; $w{$u[1]} = 1;
442 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
448 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
449 if (!defined($a[0]) || !defined($a[1])) {
453 push(@s, "$a[0]/$a[1]");
457 print join("\t", @s), "\n";
463 my %opts = (d=>3, D=>100000, Q=>10, l=>5);
464 getopts('d:D:Q:l:', \%opts);
466 Usage: vcfutils.pl vcf2fq [options] <all-site.vcf>
468 Options: -d INT minimum depth [$opts{d}]
469 -D INT maximum depth [$opts{D}]
470 -Q INT min RMS mapQ [$opts{Q}]
471 -l INT INDEL filtering window [$opts{l}]
472 \n/) if (@ARGV == 0 && -t STDIN);
474 my ($last_chr, $seq, $qual, $last_pos, @gaps);
479 my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
480 GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K');
486 if ($last_chr ne $t[0]) {
487 &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
488 ($last_chr, $last_pos) = ($t[0], 0);
492 die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);
493 if ($t[1] - $last_pos > 1) {
494 $seq .= 'n' x ($t[1] - $last_pos - 1);
495 $qual .= '!' x ($t[1] - $last_pos - 1);
497 if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
498 my ($ref, $alt) = ($t[3], $1);
500 $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/);
502 $_ = $1 if ($t[7] =~ /AF1=([\d\.]+)/);
503 $b = ($_ < .5 || $alt eq '.')? $ref : $alt;
506 $b = $het{"$ref$alt"};
510 $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D));
511 $q = int($q + 33 + .499);
512 $q = chr($q <= 126? $q : 126);
515 } elsif ($t[4] ne '.') { # an INDEL
516 push(@gaps, [$t[1], length($t[3])]);
520 &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l});
523 sub v2q_post_process {
524 my ($chr, $seq, $qual, $gaps, $l) = @_;
526 my $beg = $g->[0] > $l? $g->[0] - $l : 0;
527 my $end = $g->[0] + $g->[1] + $l;
528 $end = length($$seq) if ($end > length($$seq));
529 substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg));
531 print "\@$chr\n"; &v2q_print_str($seq);
532 print "+\n"; &v2q_print_str($qual);
538 for (my $i = 0; $i < $l; $i += 60) {
539 print substr($$s, $i, 60), "\n";
545 Usage: vcfutils.pl <command> [<arguments>]\n
546 Command: subsam get a subset of samples
547 listsam list the samples
548 fillac fill the allele count field
549 qstats SNP stats stratified by QUAL
551 hapmap2vcf convert the hapmap format to VCF
552 ucscsnp2vcf convert UCSC SNP SQL dump to VCF
554 varFilter filtering short variants (*)
555 vcf2fq VCF->fastq (**)
557 Notes: Commands with description endting with (*) may need bcftools
558 specific annotations.