13 &usage if (@ARGV < 1);
14 my $command = shift(@ARGV);
15 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
16 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
17 gapstats=>\&gapstats);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
25 my $fn = shift(@ARGV);
27 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
28 $h{$_} = 1 for (@ARGV);
34 my @s = @t[0..8]; # all fixed fields + FORMAT
41 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
42 print join("\t", @s), "\n";
46 print join("\t", @t[0..7]), "\n";
48 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
56 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
60 print join("\n", @t[9..$#t]), "\n";
67 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
76 @_ = split(":", $t[8]);
78 if ($_[$_] eq 'GT') { $s = $_; last; }
81 print join("\t", @t), "\n";
85 if ($t[$_] =~ /^0,0,0/) {
86 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
91 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
93 $info =~ s/(;?)AC=(\d+)//;
94 $info =~ s/(;?)AN=(\d+)//;
101 print join("\t", @t), "\n";
108 getopts('t:', \%opts);
109 die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
110 my $cutoff = $opts{t};
111 my ($last, $lastchr) = (0x7fffffff, '');
112 my ($x, $y, $n) = (0, 0, 0);
114 if (/^([^#\s]+)\s(\d+)/) {
115 my ($chr, $pos) = ($1, $2);
116 if (/NEIR=([\d\.]+)/) {
118 ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
120 $last = $pos; $lastchr = $chr;
123 print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
124 print "Fraction: ", $y/$n, "\n";
125 print "Length: $x\n";
129 my %opts = (r=>'', s=>0.02, v=>undef);
130 getopts('r:s:v', \%opts);
131 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
132 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
133 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
135 my $is_vcf = defined($opts{v})? 1 : 0;
136 if ($opts{r}) { # read the reference positions
138 open($fh, $opts{r}) || die;
143 $h{$t[0],$t[1]} = $t[4];
145 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
150 my $hsize = scalar(keys %h);
155 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
156 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
157 my @s = split(',', $t[4]);
158 $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
159 next if (length($s[0]) != 1);
163 my $aa = $h{$t[0],$t[1]};
165 my @aaa = split(",", $aa);
167 $hit = 1 if ($_ eq $s[0]);
171 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
173 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
175 push(@a, [-1, 0, 0, 0]); # end marker
176 die("[qstats] No SNP data!\n") if (@a == 0);
177 @a = sort {$b->[0]<=>$a->[0]} @a;
180 my @c = (0, 0, 0, 0);
184 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
186 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
187 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
188 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
189 my $a = $c[1] - $lc[1];
190 my $b = $c[2] - $lc[2];
191 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
192 print join("\t", $last, @c, @x), "\n";
193 $next = $c[0]/@a + $opts{s};
194 $lc[1] = $c[1]; $lc[2] = $c[2];
196 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
202 my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4);
203 getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts);
205 Usage: vcfutils.pl varFilter [options] <in.vcf>
207 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
208 -d INT minimum read depth [$opts{d}]
209 -D INT maximum read depth [$opts{D}]
210 -a INT minimum number of alternate bases [$opts{a}]
211 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
212 -W INT window size for filtering adjacent gaps [$opts{W}]
213 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
214 -2 FLOAT min P-value for baseQ bias [$opts{2}]
215 -3 FLOAT min P-value for mapQ bias [$opts{3}]
216 -4 FLOAT min P-value for end distance bias [$opts{4}]
217 -p print filtered variants
219 Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
220 \n/) if (@ARGV == 0 && -t STDIN);
222 # calculate the window size
223 my ($ol, $ow) = ($opts{W}, $opts{w});
224 my $max_dist = $ol > $ow? $ol : $ow;
226 my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
232 next if ($t[4] eq '.'); # skip non-var sites
233 # check if the site is a SNP
235 if (length($t[3]) > 1) {
237 my @s = split(',', $t[4]);
239 $type = 3 if (length != length($t[3]));
242 my @s = split(',', $t[4]);
244 $type = 3 if (length > 1);
247 # clear the out-of-range elements
249 # Still on the same chromosome and the first element's window still affects this position?
250 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
251 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
255 my ($dp, $mq, $dp_alt) = (-1, -1, -1);
256 if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
257 $dp = $1 + $2 + $3 + $4;
260 if ($t[7] =~ /DP=(\d+)/i) {
263 $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
264 # the depth and mapQ filter
266 if ($dp < $opts{d}) {
268 } elsif ($dp > $opts{D}) {
272 $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
273 $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
274 $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
275 && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
277 my $score = $t[5] * 100 + $dp_alt;
278 my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
280 if ($type == 3) { # an indel
281 # filtering SNPs and MNPs
282 for my $x (@staging) {
283 next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
286 # check the staging list for indel filtering
287 for my $x (@staging) {
288 next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
289 if ($x->[0]>>2 < $score) {
295 } else { # SNP or MNP
296 for my $x (@staging) {
297 next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
302 for my $x (@staging) {
303 next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
304 if ($x->[0]>>2 < $score) {
312 push(@staging, [$score<<2|$type, $flt, $rlen, @t]);
314 # output the last few elements in the staging list
316 varFilter_aux(shift @staging, $opts{p});
321 my ($first, $is_print) = @_;
322 if ($first->[1] == 0) {
323 print join("\t", @$first[3 .. @$first-1]), "\n";
324 } elsif ($is_print) {
325 print STDERR join("\t", substr("UQdDaGgPM", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
331 $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
335 next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
336 my @s = split(',', $t[4]);
338 my $l = length($x) - length($t[3]) + 5000;
340 $l = -(length($x) - 1) + 5000;
341 } elsif ($x =~ /^\+/) {
342 $l = length($x) - 1 + 5000;
347 for (my $i = 0; $i < 10000; ++$i) {
348 next if ($c0[$i] == 0);
350 $c1[1] += $c0[$i] if (($i-5000)%3 == 0);
351 printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
353 printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]);
357 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
358 print "##fileformat=VCFv4.0\n";
359 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
362 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
367 $t[9] = reverse($t[9]);
368 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
370 my @a = split("/", $t[9]);
372 push(@alt, $_) if ($_ ne $alt[0]);
378 $alt[$_] = "N$alt[$_]";
381 my $ref = shift(@alt);
382 my $af = $t[13] > 0? ";AF=$t[13]" : '';
383 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
384 my $info = "molType=$t[10];class=$t[11]$valid$af";
385 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
390 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
391 my $fn = shift(@ARGV);
393 warn("Parsing UCSC SNPs...\n");
395 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
398 next if ($t[3] - $t[2] != 1); # not SNP
399 @{$map{$t[4]}} = @t[1,3,7];
403 warn("Writing VCF...\n");
404 print "##fileformat=VCFv4.0\n";
407 if ($t[0] eq 'rs#') { # the first line
408 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
410 next unless ($map{$t[0]});
411 next if (length($t[1]) != 3); # skip non-SNPs
412 my $a = \@{$map{$t[0]}};
414 my @u = split('/', $t[1]);
416 $u[1] = $u[0]; $u[0] = $ref;
417 } elsif ($u[0] ne $ref) { next; }
420 $w{$u[0]} = 0; $w{$u[1]} = 1;
421 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
427 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
428 if (!defined($a[0]) || !defined($a[1])) {
432 push(@s, "$a[0]/$a[1]");
436 print join("\t", @s), "\n";
443 Usage: vcfutils.pl <command> [<arguments>]\n
444 Command: subsam get a subset of samples
445 listsam list the samples
446 fillac fill the allele count field
447 qstats SNP stats stratified by QUAL
448 varFilter filtering short variants
449 hapmap2vcf convert the hapmap format to VCF
450 ucscsnp2vcf convert UCSC SNP SQL dump to VCF