13 my $version = '0.1.0';
14 &usage if (@ARGV < 1);
15 my $command = shift(@ARGV);
16 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
17 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&filter4vcf, ldstats=>\&ldstats);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
25 my $fn = shift(@ARGV);
27 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
28 $h{$_} = 1 for (@ARGV);
34 my @s = @t[0..8]; # all fixed fields + FORMAT
41 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
42 print join("\t", @s), "\n";
46 print join("\t", @t[0..7]), "\n";
48 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
56 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
60 print join("\n", @t[9..$#t]), "\n";
67 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
76 @_ = split(":", $t[8]);
78 if ($_[$_] eq 'GT') { $s = $_; last; }
81 print join("\t", @t), "\n";
85 if ($t[$_] =~ /^0,0,0/) {
86 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
91 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
93 $info =~ s/(;?)AC=(\d+)//;
94 $info =~ s/(;?)AN=(\d+)//;
101 print join("\t", @t), "\n";
108 getopts('t:', \%opts);
109 die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
110 my $cutoff = $opts{t};
111 my ($last, $lastchr) = (0x7fffffff, '');
112 my ($x, $y, $n) = (0, 0, 0);
114 if (/^([^#\s]+)\s(\d+)/) {
115 my ($chr, $pos) = ($1, $2);
116 if (/NEIR=([\d\.]+)/) {
118 ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
120 $last = $pos; $lastchr = $chr;
123 print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
124 print "Fraction: ", $y/$n, "\n";
125 print "Length: $x\n";
129 my %opts = (r=>'', s=>0.02, v=>undef);
130 getopts('r:s:v', \%opts);
131 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
132 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
133 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
135 my $is_vcf = defined($opts{v})? 1 : 0;
136 if ($opts{r}) { # read the reference positions
138 open($fh, $opts{r}) || die;
143 $h{$t[0],$t[1]} = $t[4];
145 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
150 my $hsize = scalar(keys %h);
155 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
156 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
157 my @s = split(',', $t[4]);
158 $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
159 next if (length($s[0]) != 1);
163 my $aa = $h{$t[0],$t[1]};
165 my @aaa = split(",", $aa);
167 $hit = 1 if ($_ eq $s[0]);
171 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
173 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
175 push(@a, [-1, 0, 0, 0]); # end marker
176 die("[qstats] No SNP data!\n") if (@a == 0);
177 @a = sort {$b->[0]<=>$a->[0]} @a;
180 my @c = (0, 0, 0, 0);
184 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
186 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
187 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
188 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
189 my $a = $c[1] - $lc[1];
190 my $b = $c[2] - $lc[2];
191 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
192 print join("\t", $last, @c, @x), "\n";
193 $next = $c[0]/@a + $opts{s};
194 $lc[1] = $c[1]; $lc[2] = $c[2];
196 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
202 my %opts = (d=>1, D=>10000, l=>10, Q=>10, s=>100, w=>10, p=>undef, F=>.001);
203 getopts('pd:D:l:Q:w:G:F:', \%opts);
205 Usage: vcfutils.pl varFilter [options] <in.vcf>
207 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
208 -d INT minimum read depth [$opts{d}]
209 -D INT maximum read depth [$opts{D}]
210 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
211 -l INT window size for filtering adjacent gaps [$opts{l}]
212 -p print filtered variants
213 \n/) if (@ARGV == 0 && -t STDIN);
215 # calculate the window size
216 my ($ol, $ow) = ($opts{l}, $opts{w});
217 my $max_dist = $ol > $ow? $ol : $ow;
219 my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
223 next if ($t[4] eq '.'); # skip non-var sites
225 if (length($t[3]) > 1) {
228 my @s = split(',', $t[4]);
230 $is_snp = 0 if (length > 1);
233 # clear the out-of-range elements
235 # Still on the same chromosome and the first element's window still affects this position?
236 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
237 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
242 my ($dp, $mq, $dp_alt) = (-1, -1, -1);
243 if ($t[7] =~ /DP=(\d+)/i) {
245 } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
246 $dp = $1 + $2 + $3 + $4;
249 $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
250 # the depth and mapQ filter
252 if ($dp < $opts{d}) {
254 } elsif ($dp > $opts{D}) {
258 $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
260 # site dependent filters
261 my ($rlen, $indel_score) = (0, -1); # $indel_score<0 for SNPs
263 if (!$is_snp) { # an indel
264 $rlen = length($t[3]) - 1;
265 $indel_score = $t[5] * 100 + $dp_alt;
267 for my $x (@staging) {
268 next if ($x->[0] >= 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
271 # check the staging list for indel filtering
272 for my $x (@staging) {
273 next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
274 if ($x->[0] < $indel_score) {
281 for my $x (@staging) {
282 next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
288 push(@staging, [$indel_score, $flt, $rlen, @t]);
290 # output the last few elements in the staging list
292 varFilter_aux(shift @staging, $opts{p});
297 my ($first, $is_print) = @_;
298 if ($first->[1] == 0) {
299 print join("\t", @$first[3 .. @$first-1]), "\n";
300 } elsif ($is_print) {
301 print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
306 my %opts = (d=>3, D=>2000, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, Q=>10, q=>3);
307 getopts('d:D:1:2:3:4:Q:q:', \%opts);
309 Usage: vcfutils.pl filter4vcf [options] <in.vcf>
311 Options: -d INT min total depth (given DP or DP4) [$opts{d}]
312 -D INT max total depth [$opts{D}]
313 -q INT min SNP quality [$opts{q}]
314 -Q INT min RMS mapQ (given MQ) [$opts{Q}]
315 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
316 -2 FLOAT min P-value for baseQ bias [$opts{2}]
317 -3 FLOAT min P-value for mapQ bias [$opts{3}]
318 -4 FLOAT min P-value for end distance bias [$opts{4}]\n
319 /) if (@ARGV == 0 && -t STDIN);
321 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
329 next if (/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
331 $depth = $1 if (/DP=(\d+)/);
332 $depth = $1+$2+$3+$4 if (/DP4=(\d+),(\d+),(\d+),(\d+)/);
333 next if ($depth > 0 && ($depth < $opts{d} || $depth > $opts{D}));
334 next if (/MQ=(\d+)/ && $1 < $opts{Q});
336 next if ($t[5] >= 0 && $t[5] < $opts{q});
338 my @s = split(',', $t[4]);
339 ++$n[1] if ($ts{$t[3].$s[0]});
345 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
346 print "##fileformat=VCFv4.0\n";
347 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
350 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
355 $t[9] = reverse($t[9]);
356 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
358 my @a = split("/", $t[9]);
360 push(@alt, $_) if ($_ ne $alt[0]);
366 $alt[$_] = "N$alt[$_]";
369 my $ref = shift(@alt);
370 my $af = $t[13] > 0? ";AF=$t[13]" : '';
371 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
372 my $info = "molType=$t[10];class=$t[11]$valid$af";
373 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
378 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
379 my $fn = shift(@ARGV);
381 warn("Parsing UCSC SNPs...\n");
383 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
386 next if ($t[3] - $t[2] != 1); # not SNP
387 @{$map{$t[4]}} = @t[1,3,7];
391 warn("Writing VCF...\n");
392 print "##fileformat=VCFv4.0\n";
395 if ($t[0] eq 'rs#') { # the first line
396 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
398 next unless ($map{$t[0]});
399 next if (length($t[1]) != 3); # skip non-SNPs
400 my $a = \@{$map{$t[0]}};
402 my @u = split('/', $t[1]);
404 $u[1] = $u[0]; $u[0] = $ref;
405 } elsif ($u[0] ne $ref) { next; }
408 $w{$u[0]} = 0; $w{$u[1]} = 1;
409 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
415 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
416 if (!defined($a[0]) || !defined($a[1])) {
420 push(@s, "$a[0]/$a[1]");
424 print join("\t", @s), "\n";
431 Usage: vcfutils.pl <command> [<arguments>]\n
432 Command: subsam get a subset of samples
433 listsam list the samples
434 fillac fill the allele count field
435 qstats SNP stats stratified by QUAL
436 varFilter filtering short variants
437 filter4vcf filtering VCFs produced by samtools+bcftools
438 hapmap2vcf convert the hapmap format to VCF
439 ucscsnp2vcf convert UCSC SNP SQL dump to VCF