13 my $version = '0.1.0';
14 &usage if (@ARGV < 1);
15 my $command = shift(@ARGV);
16 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
17 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&filter4vcf, ldstats=>\&ldstats);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
25 my $fn = shift(@ARGV);
27 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
28 $h{$_} = 1 for (@ARGV);
34 my @s = @t[0..8]; # all fixed fields + FORMAT
41 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
42 print join("\t", @s), "\n";
46 print join("\t", @t[0..7]), "\n";
48 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
56 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
60 print join("\n", @t[9..$#t]), "\n";
67 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
76 @_ = split(":", $t[8]);
78 if ($_[$_] eq 'GT') { $s = $_; last; }
81 print join("\t", @t), "\n";
85 if ($t[$_] =~ /^0,0,0/) {
86 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
91 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
93 $info =~ s/(;?)AC=(\d+)//;
94 $info =~ s/(;?)AN=(\d+)//;
101 print join("\t", @t), "\n";
108 getopts('t:', \%opts);
109 die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
110 my $cutoff = $opts{t};
111 my ($last, $lastchr) = (0x7fffffff, '');
112 my ($x, $y, $n) = (0, 0, 0);
114 if (/^([^#\s]+)\s(\d+)/) {
115 my ($chr, $pos) = ($1, $2);
116 if (/NEIR=([\d\.]+)/) {
118 ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
120 $last = $pos; $lastchr = $chr;
123 print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
124 print "Fraction: ", $y/$n, "\n";
125 print "Length: $x\n";
129 my %opts = (r=>'', s=>0.02, v=>undef);
130 getopts('r:s:v', \%opts);
131 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
132 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
133 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
135 my $is_vcf = defined($opts{v})? 1 : 0;
136 if ($opts{r}) { # read the reference positions
138 open($fh, $opts{r}) || die;
143 $h{$t[0],$t[1]} = $t[4];
145 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
150 my $hsize = scalar(keys %h);
155 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
156 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
157 my @s = split(',', $t[4]);
158 $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
159 next if (length($s[0]) != 1);
163 my $aa = $h{$t[0],$t[1]};
165 my @aaa = split(",", $aa);
167 $hit = 1 if ($_ eq $s[0]);
171 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
173 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
175 push(@a, [-1, 0, 0, 0]); # end marker
176 die("[qstats] No SNP data!\n") if (@a == 0);
177 @a = sort {$b->[0]<=>$a->[0]} @a;
180 my @c = (0, 0, 0, 0);
184 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
186 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
187 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
188 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
189 my $a = $c[1] - $lc[1];
190 my $b = $c[2] - $lc[2];
191 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
192 print join("\t", $last, @c, @x), "\n";
193 $next = $c[0]/@a + $opts{s};
194 $lc[1] = $c[1]; $lc[2] = $c[2];
196 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
202 my %opts = (d=>1, D=>10000, a=>2, l=>10, Q=>10, s=>100, w=>10, p=>undef, F=>.001);
203 getopts('pd:D:l:Q:w:G:F:a:', \%opts);
205 Usage: vcfutils.pl varFilter [options] <in.vcf>
207 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
208 -d INT minimum read depth [$opts{d}]
209 -D INT maximum read depth [$opts{D}]
210 -a INT minimum number of alternate bases [$opts{a}]
211 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
212 -l INT window size for filtering adjacent gaps [$opts{l}]
213 -p print filtered variants
214 \n/) if (@ARGV == 0 && -t STDIN);
216 # calculate the window size
217 my ($ol, $ow) = ($opts{l}, $opts{w});
218 my $max_dist = $ol > $ow? $ol : $ow;
220 my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
224 next if ($t[4] eq '.'); # skip non-var sites
226 if (length($t[3]) > 1) {
229 my @s = split(',', $t[4]);
231 $is_snp = 0 if (length > 1);
234 # clear the out-of-range elements
236 # Still on the same chromosome and the first element's window still affects this position?
237 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
238 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
243 my ($dp, $mq, $dp_alt) = (-1, -1, -1);
244 if ($t[7] =~ /DP=(\d+)/i) {
246 } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
247 $dp = $1 + $2 + $3 + $4;
250 $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
251 # the depth and mapQ filter
253 if ($dp < $opts{d}) {
255 } elsif ($dp > $opts{D}) {
259 $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
260 $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
262 # site dependent filters
263 my ($rlen, $indel_score) = (0, -1); # $indel_score<0 for SNPs
265 if (!$is_snp) { # an indel
266 $rlen = length($t[3]) - 1;
267 $indel_score = $t[5] * 100 + $dp_alt;
269 for my $x (@staging) {
270 next if ($x->[0] >= 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
273 # check the staging list for indel filtering
274 for my $x (@staging) {
275 next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
276 if ($x->[0] < $indel_score) {
283 for my $x (@staging) {
284 next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
290 push(@staging, [$indel_score, $flt, $rlen, @t]);
292 # output the last few elements in the staging list
294 varFilter_aux(shift @staging, $opts{p});
299 my ($first, $is_print) = @_;
300 if ($first->[1] == 0) {
301 print join("\t", @$first[3 .. @$first-1]), "\n";
302 } elsif ($is_print) {
303 print STDERR join("\t", substr("UQdDaGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
308 my %opts = (d=>3, D=>2000, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, Q=>10, q=>3);
309 getopts('d:D:1:2:3:4:Q:q:', \%opts);
311 Usage: vcfutils.pl filter4vcf [options] <in.vcf>
313 Options: -d INT min total depth (given DP or DP4) [$opts{d}]
314 -D INT max total depth [$opts{D}]
315 -q INT min SNP quality [$opts{q}]
316 -Q INT min RMS mapQ (given MQ) [$opts{Q}]
317 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
318 -2 FLOAT min P-value for baseQ bias [$opts{2}]
319 -3 FLOAT min P-value for mapQ bias [$opts{3}]
320 -4 FLOAT min P-value for end distance bias [$opts{4}]\n
321 /) if (@ARGV == 0 && -t STDIN);
323 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
331 next if (/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
333 $depth = $1 if (/DP=(\d+)/);
334 $depth = $1+$2+$3+$4 if (/DP4=(\d+),(\d+),(\d+),(\d+)/);
335 next if ($depth > 0 && ($depth < $opts{d} || $depth > $opts{D}));
336 next if (/MQ=(\d+)/ && $1 < $opts{Q});
338 next if ($t[5] >= 0 && $t[5] < $opts{q});
340 my @s = split(',', $t[4]);
341 ++$n[1] if ($ts{$t[3].$s[0]});
347 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
348 print "##fileformat=VCFv4.0\n";
349 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
352 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
357 $t[9] = reverse($t[9]);
358 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
360 my @a = split("/", $t[9]);
362 push(@alt, $_) if ($_ ne $alt[0]);
368 $alt[$_] = "N$alt[$_]";
371 my $ref = shift(@alt);
372 my $af = $t[13] > 0? ";AF=$t[13]" : '';
373 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
374 my $info = "molType=$t[10];class=$t[11]$valid$af";
375 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
380 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
381 my $fn = shift(@ARGV);
383 warn("Parsing UCSC SNPs...\n");
385 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
388 next if ($t[3] - $t[2] != 1); # not SNP
389 @{$map{$t[4]}} = @t[1,3,7];
393 warn("Writing VCF...\n");
394 print "##fileformat=VCFv4.0\n";
397 if ($t[0] eq 'rs#') { # the first line
398 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
400 next unless ($map{$t[0]});
401 next if (length($t[1]) != 3); # skip non-SNPs
402 my $a = \@{$map{$t[0]}};
404 my @u = split('/', $t[1]);
406 $u[1] = $u[0]; $u[0] = $ref;
407 } elsif ($u[0] ne $ref) { next; }
410 $w{$u[0]} = 0; $w{$u[1]} = 1;
411 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
417 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
418 if (!defined($a[0]) || !defined($a[1])) {
422 push(@s, "$a[0]/$a[1]");
426 print join("\t", @s), "\n";
433 Usage: vcfutils.pl <command> [<arguments>]\n
434 Command: subsam get a subset of samples
435 listsam list the samples
436 fillac fill the allele count field
437 qstats SNP stats stratified by QUAL
438 varFilter filtering short variants
439 filter4vcf filtering VCFs produced by samtools+bcftools
440 hapmap2vcf convert the hapmap format to VCF
441 ucscsnp2vcf convert UCSC SNP SQL dump to VCF