13 &usage if (@ARGV < 1);
14 my $command = shift(@ARGV);
15 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
16 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
17 gapstats=>\&gapstats);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
25 my $fn = shift(@ARGV);
27 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
28 $h{$_} = 1 for (@ARGV);
34 my @s = @t[0..8]; # all fixed fields + FORMAT
41 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
42 print join("\t", @s), "\n";
46 print join("\t", @t[0..7]), "\n";
48 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
56 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
60 print join("\n", @t[9..$#t]), "\n";
67 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
76 @_ = split(":", $t[8]);
78 if ($_[$_] eq 'GT') { $s = $_; last; }
81 print join("\t", @t), "\n";
85 if ($t[$_] =~ /^0,0,0/) {
86 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
91 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
93 $info =~ s/(;?)AC=(\d+)//;
94 $info =~ s/(;?)AN=(\d+)//;
101 print join("\t", @t), "\n";
108 getopts('t:', \%opts);
109 die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
110 my $cutoff = $opts{t};
111 my ($last, $lastchr) = (0x7fffffff, '');
112 my ($x, $y, $n) = (0, 0, 0);
114 if (/^([^#\s]+)\s(\d+)/) {
115 my ($chr, $pos) = ($1, $2);
116 if (/NEIR=([\d\.]+)/) {
118 ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
120 $last = $pos; $lastchr = $chr;
123 print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
124 print "Fraction: ", $y/$n, "\n";
125 print "Length: $x\n";
129 my %opts = (r=>'', s=>0.02, v=>undef);
130 getopts('r:s:v', \%opts);
131 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
132 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
133 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
135 my $is_vcf = defined($opts{v})? 1 : 0;
136 if ($opts{r}) { # read the reference positions
138 open($fh, $opts{r}) || die;
143 $h{$t[0],$t[1]} = $t[4];
145 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
150 my $hsize = scalar(keys %h);
155 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
156 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
157 my @s = split(',', $t[4]);
158 $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
159 next if (length($s[0]) != 1);
163 my $aa = $h{$t[0],$t[1]};
165 my @aaa = split(",", $aa);
167 $hit = 1 if ($_ eq $s[0]);
171 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
173 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
175 push(@a, [-1, 0, 0, 0]); # end marker
176 die("[qstats] No SNP data!\n") if (@a == 0);
177 @a = sort {$b->[0]<=>$a->[0]} @a;
180 my @c = (0, 0, 0, 0);
184 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
186 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
187 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
188 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
189 my $a = $c[1] - $lc[1];
190 my $b = $c[2] - $lc[2];
191 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
192 print join("\t", $last, @c, @x), "\n";
193 $next = $c[0]/@a + $opts{s};
194 $lc[1] = $c[1]; $lc[2] = $c[2];
196 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
202 my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4);
203 getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts);
205 Usage: vcfutils.pl varFilter [options] <in.vcf>
207 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
208 -d INT minimum read depth [$opts{d}]
209 -D INT maximum read depth [$opts{D}]
210 -a INT minimum number of alternate bases [$opts{a}]
211 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
212 -W INT window size for filtering adjacent gaps [$opts{W}]
213 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
214 -2 FLOAT min P-value for baseQ bias [$opts{2}]
215 -3 FLOAT min P-value for mapQ bias [$opts{3}]
216 -4 FLOAT min P-value for end distance bias [$opts{4}]
217 -p print filtered variants
219 Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
220 \n/) if (@ARGV == 0 && -t STDIN);
222 # calculate the window size
223 my ($ol, $ow) = ($opts{W}, $opts{w});
224 my $max_dist = $ol > $ow? $ol : $ow;
226 my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
232 next if ($t[4] eq '.'); # skip non-var sites
233 # check if the site is a SNP
235 if (length($t[3]) > 1) {
238 my @s = split(',', $t[4]);
240 $is_snp = 0 if (length > 1);
243 # clear the out-of-range elements
245 # Still on the same chromosome and the first element's window still affects this position?
246 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
247 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
251 my ($dp, $mq, $dp_alt) = (-1, -1, -1);
252 if ($t[7] =~ /DP=(\d+)/i) {
254 } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
255 $dp = $1 + $2 + $3 + $4;
258 $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
259 # the depth and mapQ filter
261 if ($dp < $opts{d}) {
263 } elsif ($dp > $opts{D}) {
267 $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
268 $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
269 $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
270 && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
272 # site dependent filters
273 my ($rlen, $indel_score) = (0, -1); # $indel_score<0 for SNPs
275 if (!$is_snp) { # an indel
276 $rlen = length($t[3]) - 1;
277 $indel_score = $t[5] * 100 + $dp_alt;
279 for my $x (@staging) {
280 next if ($x->[0] >= 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
283 # check the staging list for indel filtering
284 for my $x (@staging) {
285 next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
286 if ($x->[0] < $indel_score) {
293 for my $x (@staging) {
294 next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
300 push(@staging, [$indel_score, $flt, $rlen, @t]);
302 # output the last few elements in the staging list
304 varFilter_aux(shift @staging, $opts{p});
309 my ($first, $is_print) = @_;
310 if ($first->[1] == 0) {
311 print join("\t", @$first[3 .. @$first-1]), "\n";
312 } elsif ($is_print) {
313 print STDERR join("\t", substr("UQdDaGgP", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
319 $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
323 next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
324 my @s = split(',', $t[4]);
326 my $l = length($x) - length($t[3]) + 5000;
330 for (my $i = 0; $i < 10000; ++$i) {
331 next if ($c0[$i] == 0);
332 printf("%d\t%.2f\n", ($i-5000), $c0[$i]);
337 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
338 print "##fileformat=VCFv4.0\n";
339 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
342 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
347 $t[9] = reverse($t[9]);
348 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
350 my @a = split("/", $t[9]);
352 push(@alt, $_) if ($_ ne $alt[0]);
358 $alt[$_] = "N$alt[$_]";
361 my $ref = shift(@alt);
362 my $af = $t[13] > 0? ";AF=$t[13]" : '';
363 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
364 my $info = "molType=$t[10];class=$t[11]$valid$af";
365 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
370 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
371 my $fn = shift(@ARGV);
373 warn("Parsing UCSC SNPs...\n");
375 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
378 next if ($t[3] - $t[2] != 1); # not SNP
379 @{$map{$t[4]}} = @t[1,3,7];
383 warn("Writing VCF...\n");
384 print "##fileformat=VCFv4.0\n";
387 if ($t[0] eq 'rs#') { # the first line
388 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
390 next unless ($map{$t[0]});
391 next if (length($t[1]) != 3); # skip non-SNPs
392 my $a = \@{$map{$t[0]}};
394 my @u = split('/', $t[1]);
396 $u[1] = $u[0]; $u[0] = $ref;
397 } elsif ($u[0] ne $ref) { next; }
400 $w{$u[0]} = 0; $w{$u[1]} = 1;
401 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
407 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
408 if (!defined($a[0]) || !defined($a[1])) {
412 push(@s, "$a[0]/$a[1]");
416 print join("\t", @s), "\n";
423 Usage: vcfutils.pl <command> [<arguments>]\n
424 Command: subsam get a subset of samples
425 listsam list the samples
426 fillac fill the allele count field
427 qstats SNP stats stratified by QUAL
428 varFilter filtering short variants
429 hapmap2vcf convert the hapmap format to VCF
430 ucscsnp2vcf convert UCSC SNP SQL dump to VCF