13 my $version = '0.1.0';
14 &usage if (@ARGV < 1);
15 my $command = shift(@ARGV);
16 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter);
17 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
22 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
24 my $fn = shift(@ARGV);
26 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
27 $h{$_} = 1 for (@ARGV);
33 my @s = @t[0..8]; # all fixed fields + FORMAT
40 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
41 print join("\t", @s), "\n";
45 print join("\t", @t[0..7]), "\n";
47 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
55 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
59 print join("\n", @t[9..$#t]), "\n";
66 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
76 if ($t[$_] =~ /^(\d+).(\d+)/) {
81 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
83 $info =~ s/(;?)AC=(\d+)//;
84 $info =~ s/(;?)AN=(\d+)//;
91 print join("\t", @t), "\n";
97 my %opts = (r=>'', s=>0.01);
98 getopts('r:s:', \%opts);
99 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
100 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
101 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
103 if ($opts{r}) { # read the reference positions
105 open($fh, $opts{r}) || die;
108 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
112 my $hsize = scalar(keys %h);
117 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
118 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
119 my @s = split(',', $t[4]);
120 $t[5] = 3 if ($t[5] < 0);
121 next if (length($s[0]) != 1);
122 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $h{$t[0],$t[1]}? 1 : 0]);
124 push(@a, [-1, 0, 0, 0]); # end marker
125 die("[qstats] No SNP data!\n") if (@a == 0);
126 @a = sort {$b->[0]<=>$a->[0]} @a;
129 my @c = (0, 0, 0, 0);
131 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
133 $x[0] = sprintf("%.3f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
134 $x[1] = sprintf("%.3f", $hsize? $c[3] / $hsize : 0);
135 $x[2] = sprintf("%.3f", $c[3] / $c[1]);
136 print join("\t", $last, @c, @x), "\n";
137 $next = $c[0]/@a + $opts{s};
139 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
145 my %opts = (d=>1, D=>10000, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, F=>.001);
146 getopts('pq:d:D:l:Q:w:W:N:G:F:', \%opts);
148 Usage: vcfutils.pl varFilter [options] <in.vcf>
150 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
151 -q INT minimum RMS mapping quality for gaps [$opts{q}]
152 -d INT minimum read depth [$opts{d}]
153 -D INT maximum read depth [$opts{D}]
155 -G INT min indel score for nearby SNP filtering [$opts{G}]
156 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
158 -W INT window size for filtering dense SNPs [$opts{W}]
159 -N INT max number of SNPs in a window [$opts{N}]
161 -l INT window size for filtering adjacent gaps [$opts{l}]
163 -p print filtered variants
164 \n/) if (@ARGV == 0 && -t STDIN);
166 # calculate the window size
167 my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W});
168 my $max_dist = $ol > $ow? $ol : $ow;
169 $max_dist = $oW if ($max_dist < $oW);
171 my @staging; # (indel_filtering_score, flt_tag)
175 next if ($t[4] eq '.'); # skip non-var sites
177 if (length($t[3]) > 1) {
180 my @s = split(',', $t[4]);
182 $is_snp = 0 if (length > 1);
185 # clear the out-of-range elements
187 # Still on the same chromosome and the first element's window still affects this position?
188 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
189 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
191 my ($flt, $score) = (0, -1);
193 # collect key annotations
194 my ($dp, $mq, $af) = (-1, -1, 1);
195 if ($t[7] =~ /DP=(\d+)/i) {
197 } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
198 $dp = $1 + $2 + $3 + $4;
200 if ($t[7] =~ /MQ=(\d+)/i) {
203 if ($t[7] =~ /AF=([^\s;=]+)/i) {
205 } elsif ($t[7] =~ /AF1=([^\s;=]+)/i) {
210 if ($dp < $opts{d}) {
212 } elsif ($dp > $opts{D}) {
217 # site dependent filters
220 if (!$is_snp) { # an indel
221 # If deletion, remember the length of the deletion
222 $dlen = length($t[3]) - 1;
223 $flt = 1 if ($mq < $opts{q});
225 if ($t[5] >= $opts{G}) {
226 for my $x (@staging) {
227 # Is it a SNP and is it outside the SNP filter window?
228 next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]);
229 $x->[1] = 5 if ($x->[1] == 0);
232 # the indel filtering score
234 # check the staging list for indel filtering
235 for my $x (@staging) {
236 # Is it a SNP and is it outside the gap filter window
237 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]);
238 if ($x->[0] < $score) {
245 $flt = 1 if ($mq < $opts{Q});
246 # check adjacent SNPs
248 for my $x (@staging) {
249 ++$k if ($x->[0] < 0 && -($x->[0] + 1) > $opts{F} && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
251 # filtering is necessary
254 for my $x (@staging) {
255 $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0);
257 } else { # then check gap filter
258 for my $x (@staging) {
259 next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]);
260 if ($x->[0] >= $opts{G}) {
267 push(@staging, [$score < 0? -$af-1 : $score, $flt, $dlen, @t]);
269 # output the last few elements in the staging list
271 varFilter_aux(shift @staging, $opts{p});
276 my ($first, $is_print) = @_;
277 if ($first->[1] == 0) {
278 print join("\t", @$first[3 .. @$first-1]), "\n";
279 } elsif ($is_print) {
280 print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
286 Usage: vcfutils.pl <command> [<arguments>]\n
287 Command: subsam get a subset of samples
288 listsam list the samples
289 fillac fill the allele count field
290 qstats SNP stats stratified by QUAL
291 varFilter filtering short variants