my $command = shift(@ARGV);
my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
- gapstats=>\&gapstats);
+ gapstats=>\&gapstats, splitchr=>\&splitchr);
die("Unknown command \"$command\".\n") if (!defined($func{$command}));
&{$func{$command}};
}
+sub splitchr {
+ my %opts = (l=>5000000);
+ getopts('l:', \%opts);
+ my $l = $opts{l};
+ die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN);
+ while (<>) {
+ my @t = split;
+ my $last = 0;
+ for (my $i = 0; $i < $t[1];) {
+ my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l;
+ print "$t[0]:".($i+1)."-$e\n";
+ $i = $e;
+ }
+ }
+}
+
sub subsam {
die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
my ($fh, %h);
}
next if ($t[4] eq '.'); # skip non-var sites
# check if the site is a SNP
- my $is_snp = 1;
+ my $type = 1; # SNP
if (length($t[3]) > 1) {
- $is_snp = 0;
+ $type = 2; # MNP
+ my @s = split(',', $t[4]);
+ for (@s) {
+ $type = 3 if (length != length($t[3]));
+ }
} else {
my @s = split(',', $t[4]);
for (@s) {
- $is_snp = 0 if (length > 1);
+ $type = 3 if (length > 1);
}
}
# clear the out-of-range elements
my $flt = 0;
# parse annotations
my ($dp, $mq, $dp_alt) = (-1, -1, -1);
- if ($t[7] =~ /DP=(\d+)/i) {
- $dp = $1;
- } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
+ if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
$dp = $1 + $2 + $3 + $4;
$dp_alt = $3 + $4;
}
+ if ($t[7] =~ /DP=(\d+)/i) {
+ $dp = $1;
+ }
$mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
# the depth and mapQ filter
if ($dp >= 0) {
$flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
&& ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
- # site dependent filters
- my ($rlen, $indel_score) = (0, -1); # $indel_score<0 for SNPs
+ my $score = $t[5] * 100 + $dp_alt;
+ my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
if ($flt == 0) {
- if (!$is_snp) { # an indel
- $rlen = length($t[3]) - 1;
- $indel_score = $t[5] * 100 + $dp_alt;
- # filtering SNPs
+ if ($type == 3) { # an indel
+ # filtering SNPs and MNPs
for my $x (@staging) {
- next if ($x->[0] >= 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
+ next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
$x->[1] = 5;
}
# check the staging list for indel filtering
for my $x (@staging) {
- next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
- if ($x->[0] < $indel_score) {
+ next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
+ if ($x->[0]>>2 < $score) {
$x->[1] = 6;
} else {
$flt = 6; last;
}
}
- } else { # a SNP
+ } else { # SNP or MNP
for my $x (@staging) {
- next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
+ next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
$flt = 5;
last;
}
+ # check MNP
+ for my $x (@staging) {
+ next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
+ if ($x->[0]>>2 < $score) {
+ $x->[1] = 8;
+ } else {
+ $flt = 8; last;
+ }
+ }
}
}
- push(@staging, [$indel_score, $flt, $rlen, @t]);
+ push(@staging, [$score<<2|$type, $flt, $rlen, @t]);
}
# output the last few elements in the staging list
while (@staging) {
if ($first->[1] == 0) {
print join("\t", @$first[3 .. @$first-1]), "\n";
} elsif ($is_print) {
- print STDERR join("\t", substr("UQdDaGgP", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
+ print STDERR join("\t", substr("UQdDaGgPM", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
}
}
my @s = split(',', $t[4]);
for my $x (@s) {
my $l = length($x) - length($t[3]) + 5000;
+ if ($x =~ /^-/) {
+ $l = -(length($x) - 1) + 5000;
+ } elsif ($x =~ /^\+/) {
+ $l = length($x) - 1 + 5000;
+ }
$c0[$l] += 1 / @s;
}
}
for (my $i = 0; $i < 10000; ++$i) {
next if ($c0[$i] == 0);
- printf("%d\t%.2f\n", ($i-5000), $c0[$i]);
+ $c1[0] += $c0[$i];
+ $c1[1] += $c0[$i] if (($i-5000)%3 == 0);
+ printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
}
+ printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]);
}
sub ucscsnp2vcf {