X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bcftools%2Fvcfutils.pl;h=eed8766464e9b63e08268a97b146d3464705a042;hb=032937695157a15fb0ce042deda22afc30be1616;hp=c625b38d774a9ae8a1ba5d213a497bc9a1f18189;hpb=d11a5b1fdadf6847f9799ee0ff0bd14b7c431cea;p=samtools.git diff --git a/bcftools/vcfutils.pl b/bcftools/vcfutils.pl index c625b38..eed8766 100755 --- a/bcftools/vcfutils.pl +++ b/bcftools/vcfutils.pl @@ -14,7 +14,7 @@ sub main { &usage if (@ARGV < 1); my $command = shift(@ARGV); my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter, - hapmap2vcf=>\&hapmap2vcf); + hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&filter4vcf); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; } @@ -70,12 +70,21 @@ sub fillac { print; } else { my @t = split; - my @c; + my @c = (0); my $n = 0; - $c[1] = 0; + my $s = -1; + @_ = split(":", $t[8]); + for (0 .. $#_) { + if ($_[$_] eq 'GT') { $s = $_; last; } + } + if ($s < 0) { + print join("\t", @t), "\n"; + next; + } for (9 .. $#t) { - if ($t[$_] =~ /^(\d+).(\d+)/) { - ++$c[$1]; ++$c[$2]; + if ($t[$_] =~ /^0,0,0/) { + } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) { + ++$c[$2]; ++$c[$3]; $n += 2; } } @@ -95,18 +104,24 @@ sub fillac { } sub qstats { - my %opts = (r=>'', s=>0.01); - getopts('r:s:', \%opts); + my %opts = (r=>'', s=>0.02, v=>undef); + getopts('r:s:v', \%opts); die("Usage: vcfutils.pl qstats [-r ref.vcf] \n Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN); my %ts = (AG=>1, GA=>1, CT=>1, TC=>1); my %h = (); + my $is_vcf = defined($opts{v})? 1 : 0; if ($opts{r}) { # read the reference positions my $fh; open($fh, $opts{r}) || die; while (<$fh>) { next if (/^#/); - $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/); + if ($is_vcf) { + my @t = split; + $h{$t[0],$t[1]} = $t[4]; + } else { + $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/); + } } close($fh); } @@ -120,7 +135,20 @@ Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions # my @s = split(',', $t[4]); $t[5] = 3 if ($t[5] < 0); next if (length($s[0]) != 1); - push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $h{$t[0],$t[1]}? 1 : 0]); + my $hit; + if ($is_vcf) { + $hit = 0; + my $aa = $h{$t[0],$t[1]}; + if (defined($aa)) { + my @aaa = split(",", $aa); + for (@aaa) { + $hit = 1 if ($_ eq $s[0]); + } + } + } else { + $hit = defined($h{$t[0],$t[1]})? 1 : 0; + } + push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]); } push(@a, [-1, 0, 0, 0]); # end marker die("[qstats] No SNP data!\n") if (@a == 0); @@ -128,14 +156,20 @@ Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions # my $next = $opts{s}; my $last = $a[0]; my @c = (0, 0, 0, 0); + my @lc; + $lc[1] = $lc[2] = 0; for my $p (@a) { if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) { my @x; $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100); $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0); $x[2] = sprintf("%.4f", $c[3] / $c[1]); + my $a = $c[1] - $lc[1]; + my $b = $c[2] - $lc[2]; + $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100); print join("\t", $last, @c, @x), "\n"; $next = $c[0]/@a + $opts{s}; + $lc[1] = $c[1]; $lc[2] = $c[2]; } ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3]; $last = $p->[0]; @@ -282,6 +316,75 @@ sub varFilter_aux { } } +sub filter4vcf { + my %opts = (d=>3, D=>2000, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, Q=>10, q=>3); + getopts('d:D:1:2:3:4:Q:q:', \%opts); + die(qq/ +Usage: vcfutils.pl filter4vcf [options] + +Options: -d INT min total depth (given DP or DP4) [$opts{d}] + -D INT max total depth [$opts{D}] + -q INT min SNP quality [$opts{q}] + -Q INT min RMS mapQ (given MQ) [$opts{Q}] + -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}] + -2 FLOAT min P-value for baseQ bias [$opts{2}] + -3 FLOAT min P-value for mapQ bias [$opts{3}] + -4 FLOAT min P-value for end distance bias [$opts{4}]\n +/) if (@ARGV == 0 && -t STDIN); + + my %ts = (AG=>1, GA=>1, CT=>1, TC=>1); + + my @n = (0, 0); + while (<>) { + next if (/^#/); + next if (/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); + my $depth = -1; + $depth = $1 if (/DP=(\d+)/); + $depth = $1+$2+$3+$4 if (/DP4=(\d+),(\d+),(\d+),(\d+)/); + next if ($depth > 0 && ($depth < $opts{d} || $depth > $opts{D})); + next if (/MQ=(\d+)/ && $1 < $opts{Q}); + my @t = split; + next if ($t[5] >= 0 && $t[5] < $opts{q}); + ++$n[0]; + my @s = split(',', $t[4]); + ++$n[1] if ($ts{$t[3].$s[0]}); + print; + } +} + +sub ucscsnp2vcf { + die("Usage: vcfutils.pl \n") if (@ARGV == 0 && -t STDIN); + print "##fileformat=VCFv4.0\n"; + print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n"; + while (<>) { + my @t = split("\t"); + my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1; + my $pos = $t[2] + 1; + my @alt; + push(@alt, $t[7]); + if ($t[6] eq '-') { + $t[9] = reverse($t[9]); + $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/; + } + my @a = split("/", $t[9]); + for (@a) { + push(@alt, $_) if ($_ ne $alt[0]); + } + if ($indel) { + --$pos; + for (0 .. $#alt) { + $alt[$_] =~ tr/-//d; + $alt[$_] = "N$alt[$_]"; + } + } + my $ref = shift(@alt); + my $af = $t[13] > 0? ";AF=$t[13]" : ''; + my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]"; + my $info = "molType=$t[10];class=$t[11]$valid$af"; + print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n"; + } +} + sub hapmap2vcf { die("Usage: vcfutils.pl \n") if (@ARGV == 0); my $fn = shift(@ARGV); @@ -342,6 +445,8 @@ Command: subsam get a subset of samples fillac fill the allele count field qstats SNP stats stratified by QUAL varFilter filtering short variants + filter4vcf filtering VCFs produced by samtools+bcftools hapmap2vcf convert the hapmap format to VCF + ucscsnp2vcf convert UCSC SNP SQL dump to VCF \n/); }