From: Heng Li Date: Wed, 1 Sep 2010 16:50:33 +0000 (+0000) Subject: more commands for my own uses X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=7ef82722e41143d313b37394b905239ae61ca19a;p=samtools.git more commands for my own uses --- diff --git a/bcftools/vcfutils.pl b/bcftools/vcfutils.pl index bf77049..0fe1af8 100755 --- a/bcftools/vcfutils.pl +++ b/bcftools/vcfutils.pl @@ -13,7 +13,7 @@ sub main { my $version = '0.1.0'; &usage if (@ARGV < 1); my $command = shift(@ARGV); - my %func = (subsam=>\&subsam); + my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; } @@ -51,9 +51,101 @@ sub subsam { close($fh); } +sub listsam { + die(qq/Usage: vcfutils.pl listsam \n/) if (@ARGV == 0 && -t STDIN); + while (<>) { + if (/^#/ && !/^##/) { + my @t = split; + print join("\n", @t[9..$#t]), "\n"; + exit; + } + } +} + +sub fillac { + die(qq/Usage: vcfutils.pl fillac \n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN); + while (<>) { + if (/^#/) { + print; + } else { + my @t = split; + my @c; + my $n = 0; + $c[1] = 0; + for (9 .. $#t) { + if ($t[$_] =~ /^(\d+).(\d+)/) { + ++$c[$1]; ++$c[$2]; + $n += 2; + } + } + my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n"; + my $info = $t[7]; + $info =~ s/(;?)AC=(\d+)//; + $info =~ s/(;?)AN=(\d+)//; + if ($info eq '.') { + $info = $AC; + } else { + $info .= ";$AC"; + } + $t[7] = $info; + print join("\t", @t), "\n"; + } + } +} + +sub qstats { + my %opts = (r=>'', s=>0.01); + getopts('r:s:', \%opts); + die("Usage: vcfutils.pl qstats [-r ref.vcf] \n +Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN); + my %ts = (AG=>1, GA=>1, CT=>1, TC=>1); + my %h = (); + if ($opts{r}) { # read the reference positions + my $fh; + open($fh, $opts{r}) || die; + while (<$fh>) { + next if (/^#/); + $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/); + } + close($fh); + } + my $hsize = scalar(keys %h); + my @a; + while (<>) { + next if (/^#/); + my @t = split; + next if (length($t[3]) != 1 || uc($t[3]) eq 'N'); + my @s = split(',', $t[4]); + $t[3] = uc($t[3]); $t[4] = uc($t[4]); + next if (length($s[0]) != 1); + push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $h{$t[0],$t[1]}? 1 : 0]); + } + push(@a, [-1, 0, 0, 0]); # end marker + die("[qstats] No SNP data!\n") if (@a == 0); + @a = sort {$b->[0]<=>$a->[0]} @a; + my $next = $opts{s}; + my $last = $a[0]; + my @c = (0, 0, 0, 0); + for my $p (@a) { + if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) { + my @x; + $x[0] = sprintf("%.3f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100); + $x[1] = sprintf("%.3f", $hsize? $c[3] / $hsize : 0); + $x[2] = sprintf("%.3f", $c[3] / $c[0]); + print join("\t", $last, @c, @x), "\n"; + $next = $c[0]/@a + $opts{s}; + } + ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3]; + $last = $p->[0]; + } +} + sub usage { die(qq/ Usage: vcfutils.pl []\n Command: subsam get a subset of samples + listsam list the samples + fillac fill the allele count field + qstats SNP stats stratified by QUAL \n/); }