From: Heng Li Date: Mon, 1 Jun 2009 13:04:39 +0000 (+0000) Subject: * samtools.pl-0.2.0 X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=0032bb2050f78182baa7fbf9b959540dce4636bd;p=samtools.git * samtools.pl-0.2.0 * snpFilter --- diff --git a/misc/samtools.pl b/misc/samtools.pl index 8f109e0..6fba900 100755 --- a/misc/samtools.pl +++ b/misc/samtools.pl @@ -6,11 +6,11 @@ use strict; use warnings; use Getopt::Std; -my $version = '0.1.1'; +my $version = '0.2.0'; &usage if (@ARGV < 1); my $command = shift(@ARGV); -my %func = (indelFilter=>\&indelFilter, showALEN=>\&showALEN); +my %func = (snpFilter=>\&snpFilter, indelFilter=>\&indelFilter, showALEN=>\&showALEN); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; @@ -76,10 +76,74 @@ Options: -D INT maximum read depth [$opts{D}] print $last->[3] if (defined $last->[0]); } +# +# snpFilter +# + +sub snpFilter { + my %opts = (f=>'', Q=>40, d=>3, w=>10, D=>0, N=>2, W=>10, q=>20, s=>50); + getopts('fs:w:q:Q:d:D:W:N:', \%opts); + die(qq{ +Usage: samtools.pl snpFilter [options] + +Options: -d INT minimum depth to call a SNP [$opts{d}] + -D INT maximum depth, 0 to ignore [$opts{D}] + -Q INT required max mapping quality of the reads covering the SNP [$opts{Q}] + -q INT minimum SNP quality [$opts{q}] + + -f FILE filtered samtools indels [null] + -s INT minimum samtols indel score [$opts{s}] + -w INT SNP within INT bp around an indel to be filtered [$opts{w}] + + -W INT window size for filtering dense SNPs [$opts{W}] + -N INT maximum number of SNPs in a window [$opts{N}] +\n}) unless (@ARGV); + my (%hash, $fh); + my $skip = $opts{w}; + $opts{D} = 100000000 if ($opts{D} == 0); + if ($opts{f}) { # filtered samtools indel + my $n = 0; + open($fh, $opts{f}) || die; + while (<$fh>) { + my @t = split; + next if ($t[2] ne '*' || $t[3] eq '*/*' || $t[5] < $opts{s}); + for (my $x = $t[1] - $skip + 1; $x < $t[1] + $skip; ++$x) { + $hash{$t[0],$x} = 1; + } + } + close($fh); + } + my (@last, $last_chr); + $last_chr = ''; + while (<>) { + my @t = split; + next if ($t[2] eq '*' || $hash{$t[0],$t[1]}); + my $is_good = ($t[7] >= $opts{d} && $t[7] <= $opts{D} && $t[6] >= $opts{Q} && $t[5] >= $opts{q})? 1 : 0; + next unless ($is_good); # drop + if ($t[0] ne $last_chr) { # a different chr, print + map { print $_->{L} if ($_->{F}) } @last; + @last = (); + $last_chr = $t[0]; + } + # The following block implemented by Nathans Weeks. + push(@last, {L => $_, X => $t[1], F => 1}); # Enqueue current SNP + if ($#last == $opts{N}) { # number of SNPs in queue is N+1 + if ($last[$#last]{X} - $last[0]{X} < $opts{W}) { # if all within window W + map {$_->{F} = 0} @last; # all SNPs in the window of size W are "bad" + } + print STDOUT $last[0]{L} if ($last[0]{F}); # print first SNP if good + shift @last # dequeue first SNP + } + } + # print the last few lines if applicable + map { print $_->{L} if ($_->{F}) } @last; +} + sub usage { die(qq/ Usage: samtools.pl []\n Command: indelFilter filter indels generated by `pileup -c' + snpFilter filter SNPs generated by `pileup -c' showALEN print alignment length (ALEN) following CIGAR \n/); }