X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=misc%2Fwgsim_eval.pl;h=f919a0643a0d2950d8f817c6213a3183f4f9a6dc;hb=9f118264ea012adc21a46d7c03eaad4f9ce7d4d4;hp=fe91a3d32403f19c8e5c5a8d330a94fd69f728e6;hpb=7a38b29371a36c4a066a080b178d6ab42494d73a;p=samtools.git diff --git a/misc/wgsim_eval.pl b/misc/wgsim_eval.pl index fe91a3d..f919a06 100755 --- a/misc/wgsim_eval.pl +++ b/misc/wgsim_eval.pl @@ -1,7 +1,7 @@ #!/usr/bin/perl -w # Contact: lh3 -# Version: 0.1.2 +# Version: 0.1.5 use strict; use warnings; @@ -11,13 +11,18 @@ use Getopt::Std; exit; sub wgsim_eval { - my %opts; - getopts('p', \%opts); - die("Usage: wgsim_eval.pl [-p] \n") if (@ARGV == 0 && -t STDIN); - my (@c0, @c1); - my $max_q = 0; + my %opts = (g=>5); + getopts('pcag:', \%opts); + die("Usage: wgsim_eval.pl [-pca] [-g $opts{g}] \n") if (@ARGV == 0 && -t STDIN); + my (@c0, @c1, %fnfp); + my ($max_q, $flag) = (0, 0); + my $gap = $opts{g}; + $flag |= 1 if (defined $opts{p}); + $flag |= 2 if (defined $opts{c}); while (<>) { - my @t = split; + next if (/^\@/); + my @t = split("\t"); + next if (@t < 11); my $line = $_; my ($q, $is_correct, $chr, $left, $rght) = (int($t[4]/10), 1, $t[2], $t[3], $t[3]); $max_q = $q if ($q > $max_q); @@ -25,36 +30,62 @@ sub wgsim_eval { $_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg; --$rght; # correct for soft clipping - $left -= $1 if (/^(\d+)S/); - $rght += $1 if (/(\d+)S$/); + my ($left0, $rght0) = ($left, $rght); + $left -= $1 if (/^(\d+)[SH]/); + $rght += $1 if (/(\d+)[SH]$/); + $left0 -= $1 if (/(\d+)[SH]$/); + $rght0 += $1 if (/^(\d+)[SH]/); # skip unmapped reads next if (($t[1]&0x4) || $chr eq '*'); # parse read name and check - ++$c0[$q]; if ($t[0] =~ /^(\S+)_(\d+)_(\d+)_/) { if ($1 ne $chr) { # different chr $is_correct = 0; - } elsif ($2 < $3) { # should always be true if reads are generated from wgsim - if ($t[1] & 0x10) { # reverse - $is_correct = 0 if (abs($3 - $rght) > 5); # in case of indels that are close to the end of a reads + } else { + if ($flag & 2) { + if (($t[1]&0x40) && !($t[1]&0x10)) { # F3, forward + $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap); + } elsif (($t[1]&0x40) && ($t[1]&0x10)) { # F3, reverse + $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap); + } elsif (($t[1]&0x80) && !($t[1]&0x10)) { # R3, forward + $is_correct = 0 if (abs($3 - $left) > $gap && abs($3 - $left0) > $gap); + } else { # R3, reverse + $is_correct = 0 if (abs($2 - $rght) > $gap && abs($3 - $rght0) > $gap); + } } else { - $is_correct = 0 if (abs($2 - $left) > 5); + if ($t[1] & 0x10) { # reverse + $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap); # in case of indels that are close to the end of a reads + } else { + $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap); + } } - } else { - die("[wgsim_eval] bug in wgsim?\n"); } } else { - die("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n"); + warn("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n"); + next; } + ++$c0[$q]; ++$c1[$q] unless ($is_correct); - print STDERR $line if (defined($opts{p}) && !$is_correct && $q > 0); + @{$fnfp{$t[4]}} = (0, 0) unless (defined $fnfp{$t[4]}); + ++$fnfp{$t[4]}[0]; + ++$fnfp{$t[4]}[1] unless ($is_correct); + print STDERR $line if (($flag&1) && !$is_correct && $q > 0); } # print my ($cc0, $cc1) = (0, 0); - for (my $i = $max_q; $i >= 0; --$i) { - $c0[$i] = 0 unless (defined $c0[$i]); - $c1[$i] = 0 unless (defined $c1[$i]); - $cc0 += $c0[$i]; $cc1 += $c1[$i]; - printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0); + if (!defined($opts{a})) { + for (my $i = $max_q; $i >= 0; --$i) { + $c0[$i] = 0 unless (defined $c0[$i]); + $c1[$i] = 0 unless (defined $c1[$i]); + $cc0 += $c0[$i]; $cc1 += $c1[$i]; + printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0) if ($cc0); + } + } else { + for (reverse(sort {$a<=>$b} (keys %fnfp))) { + next if ($_ == 0); + $cc0 += $fnfp{$_}[0]; + $cc1 += $fnfp{$_}[1]; + print join("\t", $_, $cc0, $cc1), "\n"; + } } }