#!/usr/bin/perl -w
# Contact: lh3
-# Version: 0.1.2
+# Version: 0.1.5
use strict;
use warnings;
exit;
sub wgsim_eval {
- my %opts;
- getopts('p', \%opts);
- die("Usage: wgsim_eval.pl [-p] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
- my (@c0, @c1);
- my $max_q = 0;
+ my %opts = (g=>5);
+ getopts('pcag:', \%opts);
+ die("Usage: wgsim_eval.pl [-pca] [-g $opts{g}] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
+ my (@c0, @c1, %fnfp);
+ my ($max_q, $flag) = (0, 0);
+ my $gap = $opts{g};
+ $flag |= 1 if (defined $opts{p});
+ $flag |= 2 if (defined $opts{c});
while (<>) {
- my @t = split;
+ next if (/^\@/);
+ my @t = split("\t");
+ next if (@t < 11);
my $line = $_;
my ($q, $is_correct, $chr, $left, $rght) = (int($t[4]/10), 1, $t[2], $t[3], $t[3]);
$max_q = $q if ($q > $max_q);
$_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg;
--$rght;
# correct for soft clipping
- $left -= $1 if (/^(\d+)S/);
- $rght += $1 if (/(\d+)S$/);
+ my ($left0, $rght0) = ($left, $rght);
+ $left -= $1 if (/^(\d+)[SH]/);
+ $rght += $1 if (/(\d+)[SH]$/);
+ $left0 -= $1 if (/(\d+)[SH]$/);
+ $rght0 += $1 if (/^(\d+)[SH]/);
# skip unmapped reads
next if (($t[1]&0x4) || $chr eq '*');
# parse read name and check
- ++$c0[$q];
if ($t[0] =~ /^(\S+)_(\d+)_(\d+)_/) {
if ($1 ne $chr) { # different chr
$is_correct = 0;
- } elsif ($2 < $3) { # should always be true if reads are generated from wgsim
- if ($t[1] & 0x10) { # reverse
- $is_correct = 0 if (abs($3 - $rght) > 5); # in case of indels that are close to the end of a reads
+ } else {
+ if ($flag & 2) {
+ if (($t[1]&0x40) && !($t[1]&0x10)) { # F3, forward
+ $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap);
+ } elsif (($t[1]&0x40) && ($t[1]&0x10)) { # F3, reverse
+ $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap);
+ } elsif (($t[1]&0x80) && !($t[1]&0x10)) { # R3, forward
+ $is_correct = 0 if (abs($3 - $left) > $gap && abs($3 - $left0) > $gap);
+ } else { # R3, reverse
+ $is_correct = 0 if (abs($2 - $rght) > $gap && abs($3 - $rght0) > $gap);
+ }
} else {
- $is_correct = 0 if (abs($2 - $left) > 5);
+ if ($t[1] & 0x10) { # reverse
+ $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap); # in case of indels that are close to the end of a reads
+ } else {
+ $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap);
+ }
}
- } else {
- die("[wgsim_eval] bug in wgsim?\n");
}
} else {
- die("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n");
+ warn("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n");
+ next;
}
+ ++$c0[$q];
++$c1[$q] unless ($is_correct);
- print STDERR $line if (defined($opts{p}) && !$is_correct && $q > 0);
+ @{$fnfp{$t[4]}} = (0, 0) unless (defined $fnfp{$t[4]});
+ ++$fnfp{$t[4]}[0];
+ ++$fnfp{$t[4]}[1] unless ($is_correct);
+ print STDERR $line if (($flag&1) && !$is_correct && $q > 0);
}
# print
my ($cc0, $cc1) = (0, 0);
- for (my $i = $max_q; $i >= 0; --$i) {
- $c0[$i] = 0 unless (defined $c0[$i]);
- $c1[$i] = 0 unless (defined $c1[$i]);
- $cc0 += $c0[$i]; $cc1 += $c1[$i];
- printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0);
+ if (!defined($opts{a})) {
+ for (my $i = $max_q; $i >= 0; --$i) {
+ $c0[$i] = 0 unless (defined $c0[$i]);
+ $c1[$i] = 0 unless (defined $c1[$i]);
+ $cc0 += $c0[$i]; $cc1 += $c1[$i];
+ printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0) if ($cc0);
+ }
+ } else {
+ for (reverse(sort {$a<=>$b} (keys %fnfp))) {
+ next if ($_ == 0);
+ $cc0 += $fnfp{$_}[0];
+ $cc1 += $fnfp{$_}[1];
+ print join("\t", $_, $cc0, $cc1), "\n";
+ }
}
}