From: Heng Li Date: Thu, 14 Oct 2010 03:32:12 +0000 (+0000) Subject: improve the LD statistics X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=30d422313b3d1c4586c8c4098e3e6c3777dc521e improve the LD statistics --- diff --git a/bcftools/call1.c b/bcftools/call1.c index 68b24f0..2b28452 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -348,7 +348,7 @@ int bcfview(int argc, char *argv[]) kstring_t s; s.m = s.l = 0; s.s = 0; if (*b->info) kputc(';', &s); - ksprintf(&s, "NEIR=%.3lf", r2); + ksprintf(&s, "NEIR=%.3lf;NEIF=%.3lf,%.3lf", r2, f[0]+f[2], f[0]+f[1]); bcf_append_info(b, s.s, s.l); free(s.s); } diff --git a/bcftools/ld.c b/bcftools/ld.c index 01145b8..aa7ec07 100644 --- a/bcftools/ld.c +++ b/bcftools/ld.c @@ -90,7 +90,7 @@ double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]) { // calculate r^2 double p[2], q[2], D; p[0] = f[0] + f[1]; q[0] = 1 - p[0]; - p[1] = f[2] + f[3]; q[1] = 1 - p[1]; + p[1] = f[0] + f[2]; q[1] = 1 - p[1]; D = f[0] * f[3] - f[1] * f[2]; r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1])); // fprintf(stderr, "R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r2); diff --git a/bcftools/vcfutils.pl b/bcftools/vcfutils.pl index 4caa846..d0b7971 100755 --- a/bcftools/vcfutils.pl +++ b/bcftools/vcfutils.pl @@ -104,38 +104,25 @@ sub fillac { } sub ldstats { - my %opts = (s=>0.01); - getopts('ps:', \%opts); - die("Usage: vcfutils.pl ldstats [-s $opts{s}] \n") if (@ARGV == 0 && -t STDIN); - my ($lastchr, $lastpos) = ('', 0); - my @a; - my $is_print = defined($opts{p})? 1 : 0; + my %opts = (t=>0.9); + getopts('t:', \%opts); + die("Usage: vcfutils.pl ldstats [-t $opts{t}] \n") if (@ARGV == 0 && -t STDIN); + my $cutoff = $opts{t}; + my ($last, $lastchr) = (0x7fffffff, ''); + my ($x, $y, $n) = (0, 0, 0); while (<>) { - next if (/^#/); - my @t = split; - if ($t[0] ne $lastchr) { - $lastchr = $t[0]; - } elsif (/NEIR=([\d\.]+)/) { - push(@a, [$t[1] - $lastpos, $1, $t[1]]); - } - $lastpos = $t[1]; - } - my $max = 1000000000; - push(@a, [$max, 0, 0]); # end marker - @a = sort {$a->[0]<=>$b->[0]} @a; - my $next = $opts{s}; - my $last = $a[0]; - my @c = (0, 0, 0, 0); - for my $p (@a) { - print STDERR "$p->[0]\t$p->[1]\t$p->[2]\n" if ($is_print); - if ($p->[0] == $max || ($p->[0] != $last && $c[0]/@a > $next)) { - printf("%d\t%.2f\t%.4f\n", $c[1], $c[2]/$c[1], $c[3]/$c[1]); - $c[1] = $c[2] = $c[3] = 0; - $next = $c[0]/@a + $opts{s}; + if (/^([^#\s]+)\s(\d+)/) { + my ($chr, $pos) = ($1, $2); + if (/NEIR=([\d\.]+)/) { + ++$n; + ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff); + } + $last = $pos; $lastchr = $chr; } - ++$c[0]; ++$c[1]; $c[2] += $p->[0]; $c[3] += $p->[1]; - $last = $p->[0]; } + print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n"; + print "Fraction: ", $y/$n, "\n"; + print "Length: $x\n"; } sub qstats {