From: Heng Li Date: Fri, 12 Nov 2010 01:04:07 +0000 (+0000) Subject: * samtools-0.1.19-11 (r817) X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=b0ae0ca2209b3cde8185d6e7062442d2a09e7078;p=samtools.git * samtools-0.1.19-11 (r817) * only initiate indel calling when 0.2% of reads contain a gap --- diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 7713b51..5f4d893 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -12,6 +12,7 @@ KHASH_SET_INIT_STR(rg) #define MINUS_CONST 0x10000000 #define INDEL_WINDOW_SIZE 50 #define MAX_SCORE 90 +#define MIN_SUPPORT_COEF 500 void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list) { @@ -141,7 +142,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (s == n) return -1; // there is no indel at this position. for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads { // find out how many types of indels are present - int m; + int m, n_alt = 0, n_tot = 0; uint32_t *aux; aux = calloc(N + 1, 4); m = max_rd_len = 0; @@ -149,8 +150,13 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla for (s = 0; s < n; ++s) { for (i = 0; i < n_plp[s]; ++i) { const bam_pileup1_t *p = plp[s] + i; - if (p->indel != 0 && (rghash == 0 || p->aux == 0)) - aux[m++] = MINUS_CONST + p->indel; + if (rghash == 0 || p->aux == 0) { + ++n_tot; + if (p->indel != 0) { + ++n_alt; + aux[m++] = MINUS_CONST + p->indel; + } + } j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); if (j > max_rd_len) max_rd_len = j; } @@ -159,7 +165,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla // squeeze out identical types for (i = 1, n_types = 1; i < m; ++i) if (aux[i] != aux[i-1]) ++n_types; - if (n_types == 1) { // no indels + if (n_types == 1 || n_alt * MIN_SUPPORT_COEF < n_tot) { // no indels or too few supporting reads free(aux); return -1; } types = (int*)calloc(n_types, sizeof(int)); diff --git a/bamtk.c b/bamtk.c index 09d64ad..da29ed7 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.9-10 (r816)" +#define PACKAGE_VERSION "0.1.9-11 (r817)" #endif int bam_taf2baf(int argc, char *argv[]); diff --git a/bcftools/vcfutils.pl b/bcftools/vcfutils.pl index c1177ea..1270c6c 100755 --- a/bcftools/vcfutils.pl +++ b/bcftools/vcfutils.pl @@ -13,7 +13,8 @@ sub main { &usage if (@ARGV < 1); my $command = shift(@ARGV); my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter, - hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats); + hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats, + gapstats=>\&gapstats); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; } @@ -313,6 +314,25 @@ sub varFilter_aux { } } +sub gapstats { + my (@c0, @c1); + $c0[$_] = $c1[$_] = 0 for (0 .. 10000); + while (<>) { + next if (/^#/); + my @t = split; + next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel + my @s = split(',', $t[4]); + for my $x (@s) { + my $l = length($x) - length($t[3]) + 5000; + $c0[$l] += 1 / @s; + } + } + for (my $i = 0; $i < 10000; ++$i) { + next if ($c0[$i] == 0); + printf("%d\t%.2f\n", ($i-5000), $c0[$i]); + } +} + sub ucscsnp2vcf { die("Usage: vcfutils.pl \n") if (@ARGV == 0 && -t STDIN); print "##fileformat=VCFv4.0\n";