From: Petr Danecek Date: Wed, 27 Jun 2012 05:33:29 +0000 (+0100) Subject: New -p switch for increased sensititivy of indel-calling. Here -m and -F are used... X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=700d451b57c06a5fc842b24be0d69d4a251c91df New -p switch for increased sensititivy of indel-calling. Here -m and -F are used per-sample which enables to detect rare indels in low-coverage bams. Experimental. --- diff --git a/bam2bcf.c b/bam2bcf.c index 6ac5dce..502d832 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -26,6 +26,7 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) bca->e = errmod_init(1. - theta); bca->min_frac = 0.002; bca->min_support = 1; + bca->per_sample_flt = 0; return bca; } @@ -54,7 +55,6 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); } // fill the bases array - memset(r, 0, sizeof(bcf_callret1_t)); for (i = n = r->n_supp = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; diff --git a/bam2bcf.h b/bam2bcf.h index a4d8ca5..a31ac9d 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -16,6 +16,7 @@ typedef struct __bcf_callaux_t { int openQ, extQ, tandemQ; // for indels int min_support; // for collecting indel candidates double min_frac; // for collecting indel candidates + int per_sample_flt; // indel filtering strategy // for internal uses int max_bases; int indel_types[4]; diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index ab9e83c..87736c3 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -142,35 +142,43 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (s == n) return -1; // there is no indel at this position. for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads { // find out how many types of indels are present - int m, n_alt = 0, n_tot = 0; + int m, n_alt = 0, n_tot = 0, indel_support_ok = 0; uint32_t *aux; aux = calloc(N + 1, 4); m = max_rd_len = 0; aux[m++] = MINUS_CONST; // zero indel is always a type for (s = 0; s < n; ++s) { + int na = 0, nt = 0; for (i = 0; i < n_plp[s]; ++i) { const bam_pileup1_t *p = plp[s] + i; if (rghash == 0 || p->aux == 0) { - ++n_tot; + ++nt; if (p->indel != 0) { - ++n_alt; + ++na; aux[m++] = MINUS_CONST + p->indel; } } j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); if (j > max_rd_len) max_rd_len = j; } + if ( !indel_support_ok && na >= bca->min_support && (double)na/nt >= bca->min_frac ) + indel_support_ok = 1; + n_alt += na; + n_tot += nt; } // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), // check the number of N's in the sequence and skip places where half or more reference bases are Ns. int nN=0; for (i=pos; i-posi ) return -1; + if ( nN*2>i ) { free(aux); return -1; } ks_introsort(uint32_t, m, aux); // squeeze out identical types for (i = 1, n_types = 1; i < m; ++i) if (aux[i] != aux[i-1]) ++n_types; - if (n_types == 1 || (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support) { // then skip + // Taking totals makes hard to call rare indels + if ( !bca->per_sample_flt ) + indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1; + if ( n_types == 1 || !indel_support_ok ) { // then skip free(aux); return -1; } if (n_types >= 64) { diff --git a/bam_plcmd.c b/bam_plcmd.c index 07f0a4f..ed00d2e 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -72,6 +72,7 @@ static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, cons #define MPLP_IGNORE_RG 0x2000 #define MPLP_PRINT_POS 0x4000 #define MPLP_PRINT_MAPQ 0x8000 +#define MPLP_PER_SAMPLE 0x10000 void *bed_read(const char *fn); void bed_destroy(void *_h); @@ -271,6 +272,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; bca->min_frac = conf->min_frac; bca->min_support = conf->min_support; + bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; } if (tid0 >= 0 && conf->fai) { // region is set ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); @@ -449,7 +451,7 @@ int bam_mpileup(int argc, char *argv[]) mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN; - while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:6OsV")) >= 0) { + while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV")) >= 0) { switch (c) { case 'f': mplp.fai = fai_load(optarg); @@ -459,6 +461,7 @@ int bam_mpileup(int argc, char *argv[]) case 'r': mplp.reg = strdup(optarg); break; case 'l': mplp.bed = bed_read(optarg); break; case 'P': mplp.pl_list = strdup(optarg); break; + case 'p': mplp.flag |= MPLP_PER_SAMPLE; break; case 'g': mplp.flag |= MPLP_GLF; break; case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break; case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break; @@ -532,6 +535,7 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); + fprintf(stderr, " -p apply -m and -F per-sample to increase sensitivity\n"); fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Notes: Assuming diploid individuals.\n\n");