From: Heng Li Date: Fri, 19 Nov 2010 18:45:51 +0000 (+0000) Subject: * samtools-0.1.10-4 (r840) X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=30004e9139b96449fa34831af2fd8e860567a29d * samtools-0.1.10-4 (r840) * drop the MNP caller. It is slow while does not diliver too much benefit. Possibly I will work on it in future given more time. * there is a segfault in pileup * someone has reported segfault from view/index/sort --- diff --git a/ChangeLog b/ChangeLog index f8456a3..973601e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,91 @@ +------------------------------------------------------------------------ +r839 | lh3lh3 | 2010-11-18 17:30:11 -0500 (Thu, 18 Nov 2010) | 9 lines +Changed paths: + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.10-6 (r839) + + * call MNPs without realignment because it seems to me that it is not + worthwhile to significantly slow down SNP calling. + + * the result looks quite different from the previous version. I have + work to do... + + +------------------------------------------------------------------------ +r838 | lh3lh3 | 2010-11-18 11:26:09 -0500 (Thu, 18 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/knetfile.c + +Apply a patch by Rob Davis, which improves fault detection. + +------------------------------------------------------------------------ +r836 | lh3lh3 | 2010-11-18 11:09:23 -0500 (Thu, 18 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + + * samtools-r836 + * initiate MNP realignment when the MNP has at least 0.2% frequency (otherwise too slow) + +------------------------------------------------------------------------ +r835 | lh3lh3 | 2010-11-18 00:25:13 -0500 (Thu, 18 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + + * modify the filtering rule: also filter SNPs around filtered indels + * added MNP filter + +------------------------------------------------------------------------ +r834 | lh3lh3 | 2010-11-17 23:13:52 -0500 (Wed, 17 Nov 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.10-4 (r834) + * fixed a silly bug in printing MNP + * restrict to at most 1 alternative allele + +------------------------------------------------------------------------ +r833 | lh3lh3 | 2010-11-17 21:58:58 -0500 (Wed, 17 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bamtk.c + +fixed a bug in printing MNPs + +------------------------------------------------------------------------ +r832 | lh3lh3 | 2010-11-17 21:47:20 -0500 (Wed, 17 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + +minor change to how seqQ is applied + +------------------------------------------------------------------------ +r831 | lh3lh3 | 2010-11-17 21:41:12 -0500 (Wed, 17 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.10 (r831) + * initial MNP caller + +------------------------------------------------------------------------ +r829 | lh3lh3 | 2010-11-16 23:14:15 -0500 (Tue, 16 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/bamtk.c + +Release samtools-0.1.10 (r829) + ------------------------------------------------------------------------ r828 | lh3lh3 | 2010-11-16 20:48:49 -0500 (Tue, 16 Nov 2010) | 2 lines Changed paths: diff --git a/bam2bcf.c b/bam2bcf.c index 45a0ccc..088635c 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -22,7 +22,6 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) bca->capQ = 60; bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100; bca->min_baseQ = min_baseQ; - bca->last_mnp_pos = -100; bca->e = errmod_init(1. - theta); return bca; } @@ -33,10 +32,8 @@ void bcf_call_destroy(bcf_callaux_t *bca) errmod_destroy(bca->e); free(bca->bases); free(bca->inscns); free(bca); } -/* Compute genotype likelihood by combining likelihood of each - * read. ref_base is the 4-bit representation of the reference base. It - * is negative if we are looking at an indel or an MNP. Indel and MNP - * are indistinguishable in this function. */ +/* ref_base is the 4-bit representation of the reference base. It is + * negative if we are looking at an indel. */ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) { int i, n, ref4, is_indel, ori_depth = 0; @@ -95,9 +92,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t errmod_cal(bca->e, n, 5, bca->bases, r->p); return r->depth; } -/* Combine individual calls. ref_base is a 4-bit A/C/G/T for a SNP, or - * B2B_REF_MNP, or B2B_REF_INDEL. Indel and MNP are indistinguishable in - * this function. */ + int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call) { int ref4, i, j, qsum[4]; @@ -105,7 +100,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, if (ref_base >= 0) { call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base]; if (ref4 > 4) ref4 = 4; - } else call->ori_ref = ref_base, ref4 = 0; // ref_base < 0 + } else call->ori_ref = -1, ref4 = 0; // calculate qsum memset(qsum, 0, 4 * sizeof(int)); for (i = 0; i < n; ++i) @@ -130,7 +125,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0) call->unseen = j, call->a[j++] = qsum[i]&3; call->n_alleles = j; - } else { // for indels and MNPs + } else { call->n_alleles = j; if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything } @@ -173,8 +168,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, } return 0; } -/* Fill a bcf1_t record. The type of the record (SNP, MNP or INDEL) is - * determined by bca->ori_ref. */ + int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP, const bcf_callaux_t *bca, const char *ref) { @@ -185,7 +179,7 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc b->tid = tid; b->pos = pos; b->qual = 0; s.s = b->str; s.m = b->m_str; s.l = 0; kputc('\0', &s); - if (bc->ori_ref == B2B_REF_INDEL) { // an indel + if (bc->ori_ref < 0) { // an indel // write REF kputc(ref[pos], &s); for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+1+j], &s); @@ -208,16 +202,6 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc } } kputc('\0', &s); - } else if (bc->ori_ref == B2B_REF_MNP) { - for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+j], &s); - kputc('\0', &s); - for (i = 1; i < 4; ++i) { - if (bc->a[i] < 0) break; - if (i > 1) kputc(',', &s); - for (j = 0; j < bca->indelreg; ++j) - kputc("ACGT"[bca->indel_types[i]>>2*j&3], &s); - } - kputc('\0', &s); } else { // a SNP kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s); for (i = 1; i < 5; ++i) { @@ -229,8 +213,7 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc } kputc('\0', &s); // INFO - if (bc->ori_ref == B2B_REF_INDEL) kputs("INDEL;", &s); - else if (bc->ori_ref == B2B_REF_MNP) kputs("MNP;", &s); + if (bc->ori_ref < 0) kputs("INDEL;", &s); kputs("DP=", &s); kputw(bc->ori_depth, &s); kputs(";I16=", &s); for (i = 0; i < 16; ++i) { if (i) kputc(',', &s); diff --git a/bam2bcf.h b/bam2bcf.h index 4ea4809..26b022c 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -5,11 +5,7 @@ #include "errmod.h" #include "bcftools/bcf.h" -#define B2B_INDEL_NULL 0x7fff -#define B2B_MAX_MNP 4 // cannot be larger than 4!!! -#define B2B_MNP_WIN 10 -#define B2B_REF_INDEL (-1) -#define B2B_REF_MNP (-2) +#define B2B_INDEL_NULL 10000 typedef struct __bcf_callaux_t { int capQ, min_baseQ; @@ -17,7 +13,7 @@ typedef struct __bcf_callaux_t { // for internal uses int max_bases; int indel_types[4]; - int maxins, indelreg, last_mnp_pos; + int maxins, indelreg; char *inscns; uint16_t *bases; errmod_t *e; @@ -32,7 +28,7 @@ typedef struct { typedef struct { int a[5]; // alleles: ref, alt, alt2, alt3 - int n, n_alleles, shift, ori_ref, unseen; // ori_ref can be B2B_REF_INDEL/B2B_REF_MNP + int n, n_alleles, shift, ori_ref, unseen; int anno[16], depth, ori_depth; uint8_t *PL; } bcf_call_t; @@ -49,7 +45,6 @@ extern "C" { const bcf_callaux_t *bca, const char *ref); int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, const void *rghash); - int bcf_call_mnp_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref); #ifdef __cplusplus } diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 4f747b1..819c1e7 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -408,153 +408,3 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla free(types); free(inscns); return n_alt > 0? 0 : -1; } - -#define END_SKIP 4 // must be larger than B2B_MAX_MNP -#define MIN_MNP_FLANK_BAQ 20 - -int bcf_call_mnp_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref) -{ - extern void ks_introsort_uint32_t(int, uint32_t*); - int i, s, j, k, t, n_types, *types; - int N, ref_seq, ref_type; - if (ref == 0 || bca == 0) return -1; - if (pos < bca->last_mnp_pos + B2B_MNP_WIN) return -2; // to avoid calling a TNP multiple times - if (pos < END_SKIP || ref[pos] == 0 || ref[pos+1] == 0) return -2; // end of the reference - { // determine if there is an MNP - int r[2]; - uint8_t *tt; - r[0] = bam_nt16_table[(int)ref[pos]]; - r[1] = bam_nt16_table[(int)ref[pos+1]]; - for (s = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - int left, rght; - if (p->qpos < END_SKIP || p->qpos >= p->b->core.l_qseq - END_SKIP) continue; - tt = bam1_seq(p->b); - if (bam1_seqi(tt, p->qpos) == r[0] || bam1_seqi(tt, p->qpos+1) == r[1]) continue; // no MNP - tt = bam1_qual(p->b); - for (left = 0, k = p->qpos - END_SKIP; k <= p->qpos; ++k) - left = left > tt[k]? left : tt[k]; - for (rght = 0, k = p->qpos; k < p->b->core.l_qseq - END_SKIP; ++k) - rght = rght > tt[k]? left : tt[k]; - if (left >= MIN_MNP_FLANK_BAQ && rght >= MIN_MNP_FLANK_BAQ) break; // bracketed by good bases - } - if (i != n_plp[s]) break; - } - if (s == n) return -1; // there is no MNP at this position. - } - for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads - { // construct the MNP consensus - uint8_t *tt; - uint32_t *aux, x, *cnt; - int m = 0; - aux = calloc(N + 1, 4); - for (i = 0, x = 0; i < B2B_MAX_MNP; ++i) - x |= bam_nt16_nt4_table[bam_nt16_table[(int)ref[pos+i]]] << 2*i; - ref_seq = x; - bca->indelreg = 0; - for (s = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - int stop; - if (p->qpos < END_SKIP || p->qpos >= p->b->core.l_qseq - END_SKIP) continue; - tt = bam1_seq(p->b); - for (k = j = stop = 0, x = 0; k < B2B_MAX_MNP; ++k) { - int c = bam_nt16_nt4_table[bam1_seqi(tt, p->qpos + k)]; - if (c > 3) break; - if (c != (ref_seq>>k*2&3) && !stop) ++j; - else stop = 1; - x |= c << 2*k; - } - if (k == B2B_MAX_MNP && j >= 2) aux[m++] = x; - } - } - if (m == 0) { - free(aux); return -1; - } - ks_introsort(uint32_t, m, aux); - // squeeze out indentical types - for (i = 1, n_types = 1; i < m; ++i) - if (aux[i] != aux[i-1]) ++n_types; - // count reads for each type - cnt = alloca(n_types * 4); - cnt[0] = 1<<8 | aux[0]; - for (i = 1, t = 0; i < m; ++i) { - if (aux[i] != aux[i-1]) { - ++t; - cnt[t] = 1<<8 | aux[i]; - } else cnt[t] += 1<<8; - } - free(aux); - // collect types; NOTE: we only collect ONE alternative type - ks_introsort(uint32_t, n_types, cnt); - if ((cnt[n_types-1]>>8) * MIN_SUPPORT_COEF < N) // no MNPs or too few supporting reads - return -1; - types = (int*)calloc(2, sizeof(int)); - bca->indel_types[0] = types[0] = ref_seq; - bca->indel_types[1] = types[1] = cnt[n_types-1]&0xff; - bca->indel_types[2] = bca->indel_types[3] = B2B_INDEL_NULL; - ref_type = 0; n_types = 2; - // calculate MNP length - for (i = 0; i < B2B_MAX_MNP; ++i) { - int c = types[0] >> 2*i & 3; - for (t = 1; t < n_types; ++t) - if ((types[t] >> 2*i & 3) != c) break; - if (t == n_types) break; - } - bca->indelreg = i; - if (bca->indelreg < 2) return -1; // should not happen in principle - x = (1<<2*bca->indelreg) - 1; - for (t = 0; t < n_types; ++t) types[t] &= x; - ref_seq &= x; - } - { // calling - for (s = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - uint8_t *qq, *bq, *seq = bam1_seq(p->b); - uint32_t x, *cigar = bam1_cigar(p->b); - int y, skip = 0, baseQ = 0, baq = 0; - // try to get the original base quality - qq = bam1_qual(p->b); - bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); - if (bq) ++bq; - // get the max original base quality - for (j = p->qpos, x = 0; j < p->b->core.l_qseq && j < p->qpos + bca->indelreg; ++j) { - int q = bq? qq[j] + (bq[j] - 64) : qq[j]; - int c = bam_nt16_nt4_table[bam1_seqi(seq, j)]; - baseQ = baseQ > q? baseQ : q; - if (c > 3) skip = 1; - x |= c << 2 * (j - p->qpos); - } - for (t = 0; t < n_types; ++t) - if (types[t] == x) break; - if (t == n_types) skip = 1; - else { // reapply BAQ - for (k = y = 0; k < p->b->core.n_cigar; ++k) { - int op = cigar[k]&0xf; - int len = cigar[k]>>4; - if (op == BAM_CMATCH) { - if (p->qpos >= y && p->qpos + bca->indelreg <= y + len) { - int left, rght; - for (j = y, left = 0; j <= p->qpos; ++j) - left = left > qq[j]? left : qq[j]; - for (j = p->qpos + bca->indelreg - 1, rght = 0; j < y + len; ++j) - rght = rght > qq[j]? rght : qq[j]; - baq = left < rght? left : rght; - break; - } - y += len; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += len; - } - } -// fprintf(stderr, "pos=%d read=%d:%d name=%s skip=%d reg=%d call=%d baseQ=%d baq=%d\n", pos+1, s, i, bam1_qname(p->b), skip, bca->indelreg, t, baseQ, baq); - baseQ = baseQ < baq? baseQ : baq; - p->aux = skip? 0 : (t<<16|baseQ<<8|baseQ); - } - } - } - // free - free(types); - return 0; -} diff --git a/bam_plcmd.c b/bam_plcmd.c index 7f13eee..e3f73aa 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -738,31 +738,16 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) (conf->flag&MPLP_FMT_SP), 0, 0); bcf_write(bp, bh, b); bcf_destroy(b); - if (!(conf->flag&MPLP_NO_INDEL)) { - // call MNPs - if (bcf_call_mnp_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref) >= 0) { - for (i = 0; i < gplp.n; ++i) - bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], B2B_REF_MNP, bca, bcr + i); - if (bcf_call_combine(gplp.n, bcr, B2B_REF_MNP, &bc) >= 0) { - b = calloc(1, sizeof(bcf1_t)); - bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, - (conf->flag&MPLP_FMT_SP), bca, ref); - bcf_write(bp, bh, b); - bcf_destroy(b); - bca->last_mnp_pos = pos; - } - } - // call indels - if (bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { - for (i = 0; i < gplp.n; ++i) - bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], B2B_REF_INDEL, bca, bcr + i); - if (bcf_call_combine(gplp.n, bcr, B2B_REF_INDEL, &bc) >= 0) { - b = calloc(1, sizeof(bcf1_t)); - bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, - (conf->flag&MPLP_FMT_SP), bca, ref); - bcf_write(bp, bh, b); - bcf_destroy(b); - } + // call indels + if (!(conf->flag&MPLP_NO_INDEL) && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { + for (i = 0; i < gplp.n; ++i) + bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); + if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { + b = calloc(1, sizeof(bcf1_t)); + bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, + (conf->flag&MPLP_FMT_SP), bca, ref); + bcf_write(bp, bh, b); + bcf_destroy(b); } } } else { diff --git a/bamtk.c b/bamtk.c index 1d0a758..f3b09d6 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.10-6 (r839)" +#define PACKAGE_VERSION "0.1.10-7 (r840)" #endif int bam_taf2baf(int argc, char *argv[]);