+------------------------------------------------------------------------
+r839 | lh3lh3 | 2010-11-18 17:30:11 -0500 (Thu, 18 Nov 2010) | 9 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10-6 (r839)
+
+ * call MNPs without realignment because it seems to me that it is not
+ worthwhile to significantly slow down SNP calling.
+
+ * the result looks quite different from the previous version. I have
+ work to do...
+
+
+------------------------------------------------------------------------
+r838 | lh3lh3 | 2010-11-18 11:26:09 -0500 (Thu, 18 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/knetfile.c
+
+Apply a patch by Rob Davis, which improves fault detection.
+
+------------------------------------------------------------------------
+r836 | lh3lh3 | 2010-11-18 11:09:23 -0500 (Thu, 18 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-r836
+ * initiate MNP realignment when the MNP has at least 0.2% frequency (otherwise too slow)
+
+------------------------------------------------------------------------
+r835 | lh3lh3 | 2010-11-18 00:25:13 -0500 (Thu, 18 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+ * modify the filtering rule: also filter SNPs around filtered indels
+ * added MNP filter
+
+------------------------------------------------------------------------
+r834 | lh3lh3 | 2010-11-17 23:13:52 -0500 (Wed, 17 Nov 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10-4 (r834)
+ * fixed a silly bug in printing MNP
+ * restrict to at most 1 alternative allele
+
+------------------------------------------------------------------------
+r833 | lh3lh3 | 2010-11-17 21:58:58 -0500 (Wed, 17 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bamtk.c
+
+fixed a bug in printing MNPs
+
+------------------------------------------------------------------------
+r832 | lh3lh3 | 2010-11-17 21:47:20 -0500 (Wed, 17 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+minor change to how seqQ is applied
+
+------------------------------------------------------------------------
+r831 | lh3lh3 | 2010-11-17 21:41:12 -0500 (Wed, 17 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10 (r831)
+ * initial MNP caller
+
+------------------------------------------------------------------------
+r829 | lh3lh3 | 2010-11-16 23:14:15 -0500 (Tue, 16 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/NEWS
+ M /trunk/samtools/bamtk.c
+
+Release samtools-0.1.10 (r829)
+
------------------------------------------------------------------------
r828 | lh3lh3 | 2010-11-16 20:48:49 -0500 (Tue, 16 Nov 2010) | 2 lines
Changed paths:
bca->capQ = 60;
bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
bca->min_baseQ = min_baseQ;
- bca->last_mnp_pos = -100;
bca->e = errmod_init(1. - theta);
return bca;
}
errmod_destroy(bca->e);
free(bca->bases); free(bca->inscns); free(bca);
}
-/* Compute genotype likelihood by combining likelihood of each
- * read. ref_base is the 4-bit representation of the reference base. It
- * is negative if we are looking at an indel or an MNP. Indel and MNP
- * are indistinguishable in this function. */
+/* ref_base is the 4-bit representation of the reference base. It is
+ * negative if we are looking at an indel. */
int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
{
int i, n, ref4, is_indel, ori_depth = 0;
errmod_cal(bca->e, n, 5, bca->bases, r->p);
return r->depth;
}
-/* Combine individual calls. ref_base is a 4-bit A/C/G/T for a SNP, or
- * B2B_REF_MNP, or B2B_REF_INDEL. Indel and MNP are indistinguishable in
- * this function. */
+
int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
{
int ref4, i, j, qsum[4];
if (ref_base >= 0) {
call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
if (ref4 > 4) ref4 = 4;
- } else call->ori_ref = ref_base, ref4 = 0; // ref_base < 0
+ } else call->ori_ref = -1, ref4 = 0;
// calculate qsum
memset(qsum, 0, 4 * sizeof(int));
for (i = 0; i < n; ++i)
if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
call->unseen = j, call->a[j++] = qsum[i]&3;
call->n_alleles = j;
- } else { // for indels and MNPs
+ } else {
call->n_alleles = j;
if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
}
}
return 0;
}
-/* Fill a bcf1_t record. The type of the record (SNP, MNP or INDEL) is
- * determined by bca->ori_ref. */
+
int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP,
const bcf_callaux_t *bca, const char *ref)
{
b->tid = tid; b->pos = pos; b->qual = 0;
s.s = b->str; s.m = b->m_str; s.l = 0;
kputc('\0', &s);
- if (bc->ori_ref == B2B_REF_INDEL) { // an indel
+ if (bc->ori_ref < 0) { // an indel
// write REF
kputc(ref[pos], &s);
for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+1+j], &s);
}
}
kputc('\0', &s);
- } else if (bc->ori_ref == B2B_REF_MNP) {
- for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+j], &s);
- kputc('\0', &s);
- for (i = 1; i < 4; ++i) {
- if (bc->a[i] < 0) break;
- if (i > 1) kputc(',', &s);
- for (j = 0; j < bca->indelreg; ++j)
- kputc("ACGT"[bca->indel_types[i]>>2*j&3], &s);
- }
- kputc('\0', &s);
} else { // a SNP
kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
for (i = 1; i < 5; ++i) {
}
kputc('\0', &s);
// INFO
- if (bc->ori_ref == B2B_REF_INDEL) kputs("INDEL;", &s);
- else if (bc->ori_ref == B2B_REF_MNP) kputs("MNP;", &s);
+ if (bc->ori_ref < 0) kputs("INDEL;", &s);
kputs("DP=", &s); kputw(bc->ori_depth, &s); kputs(";I16=", &s);
for (i = 0; i < 16; ++i) {
if (i) kputc(',', &s);
#include "errmod.h"
#include "bcftools/bcf.h"
-#define B2B_INDEL_NULL 0x7fff
-#define B2B_MAX_MNP 4 // cannot be larger than 4!!!
-#define B2B_MNP_WIN 10
-#define B2B_REF_INDEL (-1)
-#define B2B_REF_MNP (-2)
+#define B2B_INDEL_NULL 10000
typedef struct __bcf_callaux_t {
int capQ, min_baseQ;
// for internal uses
int max_bases;
int indel_types[4];
- int maxins, indelreg, last_mnp_pos;
+ int maxins, indelreg;
char *inscns;
uint16_t *bases;
errmod_t *e;
typedef struct {
int a[5]; // alleles: ref, alt, alt2, alt3
- int n, n_alleles, shift, ori_ref, unseen; // ori_ref can be B2B_REF_INDEL/B2B_REF_MNP
+ int n, n_alleles, shift, ori_ref, unseen;
int anno[16], depth, ori_depth;
uint8_t *PL;
} bcf_call_t;
const bcf_callaux_t *bca, const char *ref);
int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
const void *rghash);
- int bcf_call_mnp_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref);
#ifdef __cplusplus
}
free(types); free(inscns);
return n_alt > 0? 0 : -1;
}
-
-#define END_SKIP 4 // must be larger than B2B_MAX_MNP
-#define MIN_MNP_FLANK_BAQ 20
-
-int bcf_call_mnp_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref)
-{
- extern void ks_introsort_uint32_t(int, uint32_t*);
- int i, s, j, k, t, n_types, *types;
- int N, ref_seq, ref_type;
- if (ref == 0 || bca == 0) return -1;
- if (pos < bca->last_mnp_pos + B2B_MNP_WIN) return -2; // to avoid calling a TNP multiple times
- if (pos < END_SKIP || ref[pos] == 0 || ref[pos+1] == 0) return -2; // end of the reference
- { // determine if there is an MNP
- int r[2];
- uint8_t *tt;
- r[0] = bam_nt16_table[(int)ref[pos]];
- r[1] = bam_nt16_table[(int)ref[pos+1]];
- for (s = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- int left, rght;
- if (p->qpos < END_SKIP || p->qpos >= p->b->core.l_qseq - END_SKIP) continue;
- tt = bam1_seq(p->b);
- if (bam1_seqi(tt, p->qpos) == r[0] || bam1_seqi(tt, p->qpos+1) == r[1]) continue; // no MNP
- tt = bam1_qual(p->b);
- for (left = 0, k = p->qpos - END_SKIP; k <= p->qpos; ++k)
- left = left > tt[k]? left : tt[k];
- for (rght = 0, k = p->qpos; k < p->b->core.l_qseq - END_SKIP; ++k)
- rght = rght > tt[k]? left : tt[k];
- if (left >= MIN_MNP_FLANK_BAQ && rght >= MIN_MNP_FLANK_BAQ) break; // bracketed by good bases
- }
- if (i != n_plp[s]) break;
- }
- if (s == n) return -1; // there is no MNP at this position.
- }
- for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
- { // construct the MNP consensus
- uint8_t *tt;
- uint32_t *aux, x, *cnt;
- int m = 0;
- aux = calloc(N + 1, 4);
- for (i = 0, x = 0; i < B2B_MAX_MNP; ++i)
- x |= bam_nt16_nt4_table[bam_nt16_table[(int)ref[pos+i]]] << 2*i;
- ref_seq = x;
- bca->indelreg = 0;
- for (s = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- int stop;
- if (p->qpos < END_SKIP || p->qpos >= p->b->core.l_qseq - END_SKIP) continue;
- tt = bam1_seq(p->b);
- for (k = j = stop = 0, x = 0; k < B2B_MAX_MNP; ++k) {
- int c = bam_nt16_nt4_table[bam1_seqi(tt, p->qpos + k)];
- if (c > 3) break;
- if (c != (ref_seq>>k*2&3) && !stop) ++j;
- else stop = 1;
- x |= c << 2*k;
- }
- if (k == B2B_MAX_MNP && j >= 2) aux[m++] = x;
- }
- }
- if (m == 0) {
- free(aux); return -1;
- }
- ks_introsort(uint32_t, m, aux);
- // squeeze out indentical types
- for (i = 1, n_types = 1; i < m; ++i)
- if (aux[i] != aux[i-1]) ++n_types;
- // count reads for each type
- cnt = alloca(n_types * 4);
- cnt[0] = 1<<8 | aux[0];
- for (i = 1, t = 0; i < m; ++i) {
- if (aux[i] != aux[i-1]) {
- ++t;
- cnt[t] = 1<<8 | aux[i];
- } else cnt[t] += 1<<8;
- }
- free(aux);
- // collect types; NOTE: we only collect ONE alternative type
- ks_introsort(uint32_t, n_types, cnt);
- if ((cnt[n_types-1]>>8) * MIN_SUPPORT_COEF < N) // no MNPs or too few supporting reads
- return -1;
- types = (int*)calloc(2, sizeof(int));
- bca->indel_types[0] = types[0] = ref_seq;
- bca->indel_types[1] = types[1] = cnt[n_types-1]&0xff;
- bca->indel_types[2] = bca->indel_types[3] = B2B_INDEL_NULL;
- ref_type = 0; n_types = 2;
- // calculate MNP length
- for (i = 0; i < B2B_MAX_MNP; ++i) {
- int c = types[0] >> 2*i & 3;
- for (t = 1; t < n_types; ++t)
- if ((types[t] >> 2*i & 3) != c) break;
- if (t == n_types) break;
- }
- bca->indelreg = i;
- if (bca->indelreg < 2) return -1; // should not happen in principle
- x = (1<<2*bca->indelreg) - 1;
- for (t = 0; t < n_types; ++t) types[t] &= x;
- ref_seq &= x;
- }
- { // calling
- for (s = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- uint8_t *qq, *bq, *seq = bam1_seq(p->b);
- uint32_t x, *cigar = bam1_cigar(p->b);
- int y, skip = 0, baseQ = 0, baq = 0;
- // try to get the original base quality
- qq = bam1_qual(p->b);
- bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
- if (bq) ++bq;
- // get the max original base quality
- for (j = p->qpos, x = 0; j < p->b->core.l_qseq && j < p->qpos + bca->indelreg; ++j) {
- int q = bq? qq[j] + (bq[j] - 64) : qq[j];
- int c = bam_nt16_nt4_table[bam1_seqi(seq, j)];
- baseQ = baseQ > q? baseQ : q;
- if (c > 3) skip = 1;
- x |= c << 2 * (j - p->qpos);
- }
- for (t = 0; t < n_types; ++t)
- if (types[t] == x) break;
- if (t == n_types) skip = 1;
- else { // reapply BAQ
- for (k = y = 0; k < p->b->core.n_cigar; ++k) {
- int op = cigar[k]&0xf;
- int len = cigar[k]>>4;
- if (op == BAM_CMATCH) {
- if (p->qpos >= y && p->qpos + bca->indelreg <= y + len) {
- int left, rght;
- for (j = y, left = 0; j <= p->qpos; ++j)
- left = left > qq[j]? left : qq[j];
- for (j = p->qpos + bca->indelreg - 1, rght = 0; j < y + len; ++j)
- rght = rght > qq[j]? rght : qq[j];
- baq = left < rght? left : rght;
- break;
- }
- y += len;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += len;
- }
- }
-// fprintf(stderr, "pos=%d read=%d:%d name=%s skip=%d reg=%d call=%d baseQ=%d baq=%d\n", pos+1, s, i, bam1_qname(p->b), skip, bca->indelreg, t, baseQ, baq);
- baseQ = baseQ < baq? baseQ : baq;
- p->aux = skip? 0 : (t<<16|baseQ<<8|baseQ);
- }
- }
- }
- // free
- free(types);
- return 0;
-}
(conf->flag&MPLP_FMT_SP), 0, 0);
bcf_write(bp, bh, b);
bcf_destroy(b);
- if (!(conf->flag&MPLP_NO_INDEL)) {
- // call MNPs
- if (bcf_call_mnp_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref) >= 0) {
- for (i = 0; i < gplp.n; ++i)
- bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], B2B_REF_MNP, bca, bcr + i);
- if (bcf_call_combine(gplp.n, bcr, B2B_REF_MNP, &bc) >= 0) {
- b = calloc(1, sizeof(bcf1_t));
- bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
- (conf->flag&MPLP_FMT_SP), bca, ref);
- bcf_write(bp, bh, b);
- bcf_destroy(b);
- bca->last_mnp_pos = pos;
- }
- }
- // call indels
- if (bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
- for (i = 0; i < gplp.n; ++i)
- bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], B2B_REF_INDEL, bca, bcr + i);
- if (bcf_call_combine(gplp.n, bcr, B2B_REF_INDEL, &bc) >= 0) {
- b = calloc(1, sizeof(bcf1_t));
- bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
- (conf->flag&MPLP_FMT_SP), bca, ref);
- bcf_write(bp, bh, b);
- bcf_destroy(b);
- }
+ // call indels
+ if (!(conf->flag&MPLP_NO_INDEL) && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
+ for (i = 0; i < gplp.n; ++i)
+ bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
+ if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
+ b = calloc(1, sizeof(bcf1_t));
+ bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
+ (conf->flag&MPLP_FMT_SP), bca, ref);
+ bcf_write(bp, bh, b);
+ bcf_destroy(b);
}
}
} else {
#endif
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.10-6 (r839)"
+#define PACKAGE_VERSION "0.1.10-7 (r840)"
#endif
int bam_taf2baf(int argc, char *argv[]);