]> git.donarmstrong.com Git - samtools.git/commitdiff
* samtools-0.1.10-4 (r840)
authorHeng Li <lh3@live.co.uk>
Fri, 19 Nov 2010 18:45:51 +0000 (18:45 +0000)
committerHeng Li <lh3@live.co.uk>
Fri, 19 Nov 2010 18:45:51 +0000 (18:45 +0000)
 * drop the MNP caller. It is slow while does not diliver too much
   benefit. Possibly I will work on it in future given more time.

 * there is a segfault in pileup

 * someone has reported segfault from view/index/sort

ChangeLog
bam2bcf.c
bam2bcf.h
bam2bcf_indel.c
bam_plcmd.c
bamtk.c

index f8456a3e61f1e731343c53fbb0aa859751332282..973601e0cbd4498412ab7ce9e8a1817ac88cb939 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,91 @@
+------------------------------------------------------------------------
+r839 | lh3lh3 | 2010-11-18 17:30:11 -0500 (Thu, 18 Nov 2010) | 9 lines
+Changed paths:
+   M /trunk/samtools/bam2bcf.h
+   M /trunk/samtools/bam2bcf_indel.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10-6 (r839)
+
+ * call MNPs without realignment because it seems to me that it is not
+   worthwhile to significantly slow down SNP calling.
+
+ * the result looks quite different from the previous version. I have
+   work to do...
+
+
+------------------------------------------------------------------------
+r838 | lh3lh3 | 2010-11-18 11:26:09 -0500 (Thu, 18 Nov 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/knetfile.c
+
+Apply a patch by Rob Davis, which improves fault detection.
+
+------------------------------------------------------------------------
+r836 | lh3lh3 | 2010-11-18 11:09:23 -0500 (Thu, 18 Nov 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam2bcf_indel.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-r836
+ * initiate MNP realignment when the MNP has at least 0.2% frequency (otherwise too slow)
+
+------------------------------------------------------------------------
+r835 | lh3lh3 | 2010-11-18 00:25:13 -0500 (Thu, 18 Nov 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/bcftools/vcfutils.pl
+
+ * modify the filtering rule: also filter SNPs around filtered indels
+ * added MNP filter
+
+------------------------------------------------------------------------
+r834 | lh3lh3 | 2010-11-17 23:13:52 -0500 (Wed, 17 Nov 2010) | 4 lines
+Changed paths:
+   M /trunk/samtools/bam2bcf.c
+   M /trunk/samtools/bam2bcf_indel.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10-4 (r834)
+ * fixed a silly bug in printing MNP
+ * restrict to at most 1 alternative allele
+
+------------------------------------------------------------------------
+r833 | lh3lh3 | 2010-11-17 21:58:58 -0500 (Wed, 17 Nov 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam2bcf.c
+   M /trunk/samtools/bamtk.c
+
+fixed a bug in printing MNPs
+
+------------------------------------------------------------------------
+r832 | lh3lh3 | 2010-11-17 21:47:20 -0500 (Wed, 17 Nov 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/bam2bcf_indel.c
+   M /trunk/samtools/bamtk.c
+
+minor change to how seqQ is applied
+
+------------------------------------------------------------------------
+r831 | lh3lh3 | 2010-11-17 21:41:12 -0500 (Wed, 17 Nov 2010) | 3 lines
+Changed paths:
+   M /trunk/samtools/bam2bcf.c
+   M /trunk/samtools/bam2bcf.h
+   M /trunk/samtools/bam2bcf_indel.c
+   M /trunk/samtools/bam_plcmd.c
+   M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10 (r831)
+ * initial MNP caller
+
+------------------------------------------------------------------------
+r829 | lh3lh3 | 2010-11-16 23:14:15 -0500 (Tue, 16 Nov 2010) | 2 lines
+Changed paths:
+   M /trunk/samtools/ChangeLog
+   M /trunk/samtools/NEWS
+   M /trunk/samtools/bamtk.c
+
+Release samtools-0.1.10 (r829)
+
 ------------------------------------------------------------------------
 r828 | lh3lh3 | 2010-11-16 20:48:49 -0500 (Tue, 16 Nov 2010) | 2 lines
 Changed paths:
index 45a0ccc36592edbcd8647e731a6476900c4b2c31..088635c0b04a549b762d1a3b28efad82f4bb5c80 100644 (file)
--- a/bam2bcf.c
+++ b/bam2bcf.c
@@ -22,7 +22,6 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
        bca->capQ = 60;
        bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
        bca->min_baseQ = min_baseQ;
-       bca->last_mnp_pos = -100;
        bca->e = errmod_init(1. - theta);
        return bca;
 }
@@ -33,10 +32,8 @@ void bcf_call_destroy(bcf_callaux_t *bca)
        errmod_destroy(bca->e);
        free(bca->bases); free(bca->inscns); free(bca);
 }
-/* Compute genotype likelihood by combining likelihood of each
- * read. ref_base is the 4-bit representation of the reference base. It
- * is negative if we are looking at an indel or an MNP. Indel and MNP
- * are indistinguishable in this function. */
+/* ref_base is the 4-bit representation of the reference base. It is
+ * negative if we are looking at an indel. */
 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
 {
        int i, n, ref4, is_indel, ori_depth = 0;
@@ -95,9 +92,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
        errmod_cal(bca->e, n, 5, bca->bases, r->p);
        return r->depth;
 }
-/* Combine individual calls. ref_base is a 4-bit A/C/G/T for a SNP, or
- * B2B_REF_MNP, or B2B_REF_INDEL. Indel and MNP are indistinguishable in
- * this function. */
+
 int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
 {
        int ref4, i, j, qsum[4];
@@ -105,7 +100,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/,
        if (ref_base >= 0) {
                call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
                if (ref4 > 4) ref4 = 4;
-       } else call->ori_ref = ref_base, ref4 = 0; // ref_base < 0
+       } else call->ori_ref = -1, ref4 = 0;
        // calculate qsum
        memset(qsum, 0, 4 * sizeof(int));
        for (i = 0; i < n; ++i)
@@ -130,7 +125,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/,
                if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
                        call->unseen = j, call->a[j++] = qsum[i]&3;
                call->n_alleles = j;
-       } else { // for indels and MNPs
+       } else {
                call->n_alleles = j;
                if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
        }
@@ -173,8 +168,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/,
        }
        return 0;
 }
-/* Fill a bcf1_t record. The type of the record (SNP, MNP or INDEL) is
- * determined by bca->ori_ref. */
+
 int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP,
                                 const bcf_callaux_t *bca, const char *ref)
 {
@@ -185,7 +179,7 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc
        b->tid = tid; b->pos = pos; b->qual = 0;
        s.s = b->str; s.m = b->m_str; s.l = 0;
        kputc('\0', &s);
-       if (bc->ori_ref == B2B_REF_INDEL) { // an indel
+       if (bc->ori_ref < 0) { // an indel
                // write REF
                kputc(ref[pos], &s);
                for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+1+j], &s);
@@ -208,16 +202,6 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc
                        }
                }
                kputc('\0', &s);
-       } else if (bc->ori_ref == B2B_REF_MNP) {
-               for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+j], &s);
-               kputc('\0', &s);
-               for (i = 1; i < 4; ++i) {
-                       if (bc->a[i] < 0) break;
-                       if (i > 1) kputc(',', &s);
-                       for (j = 0; j < bca->indelreg; ++j)
-                               kputc("ACGT"[bca->indel_types[i]>>2*j&3], &s);
-               }
-               kputc('\0', &s);
        } else { // a SNP
                kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
                for (i = 1; i < 5; ++i) {
@@ -229,8 +213,7 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc
        }
        kputc('\0', &s);
        // INFO
-       if (bc->ori_ref == B2B_REF_INDEL) kputs("INDEL;", &s);
-       else if (bc->ori_ref == B2B_REF_MNP) kputs("MNP;", &s);
+       if (bc->ori_ref < 0) kputs("INDEL;", &s);
        kputs("DP=", &s); kputw(bc->ori_depth, &s); kputs(";I16=", &s);
        for (i = 0; i < 16; ++i) {
                if (i) kputc(',', &s);
index 4ea48095e3566ed1d55fa96bdcf873b4ba0da83f..26b022c08e89e3597bd6e400690d42bd7ecabe38 100644 (file)
--- a/bam2bcf.h
+++ b/bam2bcf.h
@@ -5,11 +5,7 @@
 #include "errmod.h"
 #include "bcftools/bcf.h"
 
-#define B2B_INDEL_NULL 0x7fff
-#define B2B_MAX_MNP    4 // cannot be larger than 4!!!
-#define B2B_MNP_WIN    10
-#define B2B_REF_INDEL  (-1)
-#define B2B_REF_MNP    (-2)
+#define B2B_INDEL_NULL 10000
 
 typedef struct __bcf_callaux_t {
        int capQ, min_baseQ;
@@ -17,7 +13,7 @@ typedef struct __bcf_callaux_t {
        // for internal uses
        int max_bases;
        int indel_types[4];
-       int maxins, indelreg, last_mnp_pos;
+       int maxins, indelreg;
        char *inscns;
        uint16_t *bases;
        errmod_t *e;
@@ -32,7 +28,7 @@ typedef struct {
 
 typedef struct {
        int a[5]; // alleles: ref, alt, alt2, alt3
-       int n, n_alleles, shift, ori_ref, unseen; // ori_ref can be B2B_REF_INDEL/B2B_REF_MNP
+       int n, n_alleles, shift, ori_ref, unseen;
        int anno[16], depth, ori_depth;
        uint8_t *PL;
 } bcf_call_t;
@@ -49,7 +45,6 @@ extern "C" {
                                         const bcf_callaux_t *bca, const char *ref);
        int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
                                                  const void *rghash);
-       int bcf_call_mnp_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref);
 
 #ifdef __cplusplus
 }
index 4f747b156f439bf5d6e516cea3856bf5054e831c..819c1e7b1371296b7da5536939fab691f53c1722 100644 (file)
@@ -408,153 +408,3 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
        free(types); free(inscns);
        return n_alt > 0? 0 : -1;
 }
-
-#define END_SKIP 4 // must be larger than B2B_MAX_MNP
-#define MIN_MNP_FLANK_BAQ 20
-
-int bcf_call_mnp_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref)
-{
-       extern void ks_introsort_uint32_t(int, uint32_t*);
-       int i, s, j, k, t, n_types, *types;
-       int N, ref_seq, ref_type;
-       if (ref == 0 || bca == 0) return -1;
-       if (pos < bca->last_mnp_pos + B2B_MNP_WIN) return -2; // to avoid calling a TNP multiple times
-       if (pos < END_SKIP || ref[pos] == 0 || ref[pos+1] == 0) return -2; // end of the reference
-       { // determine if there is an MNP
-               int r[2];
-               uint8_t *tt;
-               r[0] = bam_nt16_table[(int)ref[pos]];
-               r[1] = bam_nt16_table[(int)ref[pos+1]];
-               for (s = 0; s < n; ++s) {
-                       for (i = 0; i < n_plp[s]; ++i) {
-                               bam_pileup1_t *p = plp[s] + i;
-                               int left, rght;
-                               if (p->qpos < END_SKIP || p->qpos >= p->b->core.l_qseq - END_SKIP) continue;
-                               tt = bam1_seq(p->b);
-                               if (bam1_seqi(tt, p->qpos) == r[0] || bam1_seqi(tt, p->qpos+1) == r[1]) continue; // no MNP
-                               tt = bam1_qual(p->b);
-                               for (left = 0, k = p->qpos - END_SKIP; k <= p->qpos; ++k)
-                                       left = left > tt[k]? left : tt[k];
-                               for (rght = 0, k = p->qpos; k < p->b->core.l_qseq - END_SKIP; ++k)
-                                       rght = rght > tt[k]? left : tt[k];
-                               if (left >= MIN_MNP_FLANK_BAQ && rght >= MIN_MNP_FLANK_BAQ) break; // bracketed by good bases
-                       }
-                       if (i != n_plp[s]) break;
-               }
-               if (s == n) return -1; // there is no MNP at this position.
-       }
-       for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
-       { // construct the MNP consensus
-               uint8_t *tt;
-               uint32_t *aux, x, *cnt;
-               int m = 0;
-               aux = calloc(N + 1, 4);
-               for (i = 0, x = 0; i < B2B_MAX_MNP; ++i)
-                       x |= bam_nt16_nt4_table[bam_nt16_table[(int)ref[pos+i]]] << 2*i;
-               ref_seq = x;
-               bca->indelreg = 0;
-               for (s = 0; s < n; ++s) {
-                       for (i = 0; i < n_plp[s]; ++i) {
-                               bam_pileup1_t *p = plp[s] + i;
-                               int stop;
-                               if (p->qpos < END_SKIP || p->qpos >= p->b->core.l_qseq - END_SKIP) continue;
-                               tt = bam1_seq(p->b);
-                               for (k = j = stop = 0, x = 0; k < B2B_MAX_MNP; ++k) {
-                                       int c = bam_nt16_nt4_table[bam1_seqi(tt, p->qpos + k)];
-                                       if (c > 3) break;
-                                       if (c != (ref_seq>>k*2&3) && !stop) ++j;
-                                       else stop = 1;
-                                       x |= c << 2*k;
-                               }
-                               if (k == B2B_MAX_MNP && j >= 2) aux[m++] = x;
-                       }
-               }
-               if (m == 0) {
-                       free(aux); return -1;
-               }
-               ks_introsort(uint32_t, m, aux);
-               // squeeze out indentical types
-               for (i = 1, n_types = 1; i < m; ++i)
-                       if (aux[i] != aux[i-1]) ++n_types;
-               // count reads for each type
-               cnt = alloca(n_types * 4);
-               cnt[0] = 1<<8 | aux[0];
-               for (i = 1, t = 0; i < m; ++i) {
-                       if (aux[i] != aux[i-1]) {
-                               ++t;
-                               cnt[t] = 1<<8 | aux[i];
-                       } else cnt[t] += 1<<8;
-               }
-               free(aux);
-               // collect types; NOTE: we only collect ONE alternative type
-               ks_introsort(uint32_t, n_types, cnt);
-               if ((cnt[n_types-1]>>8) * MIN_SUPPORT_COEF < N) // no MNPs or too few supporting reads
-                       return -1;
-               types = (int*)calloc(2, sizeof(int));
-               bca->indel_types[0] = types[0] = ref_seq;
-               bca->indel_types[1] = types[1] = cnt[n_types-1]&0xff;
-               bca->indel_types[2] = bca->indel_types[3] = B2B_INDEL_NULL;
-               ref_type = 0; n_types = 2;
-               // calculate MNP length
-               for (i = 0; i < B2B_MAX_MNP; ++i) {
-                       int c = types[0] >> 2*i & 3;
-                       for (t = 1; t < n_types; ++t)
-                               if ((types[t] >> 2*i & 3) != c) break;
-                       if (t == n_types) break;
-               }
-               bca->indelreg = i;
-               if (bca->indelreg < 2) return -1; // should not happen in principle
-               x = (1<<2*bca->indelreg) - 1;
-               for (t = 0; t < n_types; ++t) types[t] &= x;
-               ref_seq &= x;
-       }
-       { // calling
-               for (s = 0; s < n; ++s) {
-                       for (i = 0; i < n_plp[s]; ++i) {
-                               bam_pileup1_t *p = plp[s] + i;
-                               uint8_t *qq, *bq, *seq = bam1_seq(p->b);
-                               uint32_t x, *cigar = bam1_cigar(p->b);
-                               int y, skip = 0, baseQ = 0, baq = 0;
-                               // try to get the original base quality
-                               qq = bam1_qual(p->b);
-                               bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
-                               if (bq) ++bq;
-                               // get the max original base quality
-                               for (j = p->qpos, x = 0; j < p->b->core.l_qseq && j < p->qpos + bca->indelreg; ++j) {
-                                       int q = bq? qq[j] + (bq[j] - 64) : qq[j];
-                                       int c = bam_nt16_nt4_table[bam1_seqi(seq, j)];
-                                       baseQ = baseQ > q? baseQ : q;
-                                       if (c > 3) skip = 1;
-                                       x |= c << 2 * (j - p->qpos);
-                               }
-                               for (t = 0; t < n_types; ++t)
-                                       if (types[t] == x) break;
-                               if (t == n_types) skip = 1;
-                               else { // reapply BAQ
-                                       for (k = y = 0; k < p->b->core.n_cigar; ++k) {
-                                               int op = cigar[k]&0xf;
-                                               int len = cigar[k]>>4;
-                                               if (op == BAM_CMATCH) {
-                                                       if (p->qpos >= y && p->qpos + bca->indelreg <= y + len) {
-                                                               int left, rght;
-                                                               for (j = y, left = 0; j <= p->qpos; ++j)
-                                                                       left = left > qq[j]? left : qq[j];
-                                                               for (j = p->qpos + bca->indelreg - 1, rght = 0; j < y + len; ++j)
-                                                                       rght = rght > qq[j]? rght : qq[j];
-                                                               baq = left < rght? left : rght;
-                                                               break;
-                                                       }
-                                                       y += len;
-                                               } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += len;
-                                       }
-                               }
-//                             fprintf(stderr, "pos=%d read=%d:%d name=%s skip=%d reg=%d call=%d baseQ=%d baq=%d\n", pos+1, s, i, bam1_qname(p->b), skip, bca->indelreg, t, baseQ, baq);
-                               baseQ = baseQ < baq? baseQ : baq;
-                               p->aux = skip? 0 : (t<<16|baseQ<<8|baseQ);
-                       }
-               }
-       }
-       // free
-       free(types);
-       return 0;
-}
index 7f13eee4ed51a987a6386796eb4eeae2bad65891..e3f73aaddd03a14343a82084ee060d9c41ba42d4 100644 (file)
@@ -738,31 +738,16 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
                                                 (conf->flag&MPLP_FMT_SP), 0, 0);
                        bcf_write(bp, bh, b);
                        bcf_destroy(b);
-                       if (!(conf->flag&MPLP_NO_INDEL)) {
-                               // call MNPs
-                               if (bcf_call_mnp_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref) >= 0) {
-                                       for (i = 0; i < gplp.n; ++i)
-                                               bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], B2B_REF_MNP, bca, bcr + i);
-                                       if (bcf_call_combine(gplp.n, bcr, B2B_REF_MNP, &bc) >= 0) {
-                                               b = calloc(1, sizeof(bcf1_t));
-                                               bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
-                                                                        (conf->flag&MPLP_FMT_SP), bca, ref);
-                                               bcf_write(bp, bh, b);
-                                               bcf_destroy(b);
-                                               bca->last_mnp_pos = pos;
-                                       }
-                               }
-                               // call indels
-                               if (bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
-                                       for (i = 0; i < gplp.n; ++i)
-                                               bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], B2B_REF_INDEL, bca, bcr + i);
-                                       if (bcf_call_combine(gplp.n, bcr, B2B_REF_INDEL, &bc) >= 0) {
-                                               b = calloc(1, sizeof(bcf1_t));
-                                               bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
-                                                                        (conf->flag&MPLP_FMT_SP), bca, ref);
-                                               bcf_write(bp, bh, b);
-                                               bcf_destroy(b);
-                                       }
+                       // call indels
+                       if (!(conf->flag&MPLP_NO_INDEL) && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
+                               for (i = 0; i < gplp.n; ++i)
+                                       bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
+                               if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
+                                       b = calloc(1, sizeof(bcf1_t));
+                                       bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
+                                                                (conf->flag&MPLP_FMT_SP), bca, ref);
+                                       bcf_write(bp, bh, b);
+                                       bcf_destroy(b);
                                }
                        }
                } else {
diff --git a/bamtk.c b/bamtk.c
index 1d0a758cada1f0b2418385139b6b2b46c6ffafd3..f3b09d64b8754ca05ad9295e46d3ad800cb53568 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -9,7 +9,7 @@
 #endif
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.10-6 (r839)"
+#define PACKAGE_VERSION "0.1.10-7 (r840)"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);