]> git.donarmstrong.com Git - samtools.git/commitdiff
* samtools-0.1.9-9 (r810)
authorHeng Li <lh3@live.co.uk>
Thu, 11 Nov 2010 05:18:02 +0000 (05:18 +0000)
committerHeng Li <lh3@live.co.uk>
Thu, 11 Nov 2010 05:18:02 +0000 (05:18 +0000)
 * use forward, instead of viterbi, for realignment
 * realignment is now quality aware

bam2bcf_indel.c
bam_md.c
bam_plcmd.c
bamtk.c

index 7f6e288014cf1e0b627f3e7f2150e6dfbef5a03d..ab9b499b1afeb4acd7f180002f679fddb8f9eddc 100644 (file)
@@ -236,8 +236,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
        bca->indelreg = 0;
        for (t = 0; t < n_types; ++t) {
                int l, ir;
-               ka_param2_t ap = ka_param2_qual;
-               ap.band_width = abs(types[t]) + 3;
+               kpa_par_t ap = { 1e-4, 1e-2, 10 };
+               ap.bw = abs(types[t]) + 3;
                // compute indelreg
                if (types[t] == 0) ir = 0;
                else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
@@ -264,6 +264,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
                                bam_pileup1_t *p = plp[s] + i;
                                int qbeg, qend, tbeg, tend, sc;
                                uint8_t *seq = bam1_seq(p->b);
+                               // FIXME: the following skips soft clips, but using them may be more sensitive.
                                // determine the start and end of sequences for alignment
                                qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left,  0, &tbeg);
                                qend = tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend);
@@ -274,17 +275,18 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
                                // write the query sequence
                                for (l = qbeg; l < qend; ++l)
                                        query[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(seq, l)];
-                               // do alignment; this takes most of computing time for indel calling
-                               if (0) {
-                                       uint8_t *qq = calloc(qend - qbeg, 1);
-                                       for (l = 0; l < qend - qbeg; ++l) qq[l] = 23;
+                               { // do alignment; this is the bottleneck
+                                       const uint8_t *qual = bam1_qual(p->b), *bq;
+                                       uint8_t *qq = 0;
+                                       qq = calloc(qend - qbeg, 1);
+                                       bq = (uint8_t*)bam_aux_get(p->b, "BQ");
+                                       if (bq) ++bq;
+                                       for (l = qbeg; l < qend; ++l)
+                                               qq[l - qbeg] = bq? qual[l] + (bq[l] - 33) : qual[l];
                                        sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
-                                                                       (uint8_t*)query, qend - qbeg, qq, &kpa_par_alt, 0, 0);
+                                                                       (uint8_t*)query, qend - qbeg, qq, &ap, 0, 0);
                                        score[K*n_types + t] = sc;
-                               } else {
-                                       sc = ka_global_score((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
-                                                                                (uint8_t*)query, qend - qbeg, &ap);
-                                       score[K*n_types + t] = -sc;
+                                       free(qq);
                                }
 /*
                                for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
index 0377226773262dadca7185a12fa9aec755810e4f..a8ddd1baf8aa78d6911ca7a804a549a1d27eb06b 100644 (file)
--- a/bam_md.c
+++ b/bam_md.c
@@ -162,7 +162,7 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
        return (int)(t + .499);
 }
 
-int bam_prob_realn(bam1_t *b, const char *ref)
+int bam_prob_realn_core(bam1_t *b, const char *ref, int write_bq)
 {
        int k, i, bw, x, y, yb, ye, xb, xe;
        uint32_t *cigar = bam1_cigar(b);
@@ -193,8 +193,12 @@ int bam_prob_realn(bam1_t *b, const char *ref)
        if (xe - xb - c->l_qseq > bw)
                xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
        { // glocal
-               uint8_t *s, *r, *q, *seq = bam1_seq(b), *qual = bam1_qual(b);
+               uint8_t *s, *r, *q, *seq = bam1_seq(b), *qual = bam1_qual(b), *bq = 0;
                int *state;
+               if (write_bq) {
+                       bq = calloc(c->l_qseq + 1, 1);
+                       memcpy(bq, qual, c->l_qseq);
+               }
                s = calloc(c->l_qseq, 1);
                for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)];
                r = calloc(xe - xb, 1);
@@ -214,11 +218,21 @@ int bam_prob_realn(bam1_t *b, const char *ref)
                        } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
                        else if (op == BAM_CDEL) x += l;
                }
+               if (write_bq) {
+                       for (i = 0; i < c->l_qseq; ++i) bq[i] = bq[i] - qual[i] + 33;
+                       bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
+                       free(bq);
+               }
                free(s); free(r); free(q); free(state);
        }
        return 0;
 }
 
+int bam_prob_realn(bam1_t *b, const char *ref)
+{
+       return bam_prob_realn_core(b, ref, 0);
+}
+
 int bam_fillmd(int argc, char *argv[])
 {
        int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm = 0, is_realn, capQ = 0;
@@ -276,7 +290,7 @@ int bam_fillmd(int argc, char *argv[])
                                        fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
                                                        fp->header->target_name[tid]);
                        }
-                       if (is_realn) bam_prob_realn(b, ref);
+                       if (is_realn) bam_prob_realn_core(b, ref, 0);
                        if (capQ > 10) {
                                int q = bam_cap_mapQ(b, ref, capQ);
                                if (b->core.qual > q) b->core.qual = q;
index 1dc819a6d8870782dd94eb5d446821c4622d4b94..ad6134b3ee7cdeb179c3a86fd46658b0aef83aef 100644 (file)
@@ -556,7 +556,7 @@ typedef struct {
 static int mplp_func(void *data, bam1_t *b)
 {
        extern int bam_realn(bam1_t *b, const char *ref);
-       extern int bam_prob_realn(bam1_t *b, const char *ref);
+       extern int bam_prob_realn_core(bam1_t *b, const char *ref, int);
        extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres);
        mplp_aux_t *ma = (mplp_aux_t*)data;
        int ret, skip = 0;
@@ -565,7 +565,7 @@ static int mplp_func(void *data, bam1_t *b)
                ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b);
                if (ret < 0) break;
                skip = 0;
-               if (has_ref && (ma->flag&MPLP_REALN)) bam_prob_realn(b, ma->ref);
+               if (has_ref && (ma->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, 1);
                if (has_ref && ma->capQ_thres > 10) {
                        int q = bam_cap_mapQ(b, ma->ref, ma->capQ_thres);
                        if (q < 0) skip = 1;
diff --git a/bamtk.c b/bamtk.c
index 04a7899bceade9d1ef37579c40007b9a2348ad51..922dee3d4d9d619008a80d404cc2779766864cb6 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -9,7 +9,7 @@
 #endif
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.9-8 (r809)"
+#define PACKAGE_VERSION "0.1.9-9 (r810)"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);