]> git.donarmstrong.com Git - samtools.git/commitdiff
Unfinished modification. Please do not use this revision...
authorHeng Li <lh3@live.co.uk>
Thu, 8 Oct 2009 14:19:16 +0000 (14:19 +0000)
committerHeng Li <lh3@live.co.uk>
Thu, 8 Oct 2009 14:19:16 +0000 (14:19 +0000)
Makefile
bam.c
bam.h
bam_maqcns.c
bamtk.c
kaln.c [new file with mode: 0644]
kaln.h [new file with mode: 0644]

index 450b3abbb83876089fbf1da30df600431a272a6e..90f613e985f424eec0f962c741bb4ba22be22ce0 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ LOBJS=          bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
                        bam_sort.o
 AOBJS=         bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
                        bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o     \
-                       bamtk.o
+                       bamtk.o kaln.o
 PROG=          samtools
 INCLUDES=
 SUBDIRS=       . misc
diff --git a/bam.c b/bam.c
index f6e05ff4285ef6743387e33942b91c91bf0e85fc..533250b95771d73d626a247ca58bc1c2a167fee6 100644 (file)
--- a/bam.c
+++ b/bam.c
@@ -13,26 +13,32 @@ char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
  * CIGAR related routines *
  **************************/
 
-int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg)
+int bam_tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int32_t *_tpos)
 {
-       unsigned k;
-       int32_t x = c->pos, y = 0;
-       int state = 0;
+       int k, x = c->pos, y = 0, last_y = 0;
+       *_tpos = c->pos;
        for (k = 0; k < c->n_cigar; ++k) {
-               int op = cigar[k] & BAM_CIGAR_MASK; // operation
-               int l = cigar[k] >> BAM_CIGAR_SHIFT; // length
-               if (state == 0 && (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CINS) && x + l > pos) {
-                       reg->tbeg = x; reg->qbeg = y; reg->cbeg = k;
-                       state = 1;
-               }
-               if (op == BAM_CMATCH) { x += l; y += l; }
-               else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
-               else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
-               if (state == 1 && (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CREF_SKIP || k == c->n_cigar - 1)) {
-                       reg->tend = x; reg->qend = y; reg->cend = k;
+               int op = cigar[k] & BAM_CIGAR_MASK;
+               int l = cigar[k] >> BAM_CIGAR_SHIFT;
+               if (op == BAM_CMATCH) {
+                       if (c->pos > tpos) return y;
+                       if (x + l > tpos) {
+                               *_tpos = tpos;
+                               return y + (tpos - x);
+                       }
+                       x += l; y += l;
+                       last_y = y;
+               } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+               else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+                       if (x + l > tpos) {
+                               *_tpos = x;
+                               return y;
+                       }
+                       x += l;
                }
        }
-       return state? 0 : -1;
+       *_tpos = x;
+       return last_y;
 }
 
 uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar)
diff --git a/bam.h b/bam.h
index ec983dfdd183042ef67a019e215a6dff6724c71f..b0fcd9c4c76354dde31a0e184720c74e42254634 100644 (file)
--- a/bam.h
+++ b/bam.h
@@ -632,13 +632,7 @@ extern "C" {
        */
        int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);
 
-       typedef struct {
-               int32_t qbeg, qend;
-               int32_t tbeg, tend;
-               int32_t cbeg, cend;
-       } bam_segreg_t;
-
-       int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg);
+       int bam_tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int32_t *_tpos);
 
 #ifdef __cplusplus
 }
index f36b0ee2ab443affe0635866a8d593c5cb54fdf7..f4cdcb0fb383202c8d64f57fa15fe76f1b6cab6b 100644 (file)
@@ -2,9 +2,10 @@
 #include "bam.h"
 #include "bam_maqcns.h"
 #include "ksort.h"
+#include "kaln.h"
 KSORT_INIT_GENERIC(uint32_t)
 
-#define MAX_WINDOW 33
+#define INDEL_WINDOW_SIZE 50
 
 typedef struct __bmc_aux_t {
        int max;
@@ -170,7 +171,7 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam
                        if (w[k] < 0xff) ++w[k];
                        ++b->c[k&3];
                }
-               tmp = (int)(info&0x7f) < bm->cap_mapQ? (int)(info&0x7f) : bm->cap_mapQ;
+               tmp = (int)(info&0xff) < bm->cap_mapQ? (int)(info&0xff) : bm->cap_mapQ;
                rms += tmp * tmp;
        }
        b->rms_mapQ = (uint8_t)(sqrt((double)rms / n) + .499);
@@ -309,7 +310,7 @@ void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir)
 bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref,
                                                                 int _n_types, int *_types)
 {
-       int i, j, n_types, *types, left, right;
+       int i, j, n_types, *types, left, right, max_rd_len = 0;
        bam_maqindel_ret_t *ret = 0;
        // if there is no proposed indel, check if there is an indel from the alignment
        if (_n_types == 0) {
@@ -329,6 +330,8 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                        const bam_pileup1_t *p = pl + i;
                        if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0)
                                aux[m++] = MINUS_CONST + p->indel;
+                       j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b));
+                       if (j > max_rd_len) max_rd_len = j;
                }
                if (_n_types) // then also add this to aux[]
                        for (i = 0; i < _n_types; ++i)
@@ -347,23 +350,14 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                free(aux);
        }
        { // calculate left and right boundary
-               bam_segreg_t seg;
-               left = 0x7fffffff; right = 0;
-               for (i = 0; i < n; ++i) {
-                       const bam_pileup1_t *p = pl + i;
-                       if (!(p->b->core.flag&BAM_FUNMAP)) {
-                               bam_segreg(pos, &p->b->core, bam1_cigar(p->b), &seg);
-                               if (seg.tbeg < left) left = seg.tbeg;
-                               if (seg.tend > right) right = seg.tend;
-                       }
-               }
-               if (pos - left > MAX_WINDOW) left = pos - MAX_WINDOW;
-               if (right - pos> MAX_WINDOW) right = pos + MAX_WINDOW;
+               left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
+               right = pos + INDEL_WINDOW_SIZE;
        }
        { // the core part
-               char *ref2, *inscns = 0;
+               char *ref2, *rs, *inscns = 0;
                int k, l, *score, *pscore, max_ins = types[n_types-1];
                ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1);
+               rs   = (char*)calloc(right - left + max_rd_len + types[n_types-1] + 2, 1);
                if (max_ins > 0) { // get the consensus of inserted sequences
                        int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int));
                        // count occurrences
@@ -399,49 +393,53 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                score = (int*)calloc(n_types * n, sizeof(int));
                pscore = (int*)calloc(n_types * n, sizeof(int));
                for (i = 0; i < n_types; ++i) {
+                       ka_param_t ap = ka_param_blast;
+                       ap.band_width = 2 * types[n_types - 1] + 2;
                        // write ref2
                        for (k = 0, j = left; j <= pos; ++j)
-                               ref2[k++] = bam_nt16_table[(int)ref[j]];
+                               ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]];
                        if (types[i] <= 0) j += -types[i];
                        else for (l = 0; l < types[i]; ++l)
                                         ref2[k++] = inscns[i*max_ins + l];
                        for (; j < right && ref[j]; ++j)
-                               ref2[k++] = bam_nt16_table[(int)ref[j]];
+                               ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]];
+                       if (j < right) right = j;
                        // calculate score for each read
                        for (j = 0; j < n; ++j) {
                                const bam_pileup1_t *p = pl + j;
-                               uint32_t *cigar;
-                               bam1_core_t *c = &p->b->core;
-                               int s, ps;
-                               bam_segreg_t seg;
-                               if (c->flag&BAM_FUNMAP) continue;
-                               cigar = bam1_cigar(p->b);
-                               bam_segreg(pos, c, cigar, &seg);
-                               for (ps = s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) {
-                                       int cq = bam1_seqi(bam1_seq(p->b), l), ct;
-                                       // in the following line, "<" will happen if reads are too long
-                                       ct = c->pos + l - seg.qbeg >= left? ref2[c->pos + l - seg.qbeg - left] : 15;
-                                       if (cq < 15 && ct < 15) {
-                                               s += cq == ct? 1 : -mi->mm_penalty;
-                                               if (cq != ct) ps += bam1_qual(p->b)[l];
-                                       }
-                               }
-                               score[i*n + j] = s; pscore[i*n + j] = ps;
-                               if (types[i] != 0) { // then try the other way to calculate the score
-                                       for (ps = s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) {
-                                               int cq = bam1_seqi(bam1_seq(p->b), l), ct;
-                                               ct = c->pos + l - seg.qbeg + types[i] >= left? ref2[c->pos + l - seg.qbeg + types[i] - left] : 15;
-                                               if (cq < 15 && ct < 15) {
-                                                       s += cq == ct? 1 : -mi->mm_penalty;
-                                                       if (cq != ct) ps += bam1_qual(p->b)[l];
+                               int qbeg, qend, tbeg, tend;
+                               if (p->b->core.flag & BAM_FUNMAP) continue;
+                               qbeg = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), left,  &tbeg);
+                               qend = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), right, &tend);
+                               for (l = qbeg; l < qend; ++l)
+                                       rs[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), l)];
+                               {
+                                       int x, y, n_acigar, ps;
+                                       uint32_t *acigar;
+                                       ps = 0;
+                                       acigar = ka_global_core((uint8_t*)ref2 + tbeg - left, tend - tbeg + types[i], (uint8_t*)rs, qend - qbeg, &ap, &score[i*n+j], &n_acigar);
+                                       x = tbeg - left; y = 0;
+                                       for (l = 0; l < n_acigar; ++l) {
+                                               int op = acigar[l]&0xf;
+                                               int len = acigar[l]>>4;
+                                               if (op == BAM_CMATCH) {
+                                                       int k;
+                                                       for (k = 0; k < len; ++k)
+                                                               if (ref2[x+k] != rs[y+k]) ps += bam1_qual(p->b)[y+k];
+                                                       x += len; y += len;
+                                               } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+                                                       if (op == BAM_CINS) ps += mi->q_indel * len;
+                                                       y += len;
+                                               } else if (op == BAM_CDEL) {
+                                                       ps += mi->q_indel * len;
+                                                       x += len;
                                                }
                                        }
+                                       pscore[i*n+j] = ps;
+                                       //printf("pos=%d, type=%d, j=%d, score=%d, psore=%d, %d, %d, %d, %d, ", pos+1, types[i], j, score[i*n+j], pscore[i*n+j], tbeg, tend, qbeg, qend);
+                                       //for (l = 0; l < n_acigar; ++l) printf("%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]); printf("\n");
+                                       free(acigar);
                                }
-                               if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores
-                               if (pscore[i*n+j] > ps) pscore[i*n+j] = ps;
-                               //if (types[i] != 0) score[i*n+j] -= mi->indel_err;
-                               //printf("%d, %d, %d, %d, %d, %d, %d\n", p->b->core.pos + 1, seg.qbeg, i, types[i], j,
-                               //         score[i*n+j], pscore[i*n+j]);
                        }
                }
                { // get final result
@@ -509,7 +507,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                                }
                        }
                }
-               free(score); free(pscore); free(ref2); free(inscns);
+               free(score); free(pscore); free(ref2); free(rs); free(inscns);
        }
        { // call genotype
                int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5);
diff --git a/bamtk.c b/bamtk.c
index 74deac7de55761d0afec00020fb84960398caac6..042f78daf9ae67868856e823271112fb1d6bb9bb 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -9,7 +9,7 @@
 #endif
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.6-9 (r470)"
+#define PACKAGE_VERSION "0.1.6-10 (r474)"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);
diff --git a/kaln.c b/kaln.c
new file mode 100644 (file)
index 0000000..65f6c49
--- /dev/null
+++ b/kaln.c
@@ -0,0 +1,380 @@
+/* The MIT License
+
+   Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include "kaln.h"
+
+#define FROM_M 0
+#define FROM_I 1
+#define FROM_D 2
+
+typedef struct {
+       int i, j;
+       unsigned char ctype;
+} path_t;
+
+int aln_sm_blosum62[] = {
+/*      A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  *  X */
+        4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
+       -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
+       -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
+       -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
+        0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
+       -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
+       -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+        0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
+       -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
+       -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
+       -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
+       -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+       -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
+       -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
+       -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
+        1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
+        0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
+       -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
+       -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
+        0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
+       -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
+        0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
+};
+
+int aln_sm_blast[] = {
+       1, -3, -3, -3, -2,
+       -3, 1, -3, -3, -2,
+       -3, -3, 1, -3, -2,
+       -3, -3, -3, 1, -2,
+       -2, -2, -2, -2, -2
+};
+
+ka_param_t ka_param_blast = {  5,  2,  2, aln_sm_blast, 5, 50 };
+ka_param_t ka_param_aa2aa = { 10,  2,  2, aln_sm_blosum62, 22, 50 };
+
+static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
+{
+       int i, n;
+       uint32_t *cigar;
+       unsigned char last_type;
+
+       if (path_len == 0 || path == 0) {
+               *n_cigar = 0;
+               return 0;
+       }
+
+       last_type = path->ctype;
+       for (i = n = 1; i < path_len; ++i) {
+               if (last_type != path[i].ctype) ++n;
+               last_type = path[i].ctype;
+       }
+       *n_cigar = n;
+       cigar = (uint32_t*)calloc(*n_cigar, 4);
+
+       cigar[0] = 1u << 4 | path[path_len-1].ctype;
+       last_type = path[path_len-1].ctype;
+       for (i = path_len - 2, n = 0; i >= 0; --i) {
+               if (path[i].ctype == last_type) cigar[n] += 1u << 4;
+               else {
+                       cigar[++n] = 1u << 4 | path[i].ctype;
+                       last_type = path[i].ctype;
+               }
+       }
+
+       return cigar;
+}
+
+/***************************/
+/* START OF common_align.c */
+/***************************/
+
+#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
+
+#define set_M(MM, cur, p, sc)                                                  \
+{                                                                                                              \
+       if ((p)->M >= (p)->I) {                                                         \
+               if ((p)->M >= (p)->D) {                                                 \
+                       (MM) = (p)->M + (sc); (cur)->Mt = FROM_M;       \
+               } else {                                                                                \
+                       (MM) = (p)->D + (sc); (cur)->Mt = FROM_D;       \
+               }                                                                                               \
+       } else {                                                                                        \
+               if ((p)->I > (p)->D) {                                                  \
+                       (MM) = (p)->I + (sc); (cur)->Mt = FROM_I;       \
+               } else {                                                                                \
+                       (MM) = (p)->D + (sc); (cur)->Mt = FROM_D;       \
+               }                                                                                               \
+       }                                                                                                       \
+}
+#define set_I(II, cur, p)                                                              \
+{                                                                                                              \
+       if ((p)->M - gap_open > (p)->I) {                                       \
+               (cur)->It = FROM_M;                                                             \
+               (II) = (p)->M - gap_open - gap_ext;                             \
+       } else {                                                                                        \
+               (cur)->It = FROM_I;                                                             \
+               (II) = (p)->I - gap_ext;                                                \
+       }                                                                                                       \
+}
+#define set_end_I(II, cur, p)                                                  \
+{                                                                                                              \
+       if (gap_end >= 0) {                                                                     \
+               if ((p)->M - gap_open > (p)->I) {                               \
+                       (cur)->It = FROM_M;                                                     \
+                       (II) = (p)->M - gap_open - gap_end;                     \
+               } else {                                                                                \
+                       (cur)->It = FROM_I;                                                     \
+                       (II) = (p)->I - gap_end;                                        \
+               }                                                                                               \
+       } else set_I(II, cur, p);                                                       \
+}
+#define set_D(DD, cur, p)                                                              \
+{                                                                                                              \
+       if ((p)->M - gap_open > (p)->D) {                                       \
+               (cur)->Dt = FROM_M;                                                             \
+               (DD) = (p)->M - gap_open - gap_ext;                             \
+       } else {                                                                                        \
+               (cur)->Dt = FROM_D;                                                             \
+               (DD) = (p)->D - gap_ext;                                                \
+       }                                                                                                       \
+}
+#define set_end_D(DD, cur, p)                                                  \
+{                                                                                                              \
+       if (gap_end >= 0) {                                                                     \
+               if ((p)->M - gap_open > (p)->D) {                               \
+                       (cur)->Dt = FROM_M;                                                     \
+                       (DD) = (p)->M - gap_open - gap_end;                     \
+               } else {                                                                                \
+                       (cur)->Dt = FROM_D;                                                     \
+                       (DD) = (p)->D - gap_end;                                        \
+               }                                                                                               \
+       } else set_D(DD, cur, p);                                                       \
+}
+
+typedef struct {
+       uint8_t Mt:3, It:2, Dt:2;
+} dpcell_t;
+
+typedef struct {
+       int M, I, D;
+} dpscore_t;
+
+/* build score profile for accelerating alignment, in theory */
+static void aln_init_score_array(uint8_t *seq, int len, int row, int *score_matrix, int **s_array)
+{
+       int *tmp, *tmp2, i, k;
+       for (i = 0; i != row; ++i) {
+               tmp = score_matrix + i * row;
+               tmp2 = s_array[i];
+               for (k = 0; k != len; ++k)
+                       tmp2[k] = tmp[seq[k]];
+       }
+}
+/***************************
+ * banded global alignment *
+ ***************************/
+uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
+{
+       int i, j;
+       dpcell_t **dpcell, *q;
+       dpscore_t *curr, *last, *s;
+       int b1, b2, tmp_end;
+       int *mat, end, max = 0;
+       uint8_t type, ctype;
+       uint32_t *cigar = 0;
+
+       int gap_open, gap_ext, gap_end, b;
+       int *score_matrix, N_MATRIX_ROW;
+
+       /* initialize some align-related parameters. just for compatibility */
+       gap_open = ap->gap_open;
+       gap_ext = ap->gap_ext;
+       gap_end = ap->gap_end;
+       b = ap->band_width;
+       score_matrix = ap->matrix;
+       N_MATRIX_ROW = ap->row;
+       
+       if (len1 == 0 || len2 == 0) return 0;
+
+       /* calculate b1 and b2 */
+       if (len1 > len2) {
+               b1 = len1 - len2 + b;
+               b2 = b;
+       } else {
+               b1 = b;
+               b2 = len2 - len1 + b;
+       }
+       if (b1 > len1) b1 = len1;
+       if (b2 > len2) b2 = len2;
+       --seq1; --seq2;
+
+       /* allocate memory */
+       end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
+       dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
+       for (j = 0; j <= len2; ++j)
+               dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
+       for (j = b2 + 1; j <= len2; ++j)
+               dpcell[j] -= j - b2;
+       curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+       last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+       
+       /* set first row */
+       SET_INF(*curr); curr->M = 0;
+       for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
+               SET_INF(*s);
+               set_end_D(s->D, dpcell[0] + i, s - 1);
+       }
+       s = curr; curr = last; last = s;
+
+       /* core dynamic programming, part 1 */
+       tmp_end = (b2 < len2)? b2 : len2 - 1;
+       for (j = 1; j <= tmp_end; ++j) {
+               q = dpcell[j]; s = curr; SET_INF(*s);
+               set_end_I(s->I, q, last);
+               end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               ++s; ++q;
+               for (i = 1; i != end; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+                       set_I(s->I, q, last + i);
+                       set_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+               set_D(s->D, q, s - 1);
+               if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+                       set_end_I(s->I, q, last + i);
+               } else s->I = MINOR_INF;
+               s = curr; curr = last; last = s;
+       }
+       /* last row for part 1, use set_end_D() instead of set_D() */
+       if (j == len2 && b2 != len2 - 1) {
+               q = dpcell[j]; s = curr; SET_INF(*s);
+               set_end_I(s->I, q, last);
+               end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               ++s; ++q;
+               for (i = 1; i != end; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+                       set_I(s->I, q, last + i);
+                       set_end_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+               set_end_D(s->D, q, s - 1);
+               if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+                       set_end_I(s->I, q, last + i);
+               } else s->I = MINOR_INF;
+               s = curr; curr = last; last = s;
+               ++j;
+       }
+
+       /* core dynamic programming, part 2 */
+       for (; j <= len2 - b2 + 1; ++j) {
+               SET_INF(curr[j - b2]);
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               end = j + b1 - 1;
+               for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+                       set_I(s->I, q, last + i);
+                       set_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+               set_D(s->D, q, s - 1);
+               s->I = MINOR_INF;
+               s = curr; curr = last; last = s;
+       }
+
+       /* core dynamic programming, part 3 */
+       for (; j < len2; ++j) {
+               SET_INF(curr[j - b2]);
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+                       set_I(s->I, q, last + i);
+                       set_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+               set_end_I(s->I, q, last + i);
+               set_D(s->D, q, s - 1);
+               s = curr; curr = last; last = s;
+       }
+       /* last row */
+       if (j == len2) {
+               SET_INF(curr[j - b2]);
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+                       set_I(s->I, q, last + i);
+                       set_end_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+               set_end_I(s->I, q, last + i);
+               set_end_D(s->D, q, s - 1);
+               s = curr; curr = last; last = s;
+       }
+
+       *_score = last[len1].M;
+       if (n_cigar) { /* backtrace */
+               path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
+               i = len1; j = len2;
+               q = dpcell[j] + i;
+               s = last + len1;
+               max = s->M; type = q->Mt; ctype = FROM_M;
+               if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
+               if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
+
+               p = path;
+               p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
+               ++p;
+               do {
+                       switch (ctype) {
+                       case FROM_M: --i; --j; break;
+                       case FROM_I: --j; break;
+                       case FROM_D: --i; break;
+                       }
+                       q = dpcell[j] + i;
+                       ctype = type;
+                       switch (type) {
+                       case FROM_M: type = q->Mt; break;
+                       case FROM_I: type = q->It; break;
+                       case FROM_D: type = q->Dt; break;
+                       }
+                       p->ctype = ctype; p->i = i; p->j = j;
+                       ++p;
+               } while (i || j);
+               cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
+               free(path);
+       }
+
+       /* free memory */
+       for (j = b2 + 1; j <= len2; ++j)
+               dpcell[j] += j - b2;
+       for (j = 0; j <= len2; ++j)
+               free(dpcell[j]);
+       free(dpcell);
+       free(curr); free(last);
+
+       return cigar;
+}
diff --git a/kaln.h b/kaln.h
new file mode 100644 (file)
index 0000000..b04d8cc
--- /dev/null
+++ b/kaln.h
@@ -0,0 +1,55 @@
+/* The MIT License
+
+   Copyright (c) 2003-2006, 2008, 2009 by Heng Li <lh3@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#ifndef LH3_KALN_H_
+#define LH3_KALN_H_
+
+#include <stdint.h>
+
+#define MINOR_INF -1073741823
+
+typedef struct {
+       int gap_open;
+       int gap_ext;
+       int gap_end;
+
+       int *matrix;
+       int row;
+       int band_width;
+} ka_param_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar);
+
+#ifdef __cplusplus
+}
+#endif
+
+extern ka_param_t ka_param_blast; /* = {  5,  2,  2, aln_sm_blast, 5, 50 }; */
+
+#endif