From 020baf39a16a31e33a736d347714e2551f6273df Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 8 Oct 2009 14:19:16 +0000 Subject: [PATCH] Unfinished modification. Please do not use this revision... --- Makefile | 2 +- bam.c | 38 +++--- bam.h | 8 +- bam_maqcns.c | 94 +++++++------ bamtk.c | 2 +- kaln.c | 380 +++++++++++++++++++++++++++++++++++++++++++++++++++ kaln.h | 55 ++++++++ 7 files changed, 506 insertions(+), 73 deletions(-) create mode 100644 kaln.c create mode 100644 kaln.h diff --git a/Makefile b/Makefile index 450b3ab..90f613e 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ bam_sort.o AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ - bamtk.o + bamtk.o kaln.o PROG= samtools INCLUDES= SUBDIRS= . misc diff --git a/bam.c b/bam.c index f6e05ff..533250b 100644 --- a/bam.c +++ b/bam.c @@ -13,26 +13,32 @@ char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; * CIGAR related routines * **************************/ -int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg) +int bam_tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int32_t *_tpos) { - unsigned k; - int32_t x = c->pos, y = 0; - int state = 0; + int k, x = c->pos, y = 0, last_y = 0; + *_tpos = c->pos; for (k = 0; k < c->n_cigar; ++k) { - int op = cigar[k] & BAM_CIGAR_MASK; // operation - int l = cigar[k] >> BAM_CIGAR_SHIFT; // length - if (state == 0 && (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CINS) && x + l > pos) { - reg->tbeg = x; reg->qbeg = y; reg->cbeg = k; - state = 1; - } - if (op == BAM_CMATCH) { x += l; y += l; } - else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; - else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - if (state == 1 && (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CREF_SKIP || k == c->n_cigar - 1)) { - reg->tend = x; reg->qend = y; reg->cend = k; + int op = cigar[k] & BAM_CIGAR_MASK; + int l = cigar[k] >> BAM_CIGAR_SHIFT; + if (op == BAM_CMATCH) { + if (c->pos > tpos) return y; + if (x + l > tpos) { + *_tpos = tpos; + return y + (tpos - x); + } + x += l; y += l; + last_y = y; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { + if (x + l > tpos) { + *_tpos = x; + return y; + } + x += l; } } - return state? 0 : -1; + *_tpos = x; + return last_y; } uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar) diff --git a/bam.h b/bam.h index ec983df..b0fcd9c 100644 --- a/bam.h +++ b/bam.h @@ -632,13 +632,7 @@ extern "C" { */ int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar); - typedef struct { - int32_t qbeg, qend; - int32_t tbeg, tend; - int32_t cbeg, cend; - } bam_segreg_t; - - int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg); + int bam_tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int32_t *_tpos); #ifdef __cplusplus } diff --git a/bam_maqcns.c b/bam_maqcns.c index f36b0ee..f4cdcb0 100644 --- a/bam_maqcns.c +++ b/bam_maqcns.c @@ -2,9 +2,10 @@ #include "bam.h" #include "bam_maqcns.h" #include "ksort.h" +#include "kaln.h" KSORT_INIT_GENERIC(uint32_t) -#define MAX_WINDOW 33 +#define INDEL_WINDOW_SIZE 50 typedef struct __bmc_aux_t { int max; @@ -170,7 +171,7 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam if (w[k] < 0xff) ++w[k]; ++b->c[k&3]; } - tmp = (int)(info&0x7f) < bm->cap_mapQ? (int)(info&0x7f) : bm->cap_mapQ; + tmp = (int)(info&0xff) < bm->cap_mapQ? (int)(info&0xff) : bm->cap_mapQ; rms += tmp * tmp; } b->rms_mapQ = (uint8_t)(sqrt((double)rms / n) + .499); @@ -309,7 +310,7 @@ void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir) bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, int _n_types, int *_types) { - int i, j, n_types, *types, left, right; + int i, j, n_types, *types, left, right, max_rd_len = 0; bam_maqindel_ret_t *ret = 0; // if there is no proposed indel, check if there is an indel from the alignment if (_n_types == 0) { @@ -329,6 +330,8 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c const bam_pileup1_t *p = pl + i; if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) aux[m++] = MINUS_CONST + p->indel; + j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); + if (j > max_rd_len) max_rd_len = j; } if (_n_types) // then also add this to aux[] for (i = 0; i < _n_types; ++i) @@ -347,23 +350,14 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c free(aux); } { // calculate left and right boundary - bam_segreg_t seg; - left = 0x7fffffff; right = 0; - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - if (!(p->b->core.flag&BAM_FUNMAP)) { - bam_segreg(pos, &p->b->core, bam1_cigar(p->b), &seg); - if (seg.tbeg < left) left = seg.tbeg; - if (seg.tend > right) right = seg.tend; - } - } - if (pos - left > MAX_WINDOW) left = pos - MAX_WINDOW; - if (right - pos> MAX_WINDOW) right = pos + MAX_WINDOW; + left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; + right = pos + INDEL_WINDOW_SIZE; } { // the core part - char *ref2, *inscns = 0; + char *ref2, *rs, *inscns = 0; int k, l, *score, *pscore, max_ins = types[n_types-1]; ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1); + rs = (char*)calloc(right - left + max_rd_len + types[n_types-1] + 2, 1); if (max_ins > 0) { // get the consensus of inserted sequences int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int)); // count occurrences @@ -399,49 +393,53 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c score = (int*)calloc(n_types * n, sizeof(int)); pscore = (int*)calloc(n_types * n, sizeof(int)); for (i = 0; i < n_types; ++i) { + ka_param_t ap = ka_param_blast; + ap.band_width = 2 * types[n_types - 1] + 2; // write ref2 for (k = 0, j = left; j <= pos; ++j) - ref2[k++] = bam_nt16_table[(int)ref[j]]; + ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; if (types[i] <= 0) j += -types[i]; else for (l = 0; l < types[i]; ++l) ref2[k++] = inscns[i*max_ins + l]; for (; j < right && ref[j]; ++j) - ref2[k++] = bam_nt16_table[(int)ref[j]]; + ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; + if (j < right) right = j; // calculate score for each read for (j = 0; j < n; ++j) { const bam_pileup1_t *p = pl + j; - uint32_t *cigar; - bam1_core_t *c = &p->b->core; - int s, ps; - bam_segreg_t seg; - if (c->flag&BAM_FUNMAP) continue; - cigar = bam1_cigar(p->b); - bam_segreg(pos, c, cigar, &seg); - for (ps = s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) { - int cq = bam1_seqi(bam1_seq(p->b), l), ct; - // in the following line, "<" will happen if reads are too long - ct = c->pos + l - seg.qbeg >= left? ref2[c->pos + l - seg.qbeg - left] : 15; - if (cq < 15 && ct < 15) { - s += cq == ct? 1 : -mi->mm_penalty; - if (cq != ct) ps += bam1_qual(p->b)[l]; - } - } - score[i*n + j] = s; pscore[i*n + j] = ps; - if (types[i] != 0) { // then try the other way to calculate the score - for (ps = s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) { - int cq = bam1_seqi(bam1_seq(p->b), l), ct; - ct = c->pos + l - seg.qbeg + types[i] >= left? ref2[c->pos + l - seg.qbeg + types[i] - left] : 15; - if (cq < 15 && ct < 15) { - s += cq == ct? 1 : -mi->mm_penalty; - if (cq != ct) ps += bam1_qual(p->b)[l]; + int qbeg, qend, tbeg, tend; + if (p->b->core.flag & BAM_FUNMAP) continue; + qbeg = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), left, &tbeg); + qend = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), right, &tend); + for (l = qbeg; l < qend; ++l) + rs[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), l)]; + { + int x, y, n_acigar, ps; + uint32_t *acigar; + ps = 0; + acigar = ka_global_core((uint8_t*)ref2 + tbeg - left, tend - tbeg + types[i], (uint8_t*)rs, qend - qbeg, &ap, &score[i*n+j], &n_acigar); + x = tbeg - left; y = 0; + for (l = 0; l < n_acigar; ++l) { + int op = acigar[l]&0xf; + int len = acigar[l]>>4; + if (op == BAM_CMATCH) { + int k; + for (k = 0; k < len; ++k) + if (ref2[x+k] != rs[y+k]) ps += bam1_qual(p->b)[y+k]; + x += len; y += len; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { + if (op == BAM_CINS) ps += mi->q_indel * len; + y += len; + } else if (op == BAM_CDEL) { + ps += mi->q_indel * len; + x += len; } } + pscore[i*n+j] = ps; + //printf("pos=%d, type=%d, j=%d, score=%d, psore=%d, %d, %d, %d, %d, ", pos+1, types[i], j, score[i*n+j], pscore[i*n+j], tbeg, tend, qbeg, qend); + //for (l = 0; l < n_acigar; ++l) printf("%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]); printf("\n"); + free(acigar); } - if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores - if (pscore[i*n+j] > ps) pscore[i*n+j] = ps; - //if (types[i] != 0) score[i*n+j] -= mi->indel_err; - //printf("%d, %d, %d, %d, %d, %d, %d\n", p->b->core.pos + 1, seg.qbeg, i, types[i], j, - // score[i*n+j], pscore[i*n+j]); } } { // get final result @@ -509,7 +507,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c } } } - free(score); free(pscore); free(ref2); free(inscns); + free(score); free(pscore); free(ref2); free(rs); free(inscns); } { // call genotype int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5); diff --git a/bamtk.c b/bamtk.c index 74deac7..042f78d 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.6-9 (r470)" +#define PACKAGE_VERSION "0.1.6-10 (r474)" #endif int bam_taf2baf(int argc, char *argv[]); diff --git a/kaln.c b/kaln.c new file mode 100644 index 0000000..65f6c49 --- /dev/null +++ b/kaln.c @@ -0,0 +1,380 @@ +/* The MIT License + + Copyright (c) 2003-2006, 2008, 2009, by Heng Li + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#include +#include +#include +#include +#include "kaln.h" + +#define FROM_M 0 +#define FROM_I 1 +#define FROM_D 2 + +typedef struct { + int i, j; + unsigned char ctype; +} path_t; + +int aln_sm_blosum62[] = { +/* A R N D C Q E G H I L K M F P S T W Y V * X */ + 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0, + -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1, + -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1, + -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1, + 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2, + -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1, + -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1, + 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1, + -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1, + -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1, + -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1, + -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1, + -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1, + -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1, + -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2, + 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0, + 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0, + -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2, + -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1, + 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1, + -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4, + 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 +}; + +int aln_sm_blast[] = { + 1, -3, -3, -3, -2, + -3, 1, -3, -3, -2, + -3, -3, 1, -3, -2, + -3, -3, -3, 1, -2, + -2, -2, -2, -2, -2 +}; + +ka_param_t ka_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 }; +ka_param_t ka_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 }; + +static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) +{ + int i, n; + uint32_t *cigar; + unsigned char last_type; + + if (path_len == 0 || path == 0) { + *n_cigar = 0; + return 0; + } + + last_type = path->ctype; + for (i = n = 1; i < path_len; ++i) { + if (last_type != path[i].ctype) ++n; + last_type = path[i].ctype; + } + *n_cigar = n; + cigar = (uint32_t*)calloc(*n_cigar, 4); + + cigar[0] = 1u << 4 | path[path_len-1].ctype; + last_type = path[path_len-1].ctype; + for (i = path_len - 2, n = 0; i >= 0; --i) { + if (path[i].ctype == last_type) cigar[n] += 1u << 4; + else { + cigar[++n] = 1u << 4 | path[i].ctype; + last_type = path[i].ctype; + } + } + + return cigar; +} + +/***************************/ +/* START OF common_align.c */ +/***************************/ + +#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; + +#define set_M(MM, cur, p, sc) \ +{ \ + if ((p)->M >= (p)->I) { \ + if ((p)->M >= (p)->D) { \ + (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \ + } else { \ + (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ + } \ + } else { \ + if ((p)->I > (p)->D) { \ + (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \ + } else { \ + (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ + } \ + } \ +} +#define set_I(II, cur, p) \ +{ \ + if ((p)->M - gap_open > (p)->I) { \ + (cur)->It = FROM_M; \ + (II) = (p)->M - gap_open - gap_ext; \ + } else { \ + (cur)->It = FROM_I; \ + (II) = (p)->I - gap_ext; \ + } \ +} +#define set_end_I(II, cur, p) \ +{ \ + if (gap_end >= 0) { \ + if ((p)->M - gap_open > (p)->I) { \ + (cur)->It = FROM_M; \ + (II) = (p)->M - gap_open - gap_end; \ + } else { \ + (cur)->It = FROM_I; \ + (II) = (p)->I - gap_end; \ + } \ + } else set_I(II, cur, p); \ +} +#define set_D(DD, cur, p) \ +{ \ + if ((p)->M - gap_open > (p)->D) { \ + (cur)->Dt = FROM_M; \ + (DD) = (p)->M - gap_open - gap_ext; \ + } else { \ + (cur)->Dt = FROM_D; \ + (DD) = (p)->D - gap_ext; \ + } \ +} +#define set_end_D(DD, cur, p) \ +{ \ + if (gap_end >= 0) { \ + if ((p)->M - gap_open > (p)->D) { \ + (cur)->Dt = FROM_M; \ + (DD) = (p)->M - gap_open - gap_end; \ + } else { \ + (cur)->Dt = FROM_D; \ + (DD) = (p)->D - gap_end; \ + } \ + } else set_D(DD, cur, p); \ +} + +typedef struct { + uint8_t Mt:3, It:2, Dt:2; +} dpcell_t; + +typedef struct { + int M, I, D; +} dpscore_t; + +/* build score profile for accelerating alignment, in theory */ +static void aln_init_score_array(uint8_t *seq, int len, int row, int *score_matrix, int **s_array) +{ + int *tmp, *tmp2, i, k; + for (i = 0; i != row; ++i) { + tmp = score_matrix + i * row; + tmp2 = s_array[i]; + for (k = 0; k != len; ++k) + tmp2[k] = tmp[seq[k]]; + } +} +/*************************** + * banded global alignment * + ***************************/ +uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar) +{ + int i, j; + dpcell_t **dpcell, *q; + dpscore_t *curr, *last, *s; + int b1, b2, tmp_end; + int *mat, end, max = 0; + uint8_t type, ctype; + uint32_t *cigar = 0; + + int gap_open, gap_ext, gap_end, b; + int *score_matrix, N_MATRIX_ROW; + + /* initialize some align-related parameters. just for compatibility */ + gap_open = ap->gap_open; + gap_ext = ap->gap_ext; + gap_end = ap->gap_end; + b = ap->band_width; + score_matrix = ap->matrix; + N_MATRIX_ROW = ap->row; + + if (len1 == 0 || len2 == 0) return 0; + + /* calculate b1 and b2 */ + if (len1 > len2) { + b1 = len1 - len2 + b; + b2 = b; + } else { + b1 = b; + b2 = len2 - len1 + b; + } + if (b1 > len1) b1 = len1; + if (b2 > len2) b2 = len2; + --seq1; --seq2; + + /* allocate memory */ + end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1); + dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1)); + for (j = 0; j <= len2; ++j) + dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end); + for (j = b2 + 1; j <= len2; ++j) + dpcell[j] -= j - b2; + curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); + last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); + + /* set first row */ + SET_INF(*curr); curr->M = 0; + for (i = 1, s = curr + 1; i < b1; ++i, ++s) { + SET_INF(*s); + set_end_D(s->D, dpcell[0] + i, s - 1); + } + s = curr; curr = last; last = s; + + /* core dynamic programming, part 1 */ + tmp_end = (b2 < len2)? b2 : len2 - 1; + for (j = 1; j <= tmp_end; ++j) { + q = dpcell[j]; s = curr; SET_INF(*s); + set_end_I(s->I, q, last); + end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + ++s; ++q; + for (i = 1; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_D(s->D, q, s - 1); + if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ + set_end_I(s->I, q, last + i); + } else s->I = MINOR_INF; + s = curr; curr = last; last = s; + } + /* last row for part 1, use set_end_D() instead of set_D() */ + if (j == len2 && b2 != len2 - 1) { + q = dpcell[j]; s = curr; SET_INF(*s); + set_end_I(s->I, q, last); + end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + ++s; ++q; + for (i = 1; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ + set_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_end_D(s->D, q, s - 1); + if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ + set_end_I(s->I, q, last + i); + } else s->I = MINOR_INF; + s = curr; curr = last; last = s; + ++j; + } + + /* core dynamic programming, part 2 */ + for (; j <= len2 - b2 + 1; ++j) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + end = j + b1 - 1; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_D(s->D, q, s - 1); + s->I = MINOR_INF; + s = curr; curr = last; last = s; + } + + /* core dynamic programming, part 3 */ + for (; j < len2; ++j) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); + set_end_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + s = curr; curr = last; last = s; + } + /* last row */ + if (j == len2) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + } + set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); + set_end_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + s = curr; curr = last; last = s; + } + + *_score = last[len1].M; + if (n_cigar) { /* backtrace */ + path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2)); + i = len1; j = len2; + q = dpcell[j] + i; + s = last + len1; + max = s->M; type = q->Mt; ctype = FROM_M; + if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; } + if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; } + + p = path; + p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */ + ++p; + do { + switch (ctype) { + case FROM_M: --i; --j; break; + case FROM_I: --j; break; + case FROM_D: --i; break; + } + q = dpcell[j] + i; + ctype = type; + switch (type) { + case FROM_M: type = q->Mt; break; + case FROM_I: type = q->It; break; + case FROM_D: type = q->Dt; break; + } + p->ctype = ctype; p->i = i; p->j = j; + ++p; + } while (i || j); + cigar = ka_path2cigar32(path, p - path - 1, n_cigar); + free(path); + } + + /* free memory */ + for (j = b2 + 1; j <= len2; ++j) + dpcell[j] += j - b2; + for (j = 0; j <= len2; ++j) + free(dpcell[j]); + free(dpcell); + free(curr); free(last); + + return cigar; +} diff --git a/kaln.h b/kaln.h new file mode 100644 index 0000000..b04d8cc --- /dev/null +++ b/kaln.h @@ -0,0 +1,55 @@ +/* The MIT License + + Copyright (c) 2003-2006, 2008, 2009 by Heng Li + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef LH3_KALN_H_ +#define LH3_KALN_H_ + +#include + +#define MINOR_INF -1073741823 + +typedef struct { + int gap_open; + int gap_ext; + int gap_end; + + int *matrix; + int row; + int band_width; +} ka_param_t; + +#ifdef __cplusplus +extern "C" { +#endif + + uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar); + +#ifdef __cplusplus +} +#endif + +extern ka_param_t ka_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */ + +#endif -- 2.39.2