3 Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42 int aln_sm_blosum62[] = {
43 /* A R N D C Q E G H I L K M F P S T W Y V * X */
44 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
45 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
46 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
47 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
48 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
49 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
50 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
51 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
52 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
53 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
54 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
55 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
56 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
57 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
58 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
59 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
60 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
61 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
62 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
63 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
64 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
65 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
68 int aln_sm_blast[] = {
84 ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 };
85 ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 };
87 ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 };
89 static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
93 unsigned char last_type;
95 if (path_len == 0 || path == 0) {
100 last_type = path->ctype;
101 for (i = n = 1; i < path_len; ++i) {
102 if (last_type != path[i].ctype) ++n;
103 last_type = path[i].ctype;
106 cigar = (uint32_t*)calloc(*n_cigar, 4);
108 cigar[0] = 1u << 4 | path[path_len-1].ctype;
109 last_type = path[path_len-1].ctype;
110 for (i = path_len - 2, n = 0; i >= 0; --i) {
111 if (path[i].ctype == last_type) cigar[n] += 1u << 4;
113 cigar[++n] = 1u << 4 | path[i].ctype;
114 last_type = path[i].ctype;
121 /***************************/
122 /* START OF common_align.c */
123 /***************************/
125 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
127 #define set_M(MM, cur, p, sc) \
129 if ((p)->M >= (p)->I) { \
130 if ((p)->M >= (p)->D) { \
131 (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
133 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
136 if ((p)->I > (p)->D) { \
137 (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
139 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
143 #define set_I(II, cur, p) \
145 if ((p)->M - gap_open > (p)->I) { \
146 (cur)->It = FROM_M; \
147 (II) = (p)->M - gap_open - gap_ext; \
149 (cur)->It = FROM_I; \
150 (II) = (p)->I - gap_ext; \
153 #define set_end_I(II, cur, p) \
155 if (gap_end_ext >= 0) { \
156 if ((p)->M - gap_end_open > (p)->I) { \
157 (cur)->It = FROM_M; \
158 (II) = (p)->M - gap_end_open - gap_end_ext; \
160 (cur)->It = FROM_I; \
161 (II) = (p)->I - gap_end_ext; \
163 } else set_I(II, cur, p); \
165 #define set_D(DD, cur, p) \
167 if ((p)->M - gap_open > (p)->D) { \
168 (cur)->Dt = FROM_M; \
169 (DD) = (p)->M - gap_open - gap_ext; \
171 (cur)->Dt = FROM_D; \
172 (DD) = (p)->D - gap_ext; \
175 #define set_end_D(DD, cur, p) \
177 if (gap_end_ext >= 0) { \
178 if ((p)->M - gap_end_open > (p)->D) { \
179 (cur)->Dt = FROM_M; \
180 (DD) = (p)->M - gap_end_open - gap_end_ext; \
182 (cur)->Dt = FROM_D; \
183 (DD) = (p)->D - gap_end_ext; \
185 } else set_D(DD, cur, p); \
189 uint8_t Mt:3, It:2, Dt:3;
196 /***************************
197 * banded global alignment *
198 ***************************/
199 uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
202 dpcell_t **dpcell, *q;
203 dpscore_t *curr, *last, *s;
205 int *mat, end, max = 0;
209 int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
210 int *score_matrix, N_MATRIX_ROW;
212 /* initialize some align-related parameters. just for compatibility */
213 gap_open = ap->gap_open;
214 gap_ext = ap->gap_ext;
215 gap_end_open = ap->gap_end_open;
216 gap_end_ext = ap->gap_end_ext;
218 score_matrix = ap->matrix;
219 N_MATRIX_ROW = ap->row;
221 if (n_cigar) *n_cigar = 0;
222 if (len1 == 0 || len2 == 0) return 0;
224 /* calculate b1 and b2 */
226 b1 = len1 - len2 + b;
230 b2 = len2 - len1 + b;
232 if (b1 > len1) b1 = len1;
233 if (b2 > len2) b2 = len2;
236 /* allocate memory */
237 end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
238 dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
239 for (j = 0; j <= len2; ++j)
240 dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
241 for (j = b2 + 1; j <= len2; ++j)
243 curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
244 last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
247 SET_INF(*curr); curr->M = 0;
248 for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
250 set_end_D(s->D, dpcell[0] + i, s - 1);
252 s = curr; curr = last; last = s;
254 /* core dynamic programming, part 1 */
255 tmp_end = (b2 < len2)? b2 : len2 - 1;
256 for (j = 1; j <= tmp_end; ++j) {
257 q = dpcell[j]; s = curr; SET_INF(*s);
258 set_end_I(s->I, q, last);
259 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
260 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
262 for (i = 1; i != end; ++i, ++s, ++q) {
263 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
264 set_I(s->I, q, last + i);
265 set_D(s->D, q, s - 1);
267 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
268 set_D(s->D, q, s - 1);
269 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
270 set_end_I(s->I, q, last + i);
271 } else s->I = MINOR_INF;
272 s = curr; curr = last; last = s;
274 /* last row for part 1, use set_end_D() instead of set_D() */
275 if (j == len2 && b2 != len2 - 1) {
276 q = dpcell[j]; s = curr; SET_INF(*s);
277 set_end_I(s->I, q, last);
278 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
279 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
281 for (i = 1; i != end; ++i, ++s, ++q) {
282 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
283 set_I(s->I, q, last + i);
284 set_end_D(s->D, q, s - 1);
286 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
287 set_end_D(s->D, q, s - 1);
288 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
289 set_end_I(s->I, q, last + i);
290 } else s->I = MINOR_INF;
291 s = curr; curr = last; last = s;
295 /* core dynamic programming, part 2 */
296 for (; j <= len2 - b2 + 1; ++j) {
297 SET_INF(curr[j - b2]);
298 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
300 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
301 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
302 set_I(s->I, q, last + i);
303 set_D(s->D, q, s - 1);
305 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
306 set_D(s->D, q, s - 1);
308 s = curr; curr = last; last = s;
311 /* core dynamic programming, part 3 */
312 for (; j < len2; ++j) {
313 SET_INF(curr[j - b2]);
314 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
315 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
316 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
317 set_I(s->I, q, last + i);
318 set_D(s->D, q, s - 1);
320 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
321 set_end_I(s->I, q, last + i);
322 set_D(s->D, q, s - 1);
323 s = curr; curr = last; last = s;
327 SET_INF(curr[j - b2]);
328 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
329 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
330 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
331 set_I(s->I, q, last + i);
332 set_end_D(s->D, q, s - 1);
334 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
335 set_end_I(s->I, q, last + i);
336 set_end_D(s->D, q, s - 1);
337 s = curr; curr = last; last = s;
340 *_score = last[len1].M;
341 if (n_cigar) { /* backtrace */
342 path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
346 max = s->M; type = q->Mt; ctype = FROM_M;
347 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
348 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
351 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
355 case FROM_M: --i; --j; break;
356 case FROM_I: --j; break;
357 case FROM_D: --i; break;
362 case FROM_M: type = q->Mt; break;
363 case FROM_I: type = q->It; break;
364 case FROM_D: type = q->Dt; break;
366 p->ctype = ctype; p->i = i; p->j = j;
369 cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
374 for (j = b2 + 1; j <= len2; ++j)
376 for (j = 0; j <= len2; ++j)
379 free(curr); free(last);
388 #define MINUS_INF -0x40000000
390 // matrix: len2 rows and len1 columns
391 int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap)
394 #define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \
398 _p->M = _q->M >= _q->I? _q->M : _q->I; \
399 _p->M = _p->M >= _q->D? _p->M : _q->D; \
401 ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \
402 _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \
405 int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret;
406 const uint8_t *seq1, *seq2;
407 score_aux_t *curr, *last, *swap;
408 bw = abs(len1 - len2) + ap->band_width;
409 i = len1 > len2? len1 : len2;
410 if (bw > i + 1) bw = i + 1;
411 seq1 = _seq1 - 1; seq2 = _seq2 - 1;
412 curr = calloc(len1 + 2, sizeof(score_aux_t));
413 last = calloc(len1 + 2, sizeof(score_aux_t));
418 x = j + bw; end = len1 < x? len1 : x; // band end
420 p->M = 0; p->I = p->D = MINUS_INF;
421 for (i = 1, p = &curr[1]; i <= end; ++i, ++p)
422 p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i);
423 p->M = p->I = p->D = MINUS_INF;
424 swap = curr; curr = last; last = swap;
426 for (j = 1; j < len2; ++j) {
427 int x, beg = 0, end = len1, *scrow, col_end;
429 x = j - bw; beg = 0 > x? 0 : x; // band start
430 x = j + bw; end = len1 < x? len1 : x; // band end
431 if (beg == 0) { // from zero-th column
433 p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
434 ++beg; // then beg = 1
436 scrow = scmat + seq2[j] * scmat_size;
437 if (end == len1) col_end = 1, --end;
439 for (i = beg, p = &curr[beg]; i <= end; ++i, ++p)
440 __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide);
442 __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide);
445 p->M = p->I = p->D = MINUS_INF;
446 // for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
447 swap = curr; curr = last; last = swap;
450 int x, beg = 0, *scrow;
453 x = j - bw; beg = 0 > x? 0 : x; // band start
454 if (beg == 0) { // from zero-th column
456 p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
457 ++beg; // then beg = 1
459 scrow = scmat + seq2[j] * scmat_size;
460 for (i = beg, p = &curr[beg]; i < len1; ++i, ++p)
461 __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede);
462 __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede);
463 // for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
465 ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I;
466 ret = ret >= curr[len1].D? ret : curr[len1].D;
467 free(curr); free(last);
472 int main(int argc, char *argv[])
474 // int len1 = 35, len2 = 35;
475 // uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1";
476 // uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0";
477 int len1 = 4, len2 = 4;
478 uint8_t *seq1 = (uint8_t*)"\1\0\0\1";
479 uint8_t *seq2 = (uint8_t*)"\1\0\1\0";
481 // ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0);
482 sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual);