3 Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
41 int aln_sm_blosum62[] = {
42 /* A R N D C Q E G H I L K M F P S T W Y V * X */
43 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
44 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
45 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
46 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
47 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
48 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
49 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
50 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
51 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
52 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
53 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
54 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
55 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
56 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
57 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
58 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
59 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
60 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
61 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
62 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
63 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
64 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
67 int aln_sm_blast[] = {
75 ka_param_t ka_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 };
76 ka_param_t ka_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 };
78 static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
82 unsigned char last_type;
84 if (path_len == 0 || path == 0) {
89 last_type = path->ctype;
90 for (i = n = 1; i < path_len; ++i) {
91 if (last_type != path[i].ctype) ++n;
92 last_type = path[i].ctype;
95 cigar = (uint32_t*)calloc(*n_cigar, 4);
97 cigar[0] = 1u << 4 | path[path_len-1].ctype;
98 last_type = path[path_len-1].ctype;
99 for (i = path_len - 2, n = 0; i >= 0; --i) {
100 if (path[i].ctype == last_type) cigar[n] += 1u << 4;
102 cigar[++n] = 1u << 4 | path[i].ctype;
103 last_type = path[i].ctype;
110 /***************************/
111 /* START OF common_align.c */
112 /***************************/
114 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
116 #define set_M(MM, cur, p, sc) \
118 if ((p)->M >= (p)->I) { \
119 if ((p)->M >= (p)->D) { \
120 (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
122 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
125 if ((p)->I > (p)->D) { \
126 (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
128 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
132 #define set_I(II, cur, p) \
134 if ((p)->M - gap_open > (p)->I) { \
135 (cur)->It = FROM_M; \
136 (II) = (p)->M - gap_open - gap_ext; \
138 (cur)->It = FROM_I; \
139 (II) = (p)->I - gap_ext; \
142 #define set_end_I(II, cur, p) \
144 if (gap_end >= 0) { \
145 if ((p)->M - gap_open > (p)->I) { \
146 (cur)->It = FROM_M; \
147 (II) = (p)->M - gap_open - gap_end; \
149 (cur)->It = FROM_I; \
150 (II) = (p)->I - gap_end; \
152 } else set_I(II, cur, p); \
154 #define set_D(DD, cur, p) \
156 if ((p)->M - gap_open > (p)->D) { \
157 (cur)->Dt = FROM_M; \
158 (DD) = (p)->M - gap_open - gap_ext; \
160 (cur)->Dt = FROM_D; \
161 (DD) = (p)->D - gap_ext; \
164 #define set_end_D(DD, cur, p) \
166 if (gap_end >= 0) { \
167 if ((p)->M - gap_open > (p)->D) { \
168 (cur)->Dt = FROM_M; \
169 (DD) = (p)->M - gap_open - gap_end; \
171 (cur)->Dt = FROM_D; \
172 (DD) = (p)->D - gap_end; \
174 } else set_D(DD, cur, p); \
178 uint8_t Mt:3, It:2, Dt:2;
185 /***************************
186 * banded global alignment *
187 ***************************/
188 uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
191 dpcell_t **dpcell, *q;
192 dpscore_t *curr, *last, *s;
194 int *mat, end, max = 0;
198 int gap_open, gap_ext, gap_end, b;
199 int *score_matrix, N_MATRIX_ROW;
201 /* initialize some align-related parameters. just for compatibility */
202 gap_open = ap->gap_open;
203 gap_ext = ap->gap_ext;
204 gap_end = ap->gap_end;
206 score_matrix = ap->matrix;
207 N_MATRIX_ROW = ap->row;
210 if (len1 == 0 || len2 == 0) return 0;
212 /* calculate b1 and b2 */
214 b1 = len1 - len2 + b;
218 b2 = len2 - len1 + b;
220 if (b1 > len1) b1 = len1;
221 if (b2 > len2) b2 = len2;
224 /* allocate memory */
225 end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
226 dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
227 for (j = 0; j <= len2; ++j)
228 dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
229 for (j = b2 + 1; j <= len2; ++j)
231 curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
232 last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
235 SET_INF(*curr); curr->M = 0;
236 for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
238 set_end_D(s->D, dpcell[0] + i, s - 1);
240 s = curr; curr = last; last = s;
242 /* core dynamic programming, part 1 */
243 tmp_end = (b2 < len2)? b2 : len2 - 1;
244 for (j = 1; j <= tmp_end; ++j) {
245 q = dpcell[j]; s = curr; SET_INF(*s);
246 set_end_I(s->I, q, last);
247 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
248 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
250 for (i = 1; i != end; ++i, ++s, ++q) {
251 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
252 set_I(s->I, q, last + i);
253 set_D(s->D, q, s - 1);
255 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
256 set_D(s->D, q, s - 1);
257 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
258 set_end_I(s->I, q, last + i);
259 } else s->I = MINOR_INF;
260 s = curr; curr = last; last = s;
262 /* last row for part 1, use set_end_D() instead of set_D() */
263 if (j == len2 && b2 != len2 - 1) {
264 q = dpcell[j]; s = curr; SET_INF(*s);
265 set_end_I(s->I, q, last);
266 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
267 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
269 for (i = 1; i != end; ++i, ++s, ++q) {
270 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
271 set_I(s->I, q, last + i);
272 set_end_D(s->D, q, s - 1);
274 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
275 set_end_D(s->D, q, s - 1);
276 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
277 set_end_I(s->I, q, last + i);
278 } else s->I = MINOR_INF;
279 s = curr; curr = last; last = s;
283 /* core dynamic programming, part 2 */
284 for (; j <= len2 - b2 + 1; ++j) {
285 SET_INF(curr[j - b2]);
286 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
288 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
289 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
290 set_I(s->I, q, last + i);
291 set_D(s->D, q, s - 1);
293 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
294 set_D(s->D, q, s - 1);
296 s = curr; curr = last; last = s;
299 /* core dynamic programming, part 3 */
300 for (; j < len2; ++j) {
301 SET_INF(curr[j - b2]);
302 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
303 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
304 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
305 set_I(s->I, q, last + i);
306 set_D(s->D, q, s - 1);
308 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
309 set_end_I(s->I, q, last + i);
310 set_D(s->D, q, s - 1);
311 s = curr; curr = last; last = s;
315 SET_INF(curr[j - b2]);
316 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
317 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
318 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
319 set_I(s->I, q, last + i);
320 set_end_D(s->D, q, s - 1);
322 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
323 set_end_I(s->I, q, last + i);
324 set_end_D(s->D, q, s - 1);
325 s = curr; curr = last; last = s;
328 *_score = last[len1].M;
329 if (n_cigar) { /* backtrace */
330 path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
334 max = s->M; type = q->Mt; ctype = FROM_M;
335 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
336 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
339 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
343 case FROM_M: --i; --j; break;
344 case FROM_I: --j; break;
345 case FROM_D: --i; break;
350 case FROM_M: type = q->Mt; break;
351 case FROM_I: type = q->It; break;
352 case FROM_D: type = q->Dt; break;
354 p->ctype = ctype; p->i = i; p->j = j;
357 cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
362 for (j = b2 + 1; j <= len2; ++j)
364 for (j = 0; j <= len2; ++j)
367 free(curr); free(last);