3 Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
41 int aln_sm_blosum62[] = {
42 /* A R N D C Q E G H I L K M F P S T W Y V * X */
43 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
44 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
45 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
46 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
47 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
48 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
49 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
50 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
51 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
52 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
53 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
54 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
55 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
56 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
57 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
58 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
59 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
60 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
61 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
62 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
63 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
64 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
67 int aln_sm_blast[] = {
75 ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 };
76 ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 };
78 static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
82 unsigned char last_type;
84 if (path_len == 0 || path == 0) {
89 last_type = path->ctype;
90 for (i = n = 1; i < path_len; ++i) {
91 if (last_type != path[i].ctype) ++n;
92 last_type = path[i].ctype;
95 cigar = (uint32_t*)calloc(*n_cigar, 4);
97 cigar[0] = 1u << 4 | path[path_len-1].ctype;
98 last_type = path[path_len-1].ctype;
99 for (i = path_len - 2, n = 0; i >= 0; --i) {
100 if (path[i].ctype == last_type) cigar[n] += 1u << 4;
102 cigar[++n] = 1u << 4 | path[i].ctype;
103 last_type = path[i].ctype;
110 /***************************/
111 /* START OF common_align.c */
112 /***************************/
114 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
116 #define set_M(MM, cur, p, sc) \
118 if ((p)->M >= (p)->I) { \
119 if ((p)->M >= (p)->D) { \
120 (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
122 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
125 if ((p)->I > (p)->D) { \
126 (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
128 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
132 #define set_I(II, cur, p) \
134 if ((p)->M - gap_open > (p)->I) { \
135 (cur)->It = FROM_M; \
136 (II) = (p)->M - gap_open - gap_ext; \
138 (cur)->It = FROM_I; \
139 (II) = (p)->I - gap_ext; \
142 #define set_end_I(II, cur, p) \
144 if (gap_end_ext >= 0) { \
145 if ((p)->M - gap_end_open > (p)->I) { \
146 (cur)->It = FROM_M; \
147 (II) = (p)->M - gap_end_open - gap_end_ext; \
149 (cur)->It = FROM_I; \
150 (II) = (p)->I - gap_end_ext; \
152 } else set_I(II, cur, p); \
154 #define set_D(DD, cur, p) \
156 if ((p)->M - gap_open > (p)->D) { \
157 (cur)->Dt = FROM_M; \
158 (DD) = (p)->M - gap_open - gap_ext; \
160 (cur)->Dt = FROM_D; \
161 (DD) = (p)->D - gap_ext; \
164 #define set_end_D(DD, cur, p) \
166 if (gap_end_ext >= 0) { \
167 if ((p)->M - gap_end_open > (p)->D) { \
168 (cur)->Dt = FROM_M; \
169 (DD) = (p)->M - gap_end_open - gap_end_ext; \
171 (cur)->Dt = FROM_D; \
172 (DD) = (p)->D - gap_end_ext; \
174 } else set_D(DD, cur, p); \
178 uint8_t Mt:3, It:2, Dt:2;
185 /***************************
186 * banded global alignment *
187 ***************************/
188 uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
191 dpcell_t **dpcell, *q;
192 dpscore_t *curr, *last, *s;
194 int *mat, end, max = 0;
198 int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
199 int *score_matrix, N_MATRIX_ROW;
201 /* initialize some align-related parameters. just for compatibility */
202 gap_open = ap->gap_open;
203 gap_ext = ap->gap_ext;
204 gap_end_open = ap->gap_end_open;
205 gap_end_ext = ap->gap_end_ext;
207 score_matrix = ap->matrix;
208 N_MATRIX_ROW = ap->row;
211 if (len1 == 0 || len2 == 0) return 0;
213 /* calculate b1 and b2 */
215 b1 = len1 - len2 + b;
219 b2 = len2 - len1 + b;
221 if (b1 > len1) b1 = len1;
222 if (b2 > len2) b2 = len2;
225 /* allocate memory */
226 end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
227 dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
228 for (j = 0; j <= len2; ++j)
229 dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
230 for (j = b2 + 1; j <= len2; ++j)
232 curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
233 last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
236 SET_INF(*curr); curr->M = 0;
237 for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
239 set_end_D(s->D, dpcell[0] + i, s - 1);
241 s = curr; curr = last; last = s;
243 /* core dynamic programming, part 1 */
244 tmp_end = (b2 < len2)? b2 : len2 - 1;
245 for (j = 1; j <= tmp_end; ++j) {
246 q = dpcell[j]; s = curr; SET_INF(*s);
247 set_end_I(s->I, q, last);
248 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
249 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
251 for (i = 1; i != end; ++i, ++s, ++q) {
252 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
253 set_I(s->I, q, last + i);
254 set_D(s->D, q, s - 1);
256 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
257 set_D(s->D, q, s - 1);
258 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
259 set_end_I(s->I, q, last + i);
260 } else s->I = MINOR_INF;
261 s = curr; curr = last; last = s;
263 /* last row for part 1, use set_end_D() instead of set_D() */
264 if (j == len2 && b2 != len2 - 1) {
265 q = dpcell[j]; s = curr; SET_INF(*s);
266 set_end_I(s->I, q, last);
267 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
268 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
270 for (i = 1; i != end; ++i, ++s, ++q) {
271 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
272 set_I(s->I, q, last + i);
273 set_end_D(s->D, q, s - 1);
275 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
276 set_end_D(s->D, q, s - 1);
277 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
278 set_end_I(s->I, q, last + i);
279 } else s->I = MINOR_INF;
280 s = curr; curr = last; last = s;
284 /* core dynamic programming, part 2 */
285 for (; j <= len2 - b2 + 1; ++j) {
286 SET_INF(curr[j - b2]);
287 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
289 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
290 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
291 set_I(s->I, q, last + i);
292 set_D(s->D, q, s - 1);
294 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
295 set_D(s->D, q, s - 1);
297 s = curr; curr = last; last = s;
300 /* core dynamic programming, part 3 */
301 for (; j < len2; ++j) {
302 SET_INF(curr[j - b2]);
303 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
304 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
305 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
306 set_I(s->I, q, last + i);
307 set_D(s->D, q, s - 1);
309 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
310 set_end_I(s->I, q, last + i);
311 set_D(s->D, q, s - 1);
312 s = curr; curr = last; last = s;
316 SET_INF(curr[j - b2]);
317 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
318 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
319 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
320 set_I(s->I, q, last + i);
321 set_end_D(s->D, q, s - 1);
323 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
324 set_end_I(s->I, q, last + i);
325 set_end_D(s->D, q, s - 1);
326 s = curr; curr = last; last = s;
329 *_score = last[len1].M;
330 if (n_cigar) { /* backtrace */
331 path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
335 max = s->M; type = q->Mt; ctype = FROM_M;
336 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
337 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
340 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
344 case FROM_M: --i; --j; break;
345 case FROM_I: --j; break;
346 case FROM_D: --i; break;
351 case FROM_M: type = q->Mt; break;
352 case FROM_I: type = q->It; break;
353 case FROM_D: type = q->Dt; break;
355 p->ctype = ctype; p->i = i; p->j = j;
358 cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
363 for (j = b2 + 1; j <= len2; ++j)
365 for (j = 0; j <= len2; ++j)
368 free(curr); free(last);