3 Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
41 int aln_sm_blosum62[] = {
42 /* A R N D C Q E G H I L K M F P S T W Y V * X */
43 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
44 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
45 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
46 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
47 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
48 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
49 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
50 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
51 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
52 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
53 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
54 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
55 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
56 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
57 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
58 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
59 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
60 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
61 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
62 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
63 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
64 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
67 int aln_sm_blast[] = {
75 ka_param_t ka_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 };
76 ka_param_t ka_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 };
78 static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
82 unsigned char last_type;
84 if (path_len == 0 || path == 0) {
89 last_type = path->ctype;
90 for (i = n = 1; i < path_len; ++i) {
91 if (last_type != path[i].ctype) ++n;
92 last_type = path[i].ctype;
95 cigar = (uint32_t*)calloc(*n_cigar, 4);
97 cigar[0] = 1u << 4 | path[path_len-1].ctype;
98 last_type = path[path_len-1].ctype;
99 for (i = path_len - 2, n = 0; i >= 0; --i) {
100 if (path[i].ctype == last_type) cigar[n] += 1u << 4;
102 cigar[++n] = 1u << 4 | path[i].ctype;
103 last_type = path[i].ctype;
110 /***************************/
111 /* START OF common_align.c */
112 /***************************/
114 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
116 #define set_M(MM, cur, p, sc) \
118 if ((p)->M >= (p)->I) { \
119 if ((p)->M >= (p)->D) { \
120 (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
122 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
125 if ((p)->I > (p)->D) { \
126 (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
128 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
132 #define set_I(II, cur, p) \
134 if ((p)->M - gap_open > (p)->I) { \
135 (cur)->It = FROM_M; \
136 (II) = (p)->M - gap_open - gap_ext; \
138 (cur)->It = FROM_I; \
139 (II) = (p)->I - gap_ext; \
142 #define set_end_I(II, cur, p) \
144 if (gap_end >= 0) { \
145 if ((p)->M - gap_open > (p)->I) { \
146 (cur)->It = FROM_M; \
147 (II) = (p)->M - gap_open - gap_end; \
149 (cur)->It = FROM_I; \
150 (II) = (p)->I - gap_end; \
152 } else set_I(II, cur, p); \
154 #define set_D(DD, cur, p) \
156 if ((p)->M - gap_open > (p)->D) { \
157 (cur)->Dt = FROM_M; \
158 (DD) = (p)->M - gap_open - gap_ext; \
160 (cur)->Dt = FROM_D; \
161 (DD) = (p)->D - gap_ext; \
164 #define set_end_D(DD, cur, p) \
166 if (gap_end >= 0) { \
167 if ((p)->M - gap_open > (p)->D) { \
168 (cur)->Dt = FROM_M; \
169 (DD) = (p)->M - gap_open - gap_end; \
171 (cur)->Dt = FROM_D; \
172 (DD) = (p)->D - gap_end; \
174 } else set_D(DD, cur, p); \
178 uint8_t Mt:3, It:2, Dt:2;
185 /* build score profile for accelerating alignment, in theory */
186 static void aln_init_score_array(uint8_t *seq, int len, int row, int *score_matrix, int **s_array)
188 int *tmp, *tmp2, i, k;
189 for (i = 0; i != row; ++i) {
190 tmp = score_matrix + i * row;
192 for (k = 0; k != len; ++k)
193 tmp2[k] = tmp[seq[k]];
196 /***************************
197 * banded global alignment *
198 ***************************/
199 uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
202 dpcell_t **dpcell, *q;
203 dpscore_t *curr, *last, *s;
205 int *mat, end, max = 0;
209 int gap_open, gap_ext, gap_end, b;
210 int *score_matrix, N_MATRIX_ROW;
212 /* initialize some align-related parameters. just for compatibility */
213 gap_open = ap->gap_open;
214 gap_ext = ap->gap_ext;
215 gap_end = ap->gap_end;
217 score_matrix = ap->matrix;
218 N_MATRIX_ROW = ap->row;
221 if (len1 == 0 || len2 == 0) return 0;
223 /* calculate b1 and b2 */
225 b1 = len1 - len2 + b;
229 b2 = len2 - len1 + b;
231 if (b1 > len1) b1 = len1;
232 if (b2 > len2) b2 = len2;
235 /* allocate memory */
236 end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
237 dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
238 for (j = 0; j <= len2; ++j)
239 dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
240 for (j = b2 + 1; j <= len2; ++j)
242 curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
243 last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
246 SET_INF(*curr); curr->M = 0;
247 for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
249 set_end_D(s->D, dpcell[0] + i, s - 1);
251 s = curr; curr = last; last = s;
253 /* core dynamic programming, part 1 */
254 tmp_end = (b2 < len2)? b2 : len2 - 1;
255 for (j = 1; j <= tmp_end; ++j) {
256 q = dpcell[j]; s = curr; SET_INF(*s);
257 set_end_I(s->I, q, last);
258 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
259 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
261 for (i = 1; i != end; ++i, ++s, ++q) {
262 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
263 set_I(s->I, q, last + i);
264 set_D(s->D, q, s - 1);
266 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
267 set_D(s->D, q, s - 1);
268 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
269 set_end_I(s->I, q, last + i);
270 } else s->I = MINOR_INF;
271 s = curr; curr = last; last = s;
273 /* last row for part 1, use set_end_D() instead of set_D() */
274 if (j == len2 && b2 != len2 - 1) {
275 q = dpcell[j]; s = curr; SET_INF(*s);
276 set_end_I(s->I, q, last);
277 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
278 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
280 for (i = 1; i != end; ++i, ++s, ++q) {
281 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
282 set_I(s->I, q, last + i);
283 set_end_D(s->D, q, s - 1);
285 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
286 set_end_D(s->D, q, s - 1);
287 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
288 set_end_I(s->I, q, last + i);
289 } else s->I = MINOR_INF;
290 s = curr; curr = last; last = s;
294 /* core dynamic programming, part 2 */
295 for (; j <= len2 - b2 + 1; ++j) {
296 SET_INF(curr[j - b2]);
297 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
299 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
300 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
301 set_I(s->I, q, last + i);
302 set_D(s->D, q, s - 1);
304 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
305 set_D(s->D, q, s - 1);
307 s = curr; curr = last; last = s;
310 /* core dynamic programming, part 3 */
311 for (; j < len2; ++j) {
312 SET_INF(curr[j - b2]);
313 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
314 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
315 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
316 set_I(s->I, q, last + i);
317 set_D(s->D, q, s - 1);
319 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
320 set_end_I(s->I, q, last + i);
321 set_D(s->D, q, s - 1);
322 s = curr; curr = last; last = s;
326 SET_INF(curr[j - b2]);
327 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
328 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
329 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
330 set_I(s->I, q, last + i);
331 set_end_D(s->D, q, s - 1);
333 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
334 set_end_I(s->I, q, last + i);
335 set_end_D(s->D, q, s - 1);
336 s = curr; curr = last; last = s;
339 *_score = last[len1].M;
340 if (n_cigar) { /* backtrace */
341 path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
345 max = s->M; type = q->Mt; ctype = FROM_M;
346 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
347 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
350 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
354 case FROM_M: --i; --j; break;
355 case FROM_I: --j; break;
356 case FROM_D: --i; break;
361 case FROM_M: type = q->Mt; break;
362 case FROM_I: type = q->It; break;
363 case FROM_D: type = q->Dt; break;
365 p->ctype = ctype; p->i = i; p->j = j;
368 cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
373 for (j = b2 + 1; j <= len2; ++j)
375 for (j = 0; j <= len2; ++j)
378 free(curr); free(last);