3 Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
41 int aln_sm_blosum62[] = {
42 /* A R N D C Q E G H I L K M F P S T W Y V * X */
43 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
44 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
45 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
46 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
47 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
48 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
49 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
50 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
51 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
52 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
53 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
54 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
55 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
56 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
57 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
58 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
59 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
60 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
61 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
62 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
63 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
64 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
67 int aln_sm_blast[] = {
75 ka_param_t ka_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 };
76 ka_param_t ka_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 };
78 static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
82 unsigned char last_type;
84 if (path_len == 0 || path == 0) {
89 last_type = path->ctype;
90 for (i = n = 1; i < path_len; ++i) {
91 if (last_type != path[i].ctype) ++n;
92 last_type = path[i].ctype;
95 cigar = (uint32_t*)calloc(*n_cigar, 4);
97 cigar[0] = 1u << 4 | path[path_len-1].ctype;
98 last_type = path[path_len-1].ctype;
99 for (i = path_len - 2, n = 0; i >= 0; --i) {
100 if (path[i].ctype == last_type) cigar[n] += 1u << 4;
102 cigar[++n] = 1u << 4 | path[i].ctype;
103 last_type = path[i].ctype;
110 /***************************/
111 /* START OF common_align.c */
112 /***************************/
114 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
116 #define set_M(MM, cur, p, sc) \
118 if ((p)->M >= (p)->I) { \
119 if ((p)->M >= (p)->D) { \
120 (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
122 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
125 if ((p)->I > (p)->D) { \
126 (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
128 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
132 #define set_I(II, cur, p) \
134 if ((p)->M - gap_open > (p)->I) { \
135 (cur)->It = FROM_M; \
136 (II) = (p)->M - gap_open - gap_ext; \
138 (cur)->It = FROM_I; \
139 (II) = (p)->I - gap_ext; \
142 #define set_end_I(II, cur, p) \
144 if (gap_end >= 0) { \
145 if ((p)->M - gap_open > (p)->I) { \
146 (cur)->It = FROM_M; \
147 (II) = (p)->M - gap_open - gap_end; \
149 (cur)->It = FROM_I; \
150 (II) = (p)->I - gap_end; \
152 } else set_I(II, cur, p); \
154 #define set_D(DD, cur, p) \
156 if ((p)->M - gap_open > (p)->D) { \
157 (cur)->Dt = FROM_M; \
158 (DD) = (p)->M - gap_open - gap_ext; \
160 (cur)->Dt = FROM_D; \
161 (DD) = (p)->D - gap_ext; \
164 #define set_end_D(DD, cur, p) \
166 if (gap_end >= 0) { \
167 if ((p)->M - gap_open > (p)->D) { \
168 (cur)->Dt = FROM_M; \
169 (DD) = (p)->M - gap_open - gap_end; \
171 (cur)->Dt = FROM_D; \
172 (DD) = (p)->D - gap_end; \
174 } else set_D(DD, cur, p); \
178 uint8_t Mt:3, It:2, Dt:2;
185 /* build score profile for accelerating alignment, in theory */
186 static void aln_init_score_array(uint8_t *seq, int len, int row, int *score_matrix, int **s_array)
188 int *tmp, *tmp2, i, k;
189 for (i = 0; i != row; ++i) {
190 tmp = score_matrix + i * row;
192 for (k = 0; k != len; ++k)
193 tmp2[k] = tmp[seq[k]];
196 /***************************
197 * banded global alignment *
198 ***************************/
199 uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
202 dpcell_t **dpcell, *q;
203 dpscore_t *curr, *last, *s;
205 int *mat, end, max = 0;
209 int gap_open, gap_ext, gap_end, b;
210 int *score_matrix, N_MATRIX_ROW;
212 /* initialize some align-related parameters. just for compatibility */
213 gap_open = ap->gap_open;
214 gap_ext = ap->gap_ext;
215 gap_end = ap->gap_end;
217 score_matrix = ap->matrix;
218 N_MATRIX_ROW = ap->row;
220 if (len1 == 0 || len2 == 0) return 0;
222 /* calculate b1 and b2 */
224 b1 = len1 - len2 + b;
228 b2 = len2 - len1 + b;
230 if (b1 > len1) b1 = len1;
231 if (b2 > len2) b2 = len2;
234 /* allocate memory */
235 end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
236 dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
237 for (j = 0; j <= len2; ++j)
238 dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
239 for (j = b2 + 1; j <= len2; ++j)
241 curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
242 last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
245 SET_INF(*curr); curr->M = 0;
246 for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
248 set_end_D(s->D, dpcell[0] + i, s - 1);
250 s = curr; curr = last; last = s;
252 /* core dynamic programming, part 1 */
253 tmp_end = (b2 < len2)? b2 : len2 - 1;
254 for (j = 1; j <= tmp_end; ++j) {
255 q = dpcell[j]; s = curr; SET_INF(*s);
256 set_end_I(s->I, q, last);
257 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
258 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
260 for (i = 1; i != end; ++i, ++s, ++q) {
261 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
262 set_I(s->I, q, last + i);
263 set_D(s->D, q, s - 1);
265 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
266 set_D(s->D, q, s - 1);
267 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
268 set_end_I(s->I, q, last + i);
269 } else s->I = MINOR_INF;
270 s = curr; curr = last; last = s;
272 /* last row for part 1, use set_end_D() instead of set_D() */
273 if (j == len2 && b2 != len2 - 1) {
274 q = dpcell[j]; s = curr; SET_INF(*s);
275 set_end_I(s->I, q, last);
276 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
277 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
279 for (i = 1; i != end; ++i, ++s, ++q) {
280 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
281 set_I(s->I, q, last + i);
282 set_end_D(s->D, q, s - 1);
284 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
285 set_end_D(s->D, q, s - 1);
286 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
287 set_end_I(s->I, q, last + i);
288 } else s->I = MINOR_INF;
289 s = curr; curr = last; last = s;
293 /* core dynamic programming, part 2 */
294 for (; j <= len2 - b2 + 1; ++j) {
295 SET_INF(curr[j - b2]);
296 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
298 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
299 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
300 set_I(s->I, q, last + i);
301 set_D(s->D, q, s - 1);
303 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
304 set_D(s->D, q, s - 1);
306 s = curr; curr = last; last = s;
309 /* core dynamic programming, part 3 */
310 for (; j < len2; ++j) {
311 SET_INF(curr[j - b2]);
312 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
313 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
314 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
315 set_I(s->I, q, last + i);
316 set_D(s->D, q, s - 1);
318 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
319 set_end_I(s->I, q, last + i);
320 set_D(s->D, q, s - 1);
321 s = curr; curr = last; last = s;
325 SET_INF(curr[j - b2]);
326 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
327 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
328 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
329 set_I(s->I, q, last + i);
330 set_end_D(s->D, q, s - 1);
332 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
333 set_end_I(s->I, q, last + i);
334 set_end_D(s->D, q, s - 1);
335 s = curr; curr = last; last = s;
338 *_score = last[len1].M;
339 if (n_cigar) { /* backtrace */
340 path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
344 max = s->M; type = q->Mt; ctype = FROM_M;
345 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
346 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
349 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
353 case FROM_M: --i; --j; break;
354 case FROM_I: --j; break;
355 case FROM_D: --i; break;
360 case FROM_M: type = q->Mt; break;
361 case FROM_I: type = q->It; break;
362 case FROM_D: type = q->Dt; break;
364 p->ctype = ctype; p->i = i; p->j = j;
367 cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
372 for (j = b2 + 1; j <= len2; ++j)
374 for (j = 0; j <= len2; ++j)
377 free(curr); free(last);