3 Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42 int aln_sm_blosum62[] = {
43 /* A R N D C Q E G H I L K M F P S T W Y V * X */
44 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
45 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
46 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
47 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
48 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
49 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
50 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
51 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
52 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
53 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
54 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
55 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
56 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
57 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
58 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
59 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
60 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
61 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
62 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
63 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
64 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
65 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
68 int aln_sm_blast[] = {
76 ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 };
77 ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 };
79 static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
83 unsigned char last_type;
85 if (path_len == 0 || path == 0) {
90 last_type = path->ctype;
91 for (i = n = 1; i < path_len; ++i) {
92 if (last_type != path[i].ctype) ++n;
93 last_type = path[i].ctype;
96 cigar = (uint32_t*)calloc(*n_cigar, 4);
98 cigar[0] = 1u << 4 | path[path_len-1].ctype;
99 last_type = path[path_len-1].ctype;
100 for (i = path_len - 2, n = 0; i >= 0; --i) {
101 if (path[i].ctype == last_type) cigar[n] += 1u << 4;
103 cigar[++n] = 1u << 4 | path[i].ctype;
104 last_type = path[i].ctype;
111 /***************************/
112 /* START OF common_align.c */
113 /***************************/
115 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
117 #define set_M(MM, cur, p, sc) \
119 if ((p)->M >= (p)->I) { \
120 if ((p)->M >= (p)->D) { \
121 (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
123 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
126 if ((p)->I > (p)->D) { \
127 (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
129 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
133 #define set_I(II, cur, p) \
135 if ((p)->M - gap_open > (p)->I) { \
136 (cur)->It = FROM_M; \
137 (II) = (p)->M - gap_open - gap_ext; \
139 (cur)->It = FROM_I; \
140 (II) = (p)->I - gap_ext; \
143 #define set_end_I(II, cur, p) \
145 if (gap_end_ext >= 0) { \
146 if ((p)->M - gap_end_open > (p)->I) { \
147 (cur)->It = FROM_M; \
148 (II) = (p)->M - gap_end_open - gap_end_ext; \
150 (cur)->It = FROM_I; \
151 (II) = (p)->I - gap_end_ext; \
153 } else set_I(II, cur, p); \
155 #define set_D(DD, cur, p) \
157 if ((p)->M - gap_open > (p)->D) { \
158 (cur)->Dt = FROM_M; \
159 (DD) = (p)->M - gap_open - gap_ext; \
161 (cur)->Dt = FROM_D; \
162 (DD) = (p)->D - gap_ext; \
165 #define set_end_D(DD, cur, p) \
167 if (gap_end_ext >= 0) { \
168 if ((p)->M - gap_end_open > (p)->D) { \
169 (cur)->Dt = FROM_M; \
170 (DD) = (p)->M - gap_end_open - gap_end_ext; \
172 (cur)->Dt = FROM_D; \
173 (DD) = (p)->D - gap_end_ext; \
175 } else set_D(DD, cur, p); \
179 uint8_t Mt:3, It:2, Dt:2;
186 /***************************
187 * banded global alignment *
188 ***************************/
189 uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
192 dpcell_t **dpcell, *q;
193 dpscore_t *curr, *last, *s;
195 int *mat, end, max = 0;
199 int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
200 int *score_matrix, N_MATRIX_ROW;
202 /* initialize some align-related parameters. just for compatibility */
203 gap_open = ap->gap_open;
204 gap_ext = ap->gap_ext;
205 gap_end_open = ap->gap_end_open;
206 gap_end_ext = ap->gap_end_ext;
208 score_matrix = ap->matrix;
209 N_MATRIX_ROW = ap->row;
212 if (len1 == 0 || len2 == 0) return 0;
214 /* calculate b1 and b2 */
216 b1 = len1 - len2 + b;
220 b2 = len2 - len1 + b;
222 if (b1 > len1) b1 = len1;
223 if (b2 > len2) b2 = len2;
226 /* allocate memory */
227 end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
228 dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
229 for (j = 0; j <= len2; ++j)
230 dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
231 for (j = b2 + 1; j <= len2; ++j)
233 curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
234 last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
237 SET_INF(*curr); curr->M = 0;
238 for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
240 set_end_D(s->D, dpcell[0] + i, s - 1);
242 s = curr; curr = last; last = s;
244 /* core dynamic programming, part 1 */
245 tmp_end = (b2 < len2)? b2 : len2 - 1;
246 for (j = 1; j <= tmp_end; ++j) {
247 q = dpcell[j]; s = curr; SET_INF(*s);
248 set_end_I(s->I, q, last);
249 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
250 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
252 for (i = 1; i != end; ++i, ++s, ++q) {
253 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
254 set_I(s->I, q, last + i);
255 set_D(s->D, q, s - 1);
257 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
258 set_D(s->D, q, s - 1);
259 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
260 set_end_I(s->I, q, last + i);
261 } else s->I = MINOR_INF;
262 s = curr; curr = last; last = s;
264 /* last row for part 1, use set_end_D() instead of set_D() */
265 if (j == len2 && b2 != len2 - 1) {
266 q = dpcell[j]; s = curr; SET_INF(*s);
267 set_end_I(s->I, q, last);
268 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
269 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
271 for (i = 1; i != end; ++i, ++s, ++q) {
272 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
273 set_I(s->I, q, last + i);
274 set_end_D(s->D, q, s - 1);
276 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
277 set_end_D(s->D, q, s - 1);
278 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
279 set_end_I(s->I, q, last + i);
280 } else s->I = MINOR_INF;
281 s = curr; curr = last; last = s;
285 /* core dynamic programming, part 2 */
286 for (; j <= len2 - b2 + 1; ++j) {
287 SET_INF(curr[j - b2]);
288 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
290 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
291 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
292 set_I(s->I, q, last + i);
293 set_D(s->D, q, s - 1);
295 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
296 set_D(s->D, q, s - 1);
298 s = curr; curr = last; last = s;
301 /* core dynamic programming, part 3 */
302 for (; j < len2; ++j) {
303 SET_INF(curr[j - b2]);
304 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
305 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
306 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
307 set_I(s->I, q, last + i);
308 set_D(s->D, q, s - 1);
310 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
311 set_end_I(s->I, q, last + i);
312 set_D(s->D, q, s - 1);
313 s = curr; curr = last; last = s;
317 SET_INF(curr[j - b2]);
318 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
319 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
320 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
321 set_I(s->I, q, last + i);
322 set_end_D(s->D, q, s - 1);
324 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
325 set_end_I(s->I, q, last + i);
326 set_end_D(s->D, q, s - 1);
327 s = curr; curr = last; last = s;
330 *_score = last[len1].M;
331 if (n_cigar) { /* backtrace */
332 path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
336 max = s->M; type = q->Mt; ctype = FROM_M;
337 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
338 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
341 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
345 case FROM_M: --i; --j; break;
346 case FROM_I: --j; break;
347 case FROM_D: --i; break;
352 case FROM_M: type = q->Mt; break;
353 case FROM_I: type = q->It; break;
354 case FROM_D: type = q->Dt; break;
356 p->ctype = ctype; p->i = i; p->j = j;
359 cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
364 for (j = b2 + 1; j <= len2; ++j)
366 for (j = 0; j <= len2; ++j)
369 free(curr); free(last);
374 /***************************
375 * Probabilistic extension *
376 ***************************/
378 #define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; }
380 ka_probpar_t ka_probpar_def = { 0.0001, 0.1, 1 };
386 I[0] I[1] I[k-1] I[k] I[L]
387 ^ \ ^ \ \ ^ \ ^ \ \ ^
388 | \ | \ \ | \ | \ \ |
389 M[0] -> M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1]
392 D[1] -> -> D[k-1] -> D[k] ->
395 Every {M,I}[k], k=0..L connects M[L+1] at the same probability, while
396 no D[k], k=1..L-1 connects M[L+1]. This means an alignment can end up
397 with M or I but not D.
399 Deletions are dumb states which do not emit residues. Frankly, I am
400 not sure if they are handled properly in the following
401 implementation. This is a potential concern to be resolved in future.
403 int ka_prob_extend(uint8_t *_ref, int l_ref, uint8_t *_query, int l_query, float *_qual,
404 const ka_probpar_t *c, int *state, uint8_t *q)
406 double **f, **b, m[9], f_sink, sI, sM, pf, pb, pD;
408 uint8_t *ref, *query;
409 int bw, bw2, i, k, is_diff = 0;
411 // FIXME: floating point underflow
413 /*** initialization ***/
414 ref = _ref - 1; // change to 1-based coordinate
419 // allocate forward and backward matrices f[][] and b[][]
420 f = calloc(l_query+1, sizeof(void*));
421 b = calloc(l_query+1, sizeof(void*));
422 for (i = 0; i <= l_query; ++i) {
423 f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs
424 b[i] = calloc(bw2 * 3 + 6, sizeof(double));
426 // initialize transition probability
427 sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof
428 m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM);
429 m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.;
430 m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e;
436 int beg = 1, end = l_ref, x;
437 x = 0 - bw; beg = beg > x? beg : x;
438 x = 0 + bw; end = end < x? end : x;
439 for (k = beg; k <= end; ++k) {
441 set_u(u, bw, 0, k); set_u(v01, bw, 0, k-1);
442 f[0][u+2] = m[2] * f[0][v01+0] + m[8] * f[0][v01+2];
445 // f[1..l_query]; core loop
446 for (i = 1; i <= l_query; ++i) {
447 double *fi = f[i], *fi1 = f[i-1];
448 int beg = 0, end = l_ref, x;
449 x = i - bw; beg = beg > x? beg : x; // band start
450 x = i + bw; end = end < x? end : x; // band end
451 for (k = beg; k <= end; ++k) {
452 int u, v11, v01, v10;
454 // FIXME: the following line can be optimized without branching!
455 e = (ref[k] > 3 || query[i] > 3)? 1. : ref[k] == query[i]? 1. - qual[i] : qual[i] / 3.;
456 set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1);
457 fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
458 fi[u+1] = .25 * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
459 fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
460 // fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
464 for (k = 0, f_sink = 0.; k <= l_ref; ++k) {
466 set_u(u, bw, l_query, k);
467 if (u < 3 || u >= bw2*3+3) continue;
468 f_sink += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
470 pf = f_sink; // this P(x) calculated by the forward algorithm
473 for (k = 0; k <= l_ref; ++k) {
475 double *bi = b[l_query];
476 set_u(u, bw, l_query, k);
477 if (u < 3 || u >= bw2*3+3) continue;
478 bi[u+0] = sM; bi[u+1] = sI;
480 // b[l_query-1..1]; core loop
481 for (i = l_query - 1; i >= 0; --i) {
482 int beg = 0, end = l_ref, x;
483 double *bi = b[i], *bi1 = b[i+1];
484 x = i - bw; beg = beg > x? beg : x;
485 x = i + bw; end = end < x? end : x;
486 for (k = end; k >= beg; --k) {
487 int u, v11, v01, v10;
489 e = k >= l_ref? 0 : (ref[k+1] > 3 || query[i+1] > 3)? 1. : ref[k+1] == query[i+1]? 1. - qual[i+1] : qual[i+1] / 3.;
490 set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1);
491 bi[u+0] = e * m[0] * bi1[v11+0] + .25 * m[1] * bi1[v10+1] + m[2] * bi[v01+2];
492 bi[u+1] = e * m[3] * bi1[v11+0] + .25 * m[4] * bi1[v10+1];
493 bi[u+2] = e * m[6] * bi1[v11+0] + m[8] * bi[v01+2];
494 // fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
500 is_diff = fabs(pb/pf - 1.) > 1e-7? 1 : 0;
502 for (i = 1; i <= l_query; ++i) {
503 double sum = 0., *fi = f[i], *bi = b[i], max = 0.;
504 int beg = 0, end = l_ref, x, max_k = -1;
505 x = i - bw; beg = beg > x? beg : x;
506 x = i + bw; end = end < x? end : x;
507 for (k = beg; k <= end; ++k) {
511 z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = k<<2 | 0; sum += z;
512 z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = k<<2 | 1; sum += z;
514 max /= sum; sum /= pD; // if everything works as is expected, sum == 1.0
515 if (state) state[i-1] = max_k;
516 if (q) q[i-1] = -4.343 * log(1. - max);
518 fprintf(stderr, "[%d,%.10lg,%.10lg],%d,%lg,%d:%d,%lg\n", is_diff, pf, pb, i, sum, max_k>>2, max_k&3, max);
522 for (i = 0; i <= l_query; ++i) {
523 free(f[i]); free(b[i]);
532 int l_ref = 5, l_query = 3;
533 uint8_t *ref = (uint8_t*)"\0\1\3\3\1";
534 // uint8_t *query = (uint8_t*)"\0\3\3\1";
535 uint8_t *query = (uint8_t*)"\1\3\3\1"; // FIXME: the output is not so right given this input!!!
536 static float qual[4] = {.01, .01, .01, .01};
537 ka_prob_extend(ref, l_ref, query, l_query, qual, &ka_probpar_def, 0, 0);