3 Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42 int aln_sm_blosum62[] = {
43 /* A R N D C Q E G H I L K M F P S T W Y V * X */
44 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
45 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
46 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
47 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
48 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
49 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
50 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
51 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
52 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
53 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
54 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
55 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
56 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
57 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
58 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
59 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
60 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
61 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
62 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
63 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
64 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
65 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
68 int aln_sm_blast[] = {
76 ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 };
77 ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 };
79 static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
83 unsigned char last_type;
85 if (path_len == 0 || path == 0) {
90 last_type = path->ctype;
91 for (i = n = 1; i < path_len; ++i) {
92 if (last_type != path[i].ctype) ++n;
93 last_type = path[i].ctype;
96 cigar = (uint32_t*)calloc(*n_cigar, 4);
98 cigar[0] = 1u << 4 | path[path_len-1].ctype;
99 last_type = path[path_len-1].ctype;
100 for (i = path_len - 2, n = 0; i >= 0; --i) {
101 if (path[i].ctype == last_type) cigar[n] += 1u << 4;
103 cigar[++n] = 1u << 4 | path[i].ctype;
104 last_type = path[i].ctype;
111 /***************************/
112 /* START OF common_align.c */
113 /***************************/
115 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
117 #define set_M(MM, cur, p, sc) \
119 if ((p)->M >= (p)->I) { \
120 if ((p)->M >= (p)->D) { \
121 (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
123 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
126 if ((p)->I > (p)->D) { \
127 (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
129 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
133 #define set_I(II, cur, p) \
135 if ((p)->M - gap_open > (p)->I) { \
136 (cur)->It = FROM_M; \
137 (II) = (p)->M - gap_open - gap_ext; \
139 (cur)->It = FROM_I; \
140 (II) = (p)->I - gap_ext; \
143 #define set_end_I(II, cur, p) \
145 if (gap_end_ext >= 0) { \
146 if ((p)->M - gap_end_open > (p)->I) { \
147 (cur)->It = FROM_M; \
148 (II) = (p)->M - gap_end_open - gap_end_ext; \
150 (cur)->It = FROM_I; \
151 (II) = (p)->I - gap_end_ext; \
153 } else set_I(II, cur, p); \
155 #define set_D(DD, cur, p) \
157 if ((p)->M - gap_open > (p)->D) { \
158 (cur)->Dt = FROM_M; \
159 (DD) = (p)->M - gap_open - gap_ext; \
161 (cur)->Dt = FROM_D; \
162 (DD) = (p)->D - gap_ext; \
165 #define set_end_D(DD, cur, p) \
167 if (gap_end_ext >= 0) { \
168 if ((p)->M - gap_end_open > (p)->D) { \
169 (cur)->Dt = FROM_M; \
170 (DD) = (p)->M - gap_end_open - gap_end_ext; \
172 (cur)->Dt = FROM_D; \
173 (DD) = (p)->D - gap_end_ext; \
175 } else set_D(DD, cur, p); \
179 uint8_t Mt:3, It:2, Dt:2;
186 /***************************
187 * banded global alignment *
188 ***************************/
189 uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
192 dpcell_t **dpcell, *q;
193 dpscore_t *curr, *last, *s;
195 int *mat, end, max = 0;
199 int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
200 int *score_matrix, N_MATRIX_ROW;
202 /* initialize some align-related parameters. just for compatibility */
203 gap_open = ap->gap_open;
204 gap_ext = ap->gap_ext;
205 gap_end_open = ap->gap_end_open;
206 gap_end_ext = ap->gap_end_ext;
208 score_matrix = ap->matrix;
209 N_MATRIX_ROW = ap->row;
212 if (len1 == 0 || len2 == 0) return 0;
214 /* calculate b1 and b2 */
216 b1 = len1 - len2 + b;
220 b2 = len2 - len1 + b;
222 if (b1 > len1) b1 = len1;
223 if (b2 > len2) b2 = len2;
226 /* allocate memory */
227 end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
228 dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
229 for (j = 0; j <= len2; ++j)
230 dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
231 for (j = b2 + 1; j <= len2; ++j)
233 curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
234 last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
237 SET_INF(*curr); curr->M = 0;
238 for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
240 set_end_D(s->D, dpcell[0] + i, s - 1);
242 s = curr; curr = last; last = s;
244 /* core dynamic programming, part 1 */
245 tmp_end = (b2 < len2)? b2 : len2 - 1;
246 for (j = 1; j <= tmp_end; ++j) {
247 q = dpcell[j]; s = curr; SET_INF(*s);
248 set_end_I(s->I, q, last);
249 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
250 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
252 for (i = 1; i != end; ++i, ++s, ++q) {
253 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
254 set_I(s->I, q, last + i);
255 set_D(s->D, q, s - 1);
257 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
258 set_D(s->D, q, s - 1);
259 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
260 set_end_I(s->I, q, last + i);
261 } else s->I = MINOR_INF;
262 s = curr; curr = last; last = s;
264 /* last row for part 1, use set_end_D() instead of set_D() */
265 if (j == len2 && b2 != len2 - 1) {
266 q = dpcell[j]; s = curr; SET_INF(*s);
267 set_end_I(s->I, q, last);
268 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
269 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
271 for (i = 1; i != end; ++i, ++s, ++q) {
272 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
273 set_I(s->I, q, last + i);
274 set_end_D(s->D, q, s - 1);
276 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
277 set_end_D(s->D, q, s - 1);
278 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
279 set_end_I(s->I, q, last + i);
280 } else s->I = MINOR_INF;
281 s = curr; curr = last; last = s;
285 /* core dynamic programming, part 2 */
286 for (; j <= len2 - b2 + 1; ++j) {
287 SET_INF(curr[j - b2]);
288 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
290 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
291 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
292 set_I(s->I, q, last + i);
293 set_D(s->D, q, s - 1);
295 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
296 set_D(s->D, q, s - 1);
298 s = curr; curr = last; last = s;
301 /* core dynamic programming, part 3 */
302 for (; j < len2; ++j) {
303 SET_INF(curr[j - b2]);
304 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
305 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
306 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
307 set_I(s->I, q, last + i);
308 set_D(s->D, q, s - 1);
310 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
311 set_end_I(s->I, q, last + i);
312 set_D(s->D, q, s - 1);
313 s = curr; curr = last; last = s;
317 SET_INF(curr[j - b2]);
318 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
319 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
320 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
321 set_I(s->I, q, last + i);
322 set_end_D(s->D, q, s - 1);
324 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
325 set_end_I(s->I, q, last + i);
326 set_end_D(s->D, q, s - 1);
327 s = curr; curr = last; last = s;
330 *_score = last[len1].M;
331 if (n_cigar) { /* backtrace */
332 path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
336 max = s->M; type = q->Mt; ctype = FROM_M;
337 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
338 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
341 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
345 case FROM_M: --i; --j; break;
346 case FROM_I: --j; break;
347 case FROM_D: --i; break;
352 case FROM_M: type = q->Mt; break;
353 case FROM_I: type = q->It; break;
354 case FROM_D: type = q->Dt; break;
356 p->ctype = ctype; p->i = i; p->j = j;
359 cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
364 for (j = b2 + 1; j <= len2; ++j)
366 for (j = 0; j <= len2; ++j)
369 free(curr); free(last);
374 /************************
375 * Probabilistic glocal *
376 ************************/
378 static float g_qual2prob[256];
380 #define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; }
382 ka_probpar_t ka_probpar_def = { 0.001, 0.1, 10 };
385 The topology of the profile HMM:
388 I[1] I[k-1] I[k] I[L]
391 M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1]
397 M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1].
399 int ka_prob_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
400 const ka_probpar_t *c, int *state, uint8_t *q)
402 double **f, **b, *s, m[9], sI, sM, bI, bM, pb;
404 const uint8_t *ref, *query;
405 int bw, bw2, i, k, is_diff = 0;
407 /*** initialization ***/
408 ref = _ref - 1; query = _query - 1; // change to 1-based coordinate
409 bw = l_ref > l_query? l_ref : l_query;
410 if (bw > c->bw) bw = c->bw;
411 if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query);
413 // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
414 f = calloc(l_query+1, sizeof(void*));
415 b = calloc(l_query+1, sizeof(void*));
416 for (i = 0; i <= l_query; ++i) {
417 f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs
418 b[i] = calloc(bw2 * 3 + 6, sizeof(double));
420 s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow
422 _qual = calloc(l_query, sizeof(float));
423 if (g_qual2prob[0] == 0)
424 for (i = 0; i < 256; ++i)
425 g_qual2prob[i] = pow(10, -i/10.);
426 for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30];
428 // initialize transition probability
429 sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof
430 m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM);
431 m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.;
432 m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e;
433 bM = (1 - c->d) / l_query; bI = c->d / l_query; // (bM+bI)*l_query==1
439 double *fi = f[1], sum;
440 int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
441 for (k = beg, sum = 0.; k <= end; ++k) {
443 double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] / 3.;
445 fi[u+0] = e * bM; fi[u+1] = .25 * bI;
446 sum += fi[u] + fi[u+1];
450 set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2;
451 for (k = _beg; k <= _end; ++k) fi[k] /= sum;
454 for (i = 2; i <= l_query; ++i) {
455 double *fi = f[i], *fi1 = f[i-1], sum;
456 int beg = 1, end = l_ref, x, _beg, _end;
457 x = i - bw; beg = beg > x? beg : x; // band start
458 x = i + bw; end = end < x? end : x; // band end
459 for (k = beg, sum = 0.; k <= end; ++k) {
460 int u, v11, v01, v10;
462 // FIXME: the following line can be optimized without branching
463 e = (ref[k] > 3 || query[i] > 3)? 1. : ref[k] == query[i]? 1. - qual[i] : qual[i] / 3.;
464 set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1);
465 fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
466 fi[u+1] = .25 * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
467 fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
468 sum += fi[u] + fi[u+1] + fi[u+2];
469 // fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
473 set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
474 for (k = _beg; k <= _end; ++k) fi[k] /= sum;
478 for (k = 1, sum = 0.; k <= l_ref; ++k) {
480 set_u(u, bw, l_query, k);
481 if (u < 3 || u >= bw2*3+3) continue;
482 sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
484 s[l_query+1] = sum; // the last scaling factor
487 // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
488 for (k = 1; k <= l_ref; ++k) {
490 double *bi = b[l_query];
491 set_u(u, bw, l_query, k);
492 if (u < 3 || u >= bw2*3+3) continue;
493 bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
496 for (i = l_query - 1; i >= 1; --i) {
497 int beg = 1, end = l_ref, x, _beg, _end;
498 double *bi = b[i], *bi1 = b[i+1];
499 x = i - bw; beg = beg > x? beg : x;
500 x = i + bw; end = end < x? end : x;
501 for (k = end; k >= beg; --k) {
502 int u, v11, v01, v10;
504 // FIXME: the following can be optimized without branching
505 e = k >= l_ref? 0 : (ref[k+1] > 3 || query[i+1] > 3)? 1. : ref[k+1] == query[i+1]? 1. - qual[i+1] : qual[i+1] / 3.;
506 set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1);
507 bi[u+0] = e * m[0] * bi1[v11+0] + .25 * m[1] * bi1[v10+1] + m[2] * bi[v01+2];
508 bi[u+1] = e * m[3] * bi1[v11+0] + .25 * m[4] * bi1[v10+1];
509 // FIXME: I do not know why I need this (i>1) factor, but only with it the result makes sense.
510 bi[u+2] = (e * m[6] * bi1[v11+0] + m[8] * bi[v01+2]) * (i > 1);
511 // fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
514 set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
515 for (k = _beg; k <= _end; ++k) bi[k] /= s[i];
518 int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
520 for (k = end; k >= beg; --k) {
522 double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] / 3.;
524 if (u < 3 || u >= bw2*3+3) continue;
525 sum += e * b[1][u+0] * bM + .25 * b[1][u+1] * bI;
528 pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0
530 is_diff = fabs(pb - 1.) > 1e-7? 1 : 0;
532 for (i = 1; i <= l_query; ++i) {
533 double sum = 0., *fi = f[i], *bi = b[i], max = 0.;
534 int beg = 0, end = l_ref, x, max_k = -1;
535 x = i - bw; beg = beg > x? beg : x;
536 x = i + bw; end = end < x? end : x;
537 for (k = beg; k <= end; ++k) {
541 z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = k<<2 | 0; sum += z;
542 z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = k<<2 | 1; sum += z;
544 max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
545 if (state) state[i-1] = max_k;
546 if (q) q[i-1] = -4.343 * log(1. - max);
548 fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%d)~%lg\n", pb, sum, i, max_k>>2, max_k&3, max); // DEBUG
552 for (i = 0; i <= l_query; ++i) {
553 free(f[i]); free(b[i]);
555 free(f); free(b); free(s); free(_qual);
562 int l_ref = 5, l_query = 4;
563 uint8_t *ref = (uint8_t*)"\0\1\3\3\1";
564 uint8_t *query = (uint8_t*)"\0\3\3\1";
565 // uint8_t *query = (uint8_t*)"\1\3\3\1";
566 static uint8_t qual[4] = {20, 20, 20, 20};
567 ka_prob_glocal(ref, l_ref, query, l_query, qual, &ka_probpar_def, 0, 0);