]> git.donarmstrong.com Git - samtools.git/blob - kaln.c
* samtools-0.1.6-12 (r478)
[samtools.git] / kaln.c
1 /* The MIT License
2
3    Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
4
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <stdint.h>
30 #include "kaln.h"
31
32 #define FROM_M 0
33 #define FROM_I 1
34 #define FROM_D 2
35
36 typedef struct {
37         int i, j;
38         unsigned char ctype;
39 } path_t;
40
41 int aln_sm_blosum62[] = {
42 /*       A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  *  X */
43          4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
44         -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
45         -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
46         -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
47          0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
48         -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
49         -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
50          0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
51         -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
52         -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
53         -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
54         -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
55         -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
56         -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
57         -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
58          1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
59          0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
60         -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
61         -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
62          0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
63         -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
64          0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
65 };
66
67 int aln_sm_blast[] = {
68         1, -3, -3, -3, -2,
69         -3, 1, -3, -3, -2,
70         -3, -3, 1, -3, -2,
71         -3, -3, -3, 1, -2,
72         -2, -2, -2, -2, -2
73 };
74
75 ka_param_t ka_param_blast = {  5,  2,  2, aln_sm_blast, 5, 50 };
76 ka_param_t ka_param_aa2aa = { 10,  2,  2, aln_sm_blosum62, 22, 50 };
77
78 static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
79 {
80         int i, n;
81         uint32_t *cigar;
82         unsigned char last_type;
83
84         if (path_len == 0 || path == 0) {
85                 *n_cigar = 0;
86                 return 0;
87         }
88
89         last_type = path->ctype;
90         for (i = n = 1; i < path_len; ++i) {
91                 if (last_type != path[i].ctype) ++n;
92                 last_type = path[i].ctype;
93         }
94         *n_cigar = n;
95         cigar = (uint32_t*)calloc(*n_cigar, 4);
96
97         cigar[0] = 1u << 4 | path[path_len-1].ctype;
98         last_type = path[path_len-1].ctype;
99         for (i = path_len - 2, n = 0; i >= 0; --i) {
100                 if (path[i].ctype == last_type) cigar[n] += 1u << 4;
101                 else {
102                         cigar[++n] = 1u << 4 | path[i].ctype;
103                         last_type = path[i].ctype;
104                 }
105         }
106
107         return cigar;
108 }
109
110 /***************************/
111 /* START OF common_align.c */
112 /***************************/
113
114 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
115
116 #define set_M(MM, cur, p, sc)                                                   \
117 {                                                                                                               \
118         if ((p)->M >= (p)->I) {                                                         \
119                 if ((p)->M >= (p)->D) {                                                 \
120                         (MM) = (p)->M + (sc); (cur)->Mt = FROM_M;       \
121                 } else {                                                                                \
122                         (MM) = (p)->D + (sc); (cur)->Mt = FROM_D;       \
123                 }                                                                                               \
124         } else {                                                                                        \
125                 if ((p)->I > (p)->D) {                                                  \
126                         (MM) = (p)->I + (sc); (cur)->Mt = FROM_I;       \
127                 } else {                                                                                \
128                         (MM) = (p)->D + (sc); (cur)->Mt = FROM_D;       \
129                 }                                                                                               \
130         }                                                                                                       \
131 }
132 #define set_I(II, cur, p)                                                               \
133 {                                                                                                               \
134         if ((p)->M - gap_open > (p)->I) {                                       \
135                 (cur)->It = FROM_M;                                                             \
136                 (II) = (p)->M - gap_open - gap_ext;                             \
137         } else {                                                                                        \
138                 (cur)->It = FROM_I;                                                             \
139                 (II) = (p)->I - gap_ext;                                                \
140         }                                                                                                       \
141 }
142 #define set_end_I(II, cur, p)                                                   \
143 {                                                                                                               \
144         if (gap_end >= 0) {                                                                     \
145                 if ((p)->M - gap_open > (p)->I) {                               \
146                         (cur)->It = FROM_M;                                                     \
147                         (II) = (p)->M - gap_open - gap_end;                     \
148                 } else {                                                                                \
149                         (cur)->It = FROM_I;                                                     \
150                         (II) = (p)->I - gap_end;                                        \
151                 }                                                                                               \
152         } else set_I(II, cur, p);                                                       \
153 }
154 #define set_D(DD, cur, p)                                                               \
155 {                                                                                                               \
156         if ((p)->M - gap_open > (p)->D) {                                       \
157                 (cur)->Dt = FROM_M;                                                             \
158                 (DD) = (p)->M - gap_open - gap_ext;                             \
159         } else {                                                                                        \
160                 (cur)->Dt = FROM_D;                                                             \
161                 (DD) = (p)->D - gap_ext;                                                \
162         }                                                                                                       \
163 }
164 #define set_end_D(DD, cur, p)                                                   \
165 {                                                                                                               \
166         if (gap_end >= 0) {                                                                     \
167                 if ((p)->M - gap_open > (p)->D) {                               \
168                         (cur)->Dt = FROM_M;                                                     \
169                         (DD) = (p)->M - gap_open - gap_end;                     \
170                 } else {                                                                                \
171                         (cur)->Dt = FROM_D;                                                     \
172                         (DD) = (p)->D - gap_end;                                        \
173                 }                                                                                               \
174         } else set_D(DD, cur, p);                                                       \
175 }
176
177 typedef struct {
178         uint8_t Mt:3, It:2, Dt:2;
179 } dpcell_t;
180
181 typedef struct {
182         int M, I, D;
183 } dpscore_t;
184
185 /* build score profile for accelerating alignment, in theory */
186 static void aln_init_score_array(uint8_t *seq, int len, int row, int *score_matrix, int **s_array)
187 {
188         int *tmp, *tmp2, i, k;
189         for (i = 0; i != row; ++i) {
190                 tmp = score_matrix + i * row;
191                 tmp2 = s_array[i];
192                 for (k = 0; k != len; ++k)
193                         tmp2[k] = tmp[seq[k]];
194         }
195 }
196 /***************************
197  * banded global alignment *
198  ***************************/
199 uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
200 {
201         int i, j;
202         dpcell_t **dpcell, *q;
203         dpscore_t *curr, *last, *s;
204         int b1, b2, tmp_end;
205         int *mat, end, max = 0;
206         uint8_t type, ctype;
207         uint32_t *cigar = 0;
208
209         int gap_open, gap_ext, gap_end, b;
210         int *score_matrix, N_MATRIX_ROW;
211
212         /* initialize some align-related parameters. just for compatibility */
213         gap_open = ap->gap_open;
214         gap_ext = ap->gap_ext;
215         gap_end = ap->gap_end;
216         b = ap->band_width;
217         score_matrix = ap->matrix;
218         N_MATRIX_ROW = ap->row;
219
220         *n_cigar = 0;
221         if (len1 == 0 || len2 == 0) return 0;
222
223         /* calculate b1 and b2 */
224         if (len1 > len2) {
225                 b1 = len1 - len2 + b;
226                 b2 = b;
227         } else {
228                 b1 = b;
229                 b2 = len2 - len1 + b;
230         }
231         if (b1 > len1) b1 = len1;
232         if (b2 > len2) b2 = len2;
233         --seq1; --seq2;
234
235         /* allocate memory */
236         end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
237         dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
238         for (j = 0; j <= len2; ++j)
239                 dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
240         for (j = b2 + 1; j <= len2; ++j)
241                 dpcell[j] -= j - b2;
242         curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
243         last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
244         
245         /* set first row */
246         SET_INF(*curr); curr->M = 0;
247         for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
248                 SET_INF(*s);
249                 set_end_D(s->D, dpcell[0] + i, s - 1);
250         }
251         s = curr; curr = last; last = s;
252
253         /* core dynamic programming, part 1 */
254         tmp_end = (b2 < len2)? b2 : len2 - 1;
255         for (j = 1; j <= tmp_end; ++j) {
256                 q = dpcell[j]; s = curr; SET_INF(*s);
257                 set_end_I(s->I, q, last);
258                 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
259                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
260                 ++s; ++q;
261                 for (i = 1; i != end; ++i, ++s, ++q) {
262                         set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
263                         set_I(s->I, q, last + i);
264                         set_D(s->D, q, s - 1);
265                 }
266                 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
267                 set_D(s->D, q, s - 1);
268                 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
269                         set_end_I(s->I, q, last + i);
270                 } else s->I = MINOR_INF;
271                 s = curr; curr = last; last = s;
272         }
273         /* last row for part 1, use set_end_D() instead of set_D() */
274         if (j == len2 && b2 != len2 - 1) {
275                 q = dpcell[j]; s = curr; SET_INF(*s);
276                 set_end_I(s->I, q, last);
277                 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
278                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
279                 ++s; ++q;
280                 for (i = 1; i != end; ++i, ++s, ++q) {
281                         set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
282                         set_I(s->I, q, last + i);
283                         set_end_D(s->D, q, s - 1);
284                 }
285                 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
286                 set_end_D(s->D, q, s - 1);
287                 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
288                         set_end_I(s->I, q, last + i);
289                 } else s->I = MINOR_INF;
290                 s = curr; curr = last; last = s;
291                 ++j;
292         }
293
294         /* core dynamic programming, part 2 */
295         for (; j <= len2 - b2 + 1; ++j) {
296                 SET_INF(curr[j - b2]);
297                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
298                 end = j + b1 - 1;
299                 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
300                         set_M(s->M, q, last + i - 1, mat[seq1[i]]);
301                         set_I(s->I, q, last + i);
302                         set_D(s->D, q, s - 1);
303                 }
304                 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
305                 set_D(s->D, q, s - 1);
306                 s->I = MINOR_INF;
307                 s = curr; curr = last; last = s;
308         }
309
310         /* core dynamic programming, part 3 */
311         for (; j < len2; ++j) {
312                 SET_INF(curr[j - b2]);
313                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
314                 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
315                         set_M(s->M, q, last + i - 1, mat[seq1[i]]);
316                         set_I(s->I, q, last + i);
317                         set_D(s->D, q, s - 1);
318                 }
319                 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
320                 set_end_I(s->I, q, last + i);
321                 set_D(s->D, q, s - 1);
322                 s = curr; curr = last; last = s;
323         }
324         /* last row */
325         if (j == len2) {
326                 SET_INF(curr[j - b2]);
327                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
328                 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
329                         set_M(s->M, q, last + i - 1, mat[seq1[i]]);
330                         set_I(s->I, q, last + i);
331                         set_end_D(s->D, q, s - 1);
332                 }
333                 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
334                 set_end_I(s->I, q, last + i);
335                 set_end_D(s->D, q, s - 1);
336                 s = curr; curr = last; last = s;
337         }
338
339         *_score = last[len1].M;
340         if (n_cigar) { /* backtrace */
341                 path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
342                 i = len1; j = len2;
343                 q = dpcell[j] + i;
344                 s = last + len1;
345                 max = s->M; type = q->Mt; ctype = FROM_M;
346                 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
347                 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
348
349                 p = path;
350                 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
351                 ++p;
352                 do {
353                         switch (ctype) {
354                         case FROM_M: --i; --j; break;
355                         case FROM_I: --j; break;
356                         case FROM_D: --i; break;
357                         }
358                         q = dpcell[j] + i;
359                         ctype = type;
360                         switch (type) {
361                         case FROM_M: type = q->Mt; break;
362                         case FROM_I: type = q->It; break;
363                         case FROM_D: type = q->Dt; break;
364                         }
365                         p->ctype = ctype; p->i = i; p->j = j;
366                         ++p;
367                 } while (i || j);
368                 cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
369                 free(path);
370         }
371
372         /* free memory */
373         for (j = b2 + 1; j <= len2; ++j)
374                 dpcell[j] += j - b2;
375         for (j = 0; j <= len2; ++j)
376                 free(dpcell[j]);
377         free(dpcell);
378         free(curr); free(last);
379
380         return cigar;
381 }