]> git.donarmstrong.com Git - samtools.git/blob - kaln.c
Unfinished modification. Please do not use this revision...
[samtools.git] / kaln.c
1 /* The MIT License
2
3    Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
4
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <stdint.h>
30 #include "kaln.h"
31
32 #define FROM_M 0
33 #define FROM_I 1
34 #define FROM_D 2
35
36 typedef struct {
37         int i, j;
38         unsigned char ctype;
39 } path_t;
40
41 int aln_sm_blosum62[] = {
42 /*       A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  *  X */
43          4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
44         -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
45         -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
46         -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
47          0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
48         -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
49         -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
50          0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
51         -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
52         -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
53         -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
54         -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
55         -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
56         -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
57         -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
58          1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
59          0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
60         -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
61         -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
62          0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
63         -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
64          0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
65 };
66
67 int aln_sm_blast[] = {
68         1, -3, -3, -3, -2,
69         -3, 1, -3, -3, -2,
70         -3, -3, 1, -3, -2,
71         -3, -3, -3, 1, -2,
72         -2, -2, -2, -2, -2
73 };
74
75 ka_param_t ka_param_blast = {  5,  2,  2, aln_sm_blast, 5, 50 };
76 ka_param_t ka_param_aa2aa = { 10,  2,  2, aln_sm_blosum62, 22, 50 };
77
78 static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
79 {
80         int i, n;
81         uint32_t *cigar;
82         unsigned char last_type;
83
84         if (path_len == 0 || path == 0) {
85                 *n_cigar = 0;
86                 return 0;
87         }
88
89         last_type = path->ctype;
90         for (i = n = 1; i < path_len; ++i) {
91                 if (last_type != path[i].ctype) ++n;
92                 last_type = path[i].ctype;
93         }
94         *n_cigar = n;
95         cigar = (uint32_t*)calloc(*n_cigar, 4);
96
97         cigar[0] = 1u << 4 | path[path_len-1].ctype;
98         last_type = path[path_len-1].ctype;
99         for (i = path_len - 2, n = 0; i >= 0; --i) {
100                 if (path[i].ctype == last_type) cigar[n] += 1u << 4;
101                 else {
102                         cigar[++n] = 1u << 4 | path[i].ctype;
103                         last_type = path[i].ctype;
104                 }
105         }
106
107         return cigar;
108 }
109
110 /***************************/
111 /* START OF common_align.c */
112 /***************************/
113
114 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
115
116 #define set_M(MM, cur, p, sc)                                                   \
117 {                                                                                                               \
118         if ((p)->M >= (p)->I) {                                                         \
119                 if ((p)->M >= (p)->D) {                                                 \
120                         (MM) = (p)->M + (sc); (cur)->Mt = FROM_M;       \
121                 } else {                                                                                \
122                         (MM) = (p)->D + (sc); (cur)->Mt = FROM_D;       \
123                 }                                                                                               \
124         } else {                                                                                        \
125                 if ((p)->I > (p)->D) {                                                  \
126                         (MM) = (p)->I + (sc); (cur)->Mt = FROM_I;       \
127                 } else {                                                                                \
128                         (MM) = (p)->D + (sc); (cur)->Mt = FROM_D;       \
129                 }                                                                                               \
130         }                                                                                                       \
131 }
132 #define set_I(II, cur, p)                                                               \
133 {                                                                                                               \
134         if ((p)->M - gap_open > (p)->I) {                                       \
135                 (cur)->It = FROM_M;                                                             \
136                 (II) = (p)->M - gap_open - gap_ext;                             \
137         } else {                                                                                        \
138                 (cur)->It = FROM_I;                                                             \
139                 (II) = (p)->I - gap_ext;                                                \
140         }                                                                                                       \
141 }
142 #define set_end_I(II, cur, p)                                                   \
143 {                                                                                                               \
144         if (gap_end >= 0) {                                                                     \
145                 if ((p)->M - gap_open > (p)->I) {                               \
146                         (cur)->It = FROM_M;                                                     \
147                         (II) = (p)->M - gap_open - gap_end;                     \
148                 } else {                                                                                \
149                         (cur)->It = FROM_I;                                                     \
150                         (II) = (p)->I - gap_end;                                        \
151                 }                                                                                               \
152         } else set_I(II, cur, p);                                                       \
153 }
154 #define set_D(DD, cur, p)                                                               \
155 {                                                                                                               \
156         if ((p)->M - gap_open > (p)->D) {                                       \
157                 (cur)->Dt = FROM_M;                                                             \
158                 (DD) = (p)->M - gap_open - gap_ext;                             \
159         } else {                                                                                        \
160                 (cur)->Dt = FROM_D;                                                             \
161                 (DD) = (p)->D - gap_ext;                                                \
162         }                                                                                                       \
163 }
164 #define set_end_D(DD, cur, p)                                                   \
165 {                                                                                                               \
166         if (gap_end >= 0) {                                                                     \
167                 if ((p)->M - gap_open > (p)->D) {                               \
168                         (cur)->Dt = FROM_M;                                                     \
169                         (DD) = (p)->M - gap_open - gap_end;                     \
170                 } else {                                                                                \
171                         (cur)->Dt = FROM_D;                                                     \
172                         (DD) = (p)->D - gap_end;                                        \
173                 }                                                                                               \
174         } else set_D(DD, cur, p);                                                       \
175 }
176
177 typedef struct {
178         uint8_t Mt:3, It:2, Dt:2;
179 } dpcell_t;
180
181 typedef struct {
182         int M, I, D;
183 } dpscore_t;
184
185 /* build score profile for accelerating alignment, in theory */
186 static void aln_init_score_array(uint8_t *seq, int len, int row, int *score_matrix, int **s_array)
187 {
188         int *tmp, *tmp2, i, k;
189         for (i = 0; i != row; ++i) {
190                 tmp = score_matrix + i * row;
191                 tmp2 = s_array[i];
192                 for (k = 0; k != len; ++k)
193                         tmp2[k] = tmp[seq[k]];
194         }
195 }
196 /***************************
197  * banded global alignment *
198  ***************************/
199 uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
200 {
201         int i, j;
202         dpcell_t **dpcell, *q;
203         dpscore_t *curr, *last, *s;
204         int b1, b2, tmp_end;
205         int *mat, end, max = 0;
206         uint8_t type, ctype;
207         uint32_t *cigar = 0;
208
209         int gap_open, gap_ext, gap_end, b;
210         int *score_matrix, N_MATRIX_ROW;
211
212         /* initialize some align-related parameters. just for compatibility */
213         gap_open = ap->gap_open;
214         gap_ext = ap->gap_ext;
215         gap_end = ap->gap_end;
216         b = ap->band_width;
217         score_matrix = ap->matrix;
218         N_MATRIX_ROW = ap->row;
219         
220         if (len1 == 0 || len2 == 0) return 0;
221
222         /* calculate b1 and b2 */
223         if (len1 > len2) {
224                 b1 = len1 - len2 + b;
225                 b2 = b;
226         } else {
227                 b1 = b;
228                 b2 = len2 - len1 + b;
229         }
230         if (b1 > len1) b1 = len1;
231         if (b2 > len2) b2 = len2;
232         --seq1; --seq2;
233
234         /* allocate memory */
235         end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
236         dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
237         for (j = 0; j <= len2; ++j)
238                 dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
239         for (j = b2 + 1; j <= len2; ++j)
240                 dpcell[j] -= j - b2;
241         curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
242         last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
243         
244         /* set first row */
245         SET_INF(*curr); curr->M = 0;
246         for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
247                 SET_INF(*s);
248                 set_end_D(s->D, dpcell[0] + i, s - 1);
249         }
250         s = curr; curr = last; last = s;
251
252         /* core dynamic programming, part 1 */
253         tmp_end = (b2 < len2)? b2 : len2 - 1;
254         for (j = 1; j <= tmp_end; ++j) {
255                 q = dpcell[j]; s = curr; SET_INF(*s);
256                 set_end_I(s->I, q, last);
257                 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
258                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
259                 ++s; ++q;
260                 for (i = 1; i != end; ++i, ++s, ++q) {
261                         set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
262                         set_I(s->I, q, last + i);
263                         set_D(s->D, q, s - 1);
264                 }
265                 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
266                 set_D(s->D, q, s - 1);
267                 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
268                         set_end_I(s->I, q, last + i);
269                 } else s->I = MINOR_INF;
270                 s = curr; curr = last; last = s;
271         }
272         /* last row for part 1, use set_end_D() instead of set_D() */
273         if (j == len2 && b2 != len2 - 1) {
274                 q = dpcell[j]; s = curr; SET_INF(*s);
275                 set_end_I(s->I, q, last);
276                 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
277                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
278                 ++s; ++q;
279                 for (i = 1; i != end; ++i, ++s, ++q) {
280                         set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
281                         set_I(s->I, q, last + i);
282                         set_end_D(s->D, q, s - 1);
283                 }
284                 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
285                 set_end_D(s->D, q, s - 1);
286                 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
287                         set_end_I(s->I, q, last + i);
288                 } else s->I = MINOR_INF;
289                 s = curr; curr = last; last = s;
290                 ++j;
291         }
292
293         /* core dynamic programming, part 2 */
294         for (; j <= len2 - b2 + 1; ++j) {
295                 SET_INF(curr[j - b2]);
296                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
297                 end = j + b1 - 1;
298                 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
299                         set_M(s->M, q, last + i - 1, mat[seq1[i]]);
300                         set_I(s->I, q, last + i);
301                         set_D(s->D, q, s - 1);
302                 }
303                 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
304                 set_D(s->D, q, s - 1);
305                 s->I = MINOR_INF;
306                 s = curr; curr = last; last = s;
307         }
308
309         /* core dynamic programming, part 3 */
310         for (; j < len2; ++j) {
311                 SET_INF(curr[j - b2]);
312                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
313                 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
314                         set_M(s->M, q, last + i - 1, mat[seq1[i]]);
315                         set_I(s->I, q, last + i);
316                         set_D(s->D, q, s - 1);
317                 }
318                 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
319                 set_end_I(s->I, q, last + i);
320                 set_D(s->D, q, s - 1);
321                 s = curr; curr = last; last = s;
322         }
323         /* last row */
324         if (j == len2) {
325                 SET_INF(curr[j - b2]);
326                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
327                 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
328                         set_M(s->M, q, last + i - 1, mat[seq1[i]]);
329                         set_I(s->I, q, last + i);
330                         set_end_D(s->D, q, s - 1);
331                 }
332                 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
333                 set_end_I(s->I, q, last + i);
334                 set_end_D(s->D, q, s - 1);
335                 s = curr; curr = last; last = s;
336         }
337
338         *_score = last[len1].M;
339         if (n_cigar) { /* backtrace */
340                 path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
341                 i = len1; j = len2;
342                 q = dpcell[j] + i;
343                 s = last + len1;
344                 max = s->M; type = q->Mt; ctype = FROM_M;
345                 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
346                 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
347
348                 p = path;
349                 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
350                 ++p;
351                 do {
352                         switch (ctype) {
353                         case FROM_M: --i; --j; break;
354                         case FROM_I: --j; break;
355                         case FROM_D: --i; break;
356                         }
357                         q = dpcell[j] + i;
358                         ctype = type;
359                         switch (type) {
360                         case FROM_M: type = q->Mt; break;
361                         case FROM_I: type = q->It; break;
362                         case FROM_D: type = q->Dt; break;
363                         }
364                         p->ctype = ctype; p->i = i; p->j = j;
365                         ++p;
366                 } while (i || j);
367                 cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
368                 free(path);
369         }
370
371         /* free memory */
372         for (j = b2 + 1; j <= len2; ++j)
373                 dpcell[j] += j - b2;
374         for (j = 0; j <= len2; ++j)
375                 free(dpcell[j]);
376         free(dpcell);
377         free(curr); free(last);
378
379         return cigar;
380 }