6 #include "bam_endian.h"
12 Alignment indexing. Before indexing, BAM must be sorted based on the
13 leftmost coordinate of alignments. In indexing, BAM uses two indices:
14 a UCSC binning index and a simple linear index. The binning index is
15 efficient for alignments spanning long distances, while the auxiliary
16 linear index helps to reduce unnecessary seek calls especially for
19 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
20 Stein and is explained by Kent et al. (2002). In this scheme, each bin
21 represents a contiguous genomic region which can be fully contained in
22 another bin; each alignment is associated with a bin which represents
23 the smallest region containing the entire alignment. The binning
24 scheme is essentially another representation of an R-tree. A distinct bin
25 uniquely corresponds to a distinct internal node in an R-tree. Bin A is
26 a child of Bin B if region A is contained in B.
28 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
29 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
30 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
31 find the alignments overlapped with a region [rbeg,rend), we need to
32 calculate the list of bins that may overlap the region and test
33 the alignments in the bins to confirm the overlaps. If the specified
34 region is short, typically only a few alignments in six bins need to
35 be retrieved. The overlapping alignments can be quickly fetched.
// Constants and generated containers for the BAM (BAI) index.
// Chunks whose file offsets are closer than this are candidates for merging.
39 #define BAM_MIN_CHUNK_GAP 32768
40 // 1<<14 is the size of minimum bin.
41 #define BAM_LIDX_SHIFT 14
// Sort chunks (pair64_t) by their start file offset `u`.
47 #define pair64_lt(a,b) ((a).u < (b).u)
48 KSORT_INIT(off, pair64_t, pair64_lt)
// Hash map from bin number (int key) to its chunk list (bam_binlist_t).
60 KHASH_MAP_INIT_INT(i, bam_binlist_t)
// NOTE(review): the struct body is not visible in this chunk of the file.
62 struct __bam_index_t {
// Append the chunk [beg,end) (file offsets) to the chunk list of `bin` in
// hash `h`, creating the hash entry on first insertion. Each entry is a
// pair64_t {u=beg, v=end}; the literal 16 is sizeof(pair64_t).
// NOTE(review): several lines of this function are elided in this view.
68 // requirement: len <= LEN_MASK
69 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
74 k = kh_put(i, h, bin, &ret;);
// ret != 0 means the bin key was newly inserted: allocate its initial list.
76 if (ret) { // not present
78 l->list = (pair64_t*)calloc(l->m, 16);
// Grow path: list is full; presumably l->m was enlarged on an elided line —
// confirm before relying on this.
82 l->list = (pair64_t*)realloc(l->list, l->m * 16);
// Record the chunk and bump the element count.
84 l->list[l->n].u = beg; l->list[l->n++].v = end;
// Linear index update: for the 2^BAM_LIDX_SHIFT (16kb) windows spanned by
// alignment `b`, record `offset` (the alignment's file offset) in any window
// slot that is still zero. The offset array is grown (and new slots zeroed)
// when the alignment ends beyond the current capacity.
87 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
90 beg = b->core.pos >> BAM_LIDX_SHIFT;
// Last window touched; bam_calend computes the rightmost coordinate + 1.
91 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
92 if (index2->m < end + 1) {
93 int old_m = index2->m;
// Round capacity up to the next power of two, then zero the new tail.
95 kroundup32(index2->m);
96 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
97 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
// Fill windows after `beg`; the window at `beg` itself is presumably handled
// on an elided line — confirm against the full source.
99 for (i = beg + 1; i <= end; ++i)
100 if (index2->offset[i] == 0) index2->offset[i] = offset;
// Post-processing pass over the binning index: within each bin, merge
// chunks whose file offsets are adjacent or close, shrinking the index.
// In BAM_TRUE_OFFSET mode, chunks separated by less than BAM_MIN_CHUNK_GAP
// bytes are merged; in virtual-offset mode, chunks in the same BGZF block
// (equal offsets after >>16) are merged.
// NOTE(review): loop setup lines are elided; `m` appears to track the last
// kept chunk and presumably starts at 0 — confirm.
104 static void merge_chunks(bam_index_t *idx)
106 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
110 for (i = 0; i < idx->n; ++i) {
111 index = idx->index[i];
112 for (k = kh_begin(index); k != kh_end(index); ++k) {
114 if (!kh_exist(index, k)) continue;
115 p = &kh_value(index, k);
117 for (l = 1; l < p->n; ++l) {
118 #ifdef BAM_TRUE_OFFSET
// Extend the kept chunk when the next one starts within the gap threshold.
119 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
// Virtual-offset variant: merge when both ends fall in the same BGZF block.
121 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
// Otherwise keep chunk l as the next surviving entry.
123 else p->list[++m] = p->list[l];
128 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
// Build a bam_index_t from a coordinate-sorted BAM stream: one binning hash
// (idx->index[tid]) and one linear index (idx->index2[tid]) per target.
// The header is read from fp and destroyed; unsorted input and non-monotonic
// file offsets are reported to stderr. Returns the populated index
// (return statement elided in this view).
131 bam_index_t *bam_index_core(bamFile fp)
137 uint32_t last_bin, save_bin;
138 int32_t last_coor, last_tid, save_tid;
140 uint64_t save_off, last_off;
142 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
143 b = (bam1_t*)calloc(1, sizeof(bam1_t));
144 h = bam_header_read(fp);
147 idx->n = h->n_targets;
148 bam_header_destroy(h);
149 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
150 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
151 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
// 0xffffffffu is the "unset" sentinel for bins/tids before the first record.
153 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
154 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
// Main loop: one iteration per alignment record.
// NOTE(review): `c` presumably aliases &b->core (declaration elided) — confirm.
155 while ((ret = bam_read1(fp, b)) >= 0) {
156 if (last_tid != c->tid) { // change of chromosomes
158 last_bin = 0xffffffffu;
// Coordinate went backwards on the same target: input is not sorted.
159 } else if (last_coor > c->pos) {
160 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
161 bam1_qname(b), last_coor, c->pos, c->tid+1);
// Only small bins (< 4681, i.e. below the 16kb level) feed the linear index.
164 if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
165 if (c->bin != last_bin) { // then possibly write the binning index
166 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
167 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
169 save_bin = last_bin = c->bin;
// Unmapped records with tid < 0 terminate indexing (they sort last).
171 if (save_tid < 0) break;
// File offset must strictly advance after each record; anything else is a
// BGZF/RAZF bug.
173 if (bam_tell(fp) <= last_off) {
174 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
175 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
178 last_off = bam_tell(fp);
179 last_coor = b->core.pos;
// Flush the final pending bin after the loop ends.
181 if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
183 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
184 free(b->data); free(b);
// Free all memory owned by idx: every bin's chunk list, each per-target
// hash, each linear-index offset array, then the top-level arrays.
// Safe to call with idx == NULL. The final free(idx) is presumably on an
// elided line — confirm.
188 void bam_index_destroy(bam_index_t *idx)
192 if (idx == 0) return;
193 for (i = 0; i < idx->n; ++i) {
194 khash_t(i) *index = idx->index[i];
195 bam_lidx_t *index2 = idx->index2 + i;
// Free each existing bin's chunk list before destroying the hash itself.
196 for (k = kh_begin(index); k != kh_end(index); ++k) {
197 if (kh_exist(index, k))
198 free(kh_value(index, k).list);
200 kh_destroy(i, index);
201 free(index2->offset);
203 free(idx->index); free(idx->index2);
// Serialize idx to fp in BAI format: magic "BAI\1", n_ref, then for each
// target the binning index (n_bin; per bin: key, n_chunk, chunk pairs)
// followed by the linear index (n_offset, 64-bit offsets). On big-endian
// hosts every integer is byte-swapped before writing; 8-byte array entries
// are swapped in place and swapped back afterwards so idx is left unchanged.
207 void bam_index_save(const bam_index_t *idx, FILE *fp)
211 fwrite("BAI\1", 1, 4, fp);
// Big-endian path: write a swapped copy of idx->n (copy into x elided).
214 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
215 } else fwrite(&idx->n, 4, 1, fp);
216 for (i = 0; i < idx->n; ++i) {
217 khash_t(i) *index = idx->index[i];
218 bam_lidx_t *index2 = idx->index2 + i;
219 // write binning index
220 size = kh_size(index);
221 if (bam_is_be) { // big endian
223 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
224 } else fwrite(&size, 4, 1, fp);
225 for (k = kh_begin(index); k != kh_end(index); ++k) {
226 if (kh_exist(index, k)) {
227 bam_binlist_t *p = &kh_value(index, k);
228 if (bam_is_be) { // big endian
230 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
231 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
// Swap the chunk pairs in place, write them, then swap back to restore idx.
232 for (x = 0; (int)x < p->n; ++x) {
233 bam_swap_endian_8p(&p->list[x].u);
234 bam_swap_endian_8p(&p->list[x].v);
236 fwrite(p->list, 16, p->n, fp);
237 for (x = 0; (int)x < p->n; ++x) {
238 bam_swap_endian_8p(&p->list[x].u);
239 bam_swap_endian_8p(&p->list[x].v);
// Little-endian path: write key, count and chunk list directly.
242 fwrite(&kh_key(index, k), 4, 1, fp);
243 fwrite(&p->n, 4, 1, fp);
244 fwrite(p->list, 16, p->n, fp);
248 // write linear index (index2)
251 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
252 } else fwrite(&index2->n, 4, 1, fp);
// Same swap/write/swap-back dance for the linear-index offsets.
253 if (bam_is_be) { // big endian
255 for (x = 0; (int)x < index2->n; ++x)
256 bam_swap_endian_8p(&index2->offset[x]);
257 fwrite(index2->offset, 8, index2->n, fp);
258 for (x = 0; (int)x < index2->n; ++x)
259 bam_swap_endian_8p(&index2->offset[x]);
260 } else fwrite(index2->offset, 8, index2->n, fp);
// Read a BAI index from fp — the inverse of bam_index_save. Validates the
// "BAI\1" magic, then reads n_ref and, per target, the binning and linear
// indices, byte-swapping on big-endian hosts. Returns the new index;
// failure paths (and their returns) are partially elided in this view.
265 static bam_index_t *bam_index_load_core(FILE *fp)
// fp == NULL (or similar) error path — condition elided in this view.
271 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
274 fread(magic, 1, 4, fp);
275 if (strncmp(magic, "BAI\1", 4)) {
276 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
280 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
281 fread(&idx->n, 4, 1, fp);
282 if (bam_is_be) bam_swap_endian_4p(&idx->n);
283 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
284 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
285 for (i = 0; i < idx->n; ++i) {
287 bam_lidx_t *index2 = idx->index2 + i;
292 index = idx->index[i] = kh_init(i);
293 // load binning index
294 fread(&size, 4, 1, fp);
295 if (bam_is_be) bam_swap_endian_4p(&size);
296 for (j = 0; j < (int)size; ++j) {
297 fread(&key, 4, 1, fp);
298 if (bam_is_be) bam_swap_endian_4p(&key);
299 k = kh_put(i, index, key, &ret;);
300 p = &kh_value(index, k);
301 fread(&p->n, 4, 1, fp);
302 if (bam_is_be) bam_swap_endian_4p(&p->n);
// p->m is presumably set from p->n on an elided line — confirm.
304 p->list = (pair64_t*)malloc(p->m * 16);
305 fread(p->list, 16, p->n, fp);
// Big-endian: fix the byte order of every loaded chunk pair in place.
308 for (x = 0; x < p->n; ++x) {
309 bam_swap_endian_8p(&p->list[x].u);
310 bam_swap_endian_8p(&p->list[x].v);
// Load the linear index for this target.
315 fread(&index2->n, 4, 1, fp);
316 if (bam_is_be) bam_swap_endian_4p(&index2->n);
317 index2->m = index2->n;
318 index2->offset = (uint64_t*)calloc(index2->m, 8);
// NOTE(review): size/nmemb order here (n, 8) differs from the save path
// (8, n); the byte count read is the same, but confirm against the writer.
319 fread(index2->offset, index2->n, 8, fp);
321 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
// Open and parse a local BAI file for _fn: tries "<fn>.bai" first, then
// "<base>.bai" for names ending in "bam". For ftp:// URLs only the basename
// is used, so the index is looked up in the current working directory.
// Returns the parsed index (return/cleanup paths partially elided).
326 bam_index_t *bam_index_load_local(const char *_fn)
// ftp:// URL: scan backwards for the last '/' to isolate the basename.
331 if (strstr(_fn, "ftp://") == _fn) {
334 for (p = _fn + l - 1; p >= _fn; --p)
335 if (*p == '/') break;
337 } else fn = strdup(_fn);
// Primary candidate: "<fn>.bai" (+5 = ".bai" plus NUL).
338 fnidx = (char*)calloc(strlen(fn) + 5, 1);
339 strcpy(fnidx, fn); strcat(fnidx, ".bai");
340 fp = fopen(fnidx, "r");
341 if (fp == 0) { // try "{base}.bai"
// Only applies when fn literally ends in "bam".
342 char *s = strstr(fn, "bam");
343 if (s == fn + strlen(fn) - 3) {
// Presumably fn is re-copied into fnidx on an elided line, so patching the
// final 'm' to 'i' yields "<base>.bai" — confirm.
345 fnidx[strlen(fn)-1] = 'i';
346 fp = fopen(fnidx, "r");
349 free(fnidx); free(fn);
351 bam_index_t *idx = bam_index_load_core(fp);
// Best-effort download of an ftp:// URL into the current working directory,
// saved under the URL's basename, streamed through a 1 MB buffer via knet.
// Non-ftp URLs are silently ignored; errors go to stderr. No status is
// returned (void).
357 static void download_from_remote(const char *url)
359 const int buf_size = 1 * 1024 * 1024;
365 if (strstr(url, "ftp://") != url) return;
// Scan backwards for the last '/' so fn ends up at the basename.
367 for (fn = (char*)url + l - 1; fn >= url; --fn)
368 if (*fn == '/') break;
369 ++fn; // fn now points to the file name
370 fp_remote = knet_open(url, "r");
371 if (fp_remote == 0) {
372 fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
375 if ((fp = fopen(fn, "w")) == 0) {
376 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
377 knet_close(fp_remote);
// Copy loop: read up to buf_size bytes at a time until knet_read returns 0.
380 buf = (uint8_t*)calloc(buf_size, 1);
381 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
382 fwrite(buf, 1, l, fp);
385 knet_close(fp_remote);
// Load the BAI index for fn. Tries locally first; for ftp:// URLs, falls
// back to downloading "<fn>.bai" into the working directory and retrying.
// Emits a message and returns NULL (return elided here) when no index can
// be loaded.
388 bam_index_t *bam_index_load(const char *fn)
391 idx = bam_index_load_local(fn);
392 if (idx == 0 && strstr(fn, "ftp://") == fn) {
393 char *fnidx = calloc(strlen(fn) + 5, 1);
394 strcat(strcpy(fnidx, fn), ".bai");
395 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
396 download_from_remote(fnidx);
// Retry: download_from_remote saved the index under its basename, which
// bam_index_load_local looks up for ftp URLs.
// NOTE(review): the free(fnidx) is not visible in this view — confirm it
// is not leaked.
397 idx = bam_index_load_local(fn);
399 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
// Build the index for BAM file fn and write it to _fnidx, or to "<fn>.bai"
// when _fnidx is NULL. Returns an int status; the return statements and
// some cleanup are elided in this view.
403 int bam_index_build2(const char *fn, const char *_fnidx)
409 if ((fp = bam_open(fn, "r")) == 0) {
410 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
413 idx = bam_index_core(fp);
// Default output name: "<fn>.bai" (+5 = ".bai" plus NUL).
416 fnidx = (char*)calloc(strlen(fn) + 5, 1);
417 strcpy(fnidx, fn); strcat(fnidx, ".bai");
418 } else fnidx = strdup(_fnidx);
419 fpidx = fopen(fnidx, "w");
421 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
425 bam_index_save(idx, fpidx);
426 bam_index_destroy(idx);
// Convenience wrapper: index fn, writing to the default "<fn>.bai".
432 int bam_index_build(const char *fn)
434 return bam_index_build2(fn, 0);
// CLI entry point for `samtools index`: argv[1] is the BAM file, optional
// argv[2] the output index path. Usage is printed when arguments are
// missing (condition elided in this view).
437 int bam_index(int argc, char *argv[])
440 fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
443 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
444 else bam_index_build(argv[1]);
// Fill `list` with every bin number that may contain alignments overlapping
// [beg,end), one loop per bin level (64Mb, 8Mb, 1Mb, 128kb, 16kb windows).
// The counter initialization, the inclusion of bin 0, and the return of the
// bin count are on lines elided from this view — confirm against the full
// source.
448 #define MAX_BIN 37450 // =(8^6-1)/7+1
450 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
455 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
456 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
457 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
458 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
459 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
463 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
465 uint32_t rbeg = b->core.pos;
466 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
467 return (rend > beg && rbeg < end);
// Visit every alignment overlapping [beg,end) on target tid, calling
// func(b, data) for each. Steps: (1) candidate bins via reg2bins;
// (2) count then collect chunks, discarding those ending at or before the
// linear-index lower bound min_off; (3) sort chunks, drop fully-contained
// ones, trim overlaps, and merge adjacent chunks; (4) seek through the
// surviving chunks reading records, stopping at the first record on another
// target or starting at/after `end`.
470 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
473 int i, n_bins, n_off;
479 bins = (uint16_t*)calloc(MAX_BIN, 2);
480 n_bins = reg2bins(beg, end, bins);
481 index = idx->index[tid];
// Linear-index lower bound: offset of the first alignment in beg's 16kb
// window; 0 (no bound) if the window is past the end of the linear index.
482 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
// First pass: count candidate chunks so `off` can be sized exactly.
483 for (i = n_off = 0; i < n_bins; ++i) {
484 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
485 n_off += kh_value(index, k).n;
// No candidate chunks at all: nothing to fetch.
488 free(bins); return 0;
490 off = (pair64_t*)calloc(n_off, 16);
// Second pass: collect chunks that end after min_off.
491 for (i = n_off = 0; i < n_bins; ++i) {
492 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
494 bam_binlist_t *p = &kh_value(index, k);
495 for (j = 0; j < p->n; ++j)
496 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
504 b = (bam1_t*)calloc(1, sizeof(bam1_t));
505 ks_introsort(off, n_off, off);
506 // resolve completely contained adjacent blocks
507 for (i = 1, l = 0; i < n_off; ++i)
508 if (off[l].v < off[i].v)
511 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
512 for (i = 1; i < n_off; ++i)
513 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
514 { // merge adjacent blocks
515 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
516 for (i = 1, l = 0; i < n_off; ++i) {
517 #ifdef BAM_TRUE_OFFSET
518 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
// Virtual-offset variant: merge chunks meeting within the same BGZF block.
520 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
522 else off[++l] = off[i];
527 // retrieve alignments
528 n_seeks = 0; i = -1; curr_off = 0;
// Chunk walk: advance to the next chunk whenever the current one is done.
530 if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
531 if (i == n_off - 1) break; // no more chunks
532 if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
533 if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
534 bam_seek(fp, off[i+1].u, SEEK_SET);
535 curr_off = bam_tell(fp);
540 if ((ret = bam_read1(fp, b)) > 0) {
541 curr_off = bam_tell(fp);
// Records are coordinate-sorted, so passing `end` (or the target) means
// no later record can overlap the query.
542 if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
543 else if (is_overlap(beg, end, b)) func(b, data);
544 } else break; // end of file
546 // fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);