6 #include "bam_endian.h"
12 Alignment indexing. Before indexing, BAM must be sorted based on the
13 leftmost coordinate of alignments. In indexing, BAM uses two indices:
14 a UCSC binning index and a simple linear index. The binning index is
15 efficient for alignments spanning long distances, while the auxiliary
16 linear index helps to reduce unnecessary seek calls especially for
19 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
20 Stein and is explained by Kent et al. (2002). In this scheme, each bin
21 represents a contiguous genomic region which can be fully contained in
22 another bin; each alignment is associated with a bin which represents
23 the smallest region containing the entire alignment. The binning
24 scheme is essentially another representation of an R-tree. A distinct bin
25 uniquely corresponds to a distinct internal node in an R-tree. Bin A is
26 a child of Bin B if region A is contained in B.
28 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
29 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
30 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
31 find the alignments overlapping a region [rbeg,rend), we need to
32 calculate the list of bins that may overlap the region and test
33 the alignments in the bins to confirm the overlaps. If the specified
34 region is short, typically only a few alignments in six bins need to
35 be retrieved. The overlapping alignments can be quickly fetched.
39 #define BAM_MIN_CHUNK_GAP 32768
40 // 1<<14 is the size of minimum bin.
41 #define BAM_LIDX_SHIFT 14
47 #define pair64_lt(a,b) ((a).u < (b).u)
48 KSORT_INIT(off, pair64_t, pair64_lt)
60 KHASH_MAP_INIT_INT(i, bam_binlist_t)
62 struct __bam_index_t {
68 // requirement: len <= LEN_MASK
69 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
74 k = kh_put(i, h, bin, &ret);
76 if (ret) { // not present
78 l->list = (pair64_t*)calloc(l->m, 16);
82 l->list = (pair64_t*)realloc(l->list, l->m * 16);
84 l->list[l->n].u = beg; l->list[l->n++].v = end;
87 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
90 beg = b->core.pos >> BAM_LIDX_SHIFT;
91 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
92 if (index2->m < end + 1) {
93 int old_m = index2->m;
95 kroundup32(index2->m);
96 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
97 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
99 for (i = beg + 1; i <= end; ++i)
100 if (index2->offset[i] == 0) index2->offset[i] = offset;
104 static void merge_chunks(bam_index_t *idx)
106 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
110 for (i = 0; i < idx->n; ++i) {
111 index = idx->index[i];
112 for (k = kh_begin(index); k != kh_end(index); ++k) {
114 if (!kh_exist(index, k)) continue;
115 p = &kh_value(index, k);
117 for (l = 1; l < p->n; ++l) {
118 #ifdef BAM_TRUE_OFFSET
119 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
121 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
123 else p->list[++m] = p->list[l];
128 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
131 bam_index_t *bam_index_core(bamFile fp)
137 uint32_t last_bin, save_bin;
138 int32_t last_coor, last_tid, save_tid;
140 uint64_t save_off, last_off;
142 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
143 b = (bam1_t*)calloc(1, sizeof(bam1_t));
144 h = bam_header_read(fp);
147 idx->n = h->n_targets;
148 bam_header_destroy(h);
149 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
150 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
151 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
153 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
154 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
155 while ((ret = bam_read1(fp, b)) >= 0) {
156 if (last_tid != c->tid) { // change of chromosomes
158 last_bin = 0xffffffffu;
159 } else if (last_coor > c->pos) {
160 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
161 bam1_qname(b), last_coor, c->pos, c->tid+1);
164 if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
165 if (c->bin != last_bin) { // then possibly write the binning index
166 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
167 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
169 save_bin = last_bin = c->bin;
171 if (save_tid < 0) break;
173 if (bam_tell(fp) <= last_off) {
174 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
175 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
178 last_off = bam_tell(fp);
179 last_coor = b->core.pos;
181 if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
183 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
184 free(b->data); free(b);
188 void bam_index_destroy(bam_index_t *idx)
192 if (idx == 0) return;
193 for (i = 0; i < idx->n; ++i) {
194 khash_t(i) *index = idx->index[i];
195 bam_lidx_t *index2 = idx->index2 + i;
196 for (k = kh_begin(index); k != kh_end(index); ++k) {
197 if (kh_exist(index, k))
198 free(kh_value(index, k).list);
200 kh_destroy(i, index);
201 free(index2->offset);
203 free(idx->index); free(idx->index2);
207 void bam_index_save(const bam_index_t *idx, FILE *fp)
211 fwrite("BAI\1", 1, 4, fp);
214 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
215 } else fwrite(&idx->n, 4, 1, fp);
216 for (i = 0; i < idx->n; ++i) {
217 khash_t(i) *index = idx->index[i];
218 bam_lidx_t *index2 = idx->index2 + i;
219 // write binning index
220 size = kh_size(index);
221 if (bam_is_be) { // big endian
223 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
224 } else fwrite(&size, 4, 1, fp);
225 for (k = kh_begin(index); k != kh_end(index); ++k) {
226 if (kh_exist(index, k)) {
227 bam_binlist_t *p = &kh_value(index, k);
228 if (bam_is_be) { // big endian
230 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
231 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
232 for (x = 0; (int)x < p->n; ++x) {
233 bam_swap_endian_8p(&p->list[x].u);
234 bam_swap_endian_8p(&p->list[x].v);
236 fwrite(p->list, 16, p->n, fp);
237 for (x = 0; (int)x < p->n; ++x) {
238 bam_swap_endian_8p(&p->list[x].u);
239 bam_swap_endian_8p(&p->list[x].v);
242 fwrite(&kh_key(index, k), 4, 1, fp);
243 fwrite(&p->n, 4, 1, fp);
244 fwrite(p->list, 16, p->n, fp);
248 // write linear index (index2)
251 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
252 } else fwrite(&index2->n, 4, 1, fp);
253 if (bam_is_be) { // big endian
255 for (x = 0; (int)x < index2->n; ++x)
256 bam_swap_endian_8p(&index2->offset[x]);
257 fwrite(index2->offset, 8, index2->n, fp);
258 for (x = 0; (int)x < index2->n; ++x)
259 bam_swap_endian_8p(&index2->offset[x]);
260 } else fwrite(index2->offset, 8, index2->n, fp);
265 static bam_index_t *bam_index_load_core(FILE *fp)
271 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
274 fread(magic, 1, 4, fp);
275 if (strncmp(magic, "BAI\1", 4)) {
276 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
280 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
281 fread(&idx->n, 4, 1, fp);
282 if (bam_is_be) bam_swap_endian_4p(&idx->n);
283 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
284 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
285 for (i = 0; i < idx->n; ++i) {
287 bam_lidx_t *index2 = idx->index2 + i;
292 index = idx->index[i] = kh_init(i);
293 // load binning index
294 fread(&size, 4, 1, fp);
295 if (bam_is_be) bam_swap_endian_4p(&size);
296 for (j = 0; j < (int)size; ++j) {
297 fread(&key, 4, 1, fp);
298 if (bam_is_be) bam_swap_endian_4p(&key);
299 k = kh_put(i, index, key, &ret);
300 p = &kh_value(index, k);
301 fread(&p->n, 4, 1, fp);
302 if (bam_is_be) bam_swap_endian_4p(&p->n);
304 p->list = (pair64_t*)malloc(p->m * 16);
305 fread(p->list, 16, p->n, fp);
308 for (x = 0; x < p->n; ++x) {
309 bam_swap_endian_8p(&p->list[x].u);
310 bam_swap_endian_8p(&p->list[x].v);
315 fread(&index2->n, 4, 1, fp);
316 if (bam_is_be) bam_swap_endian_4p(&index2->n);
317 index2->m = index2->n;
318 index2->offset = (uint64_t*)calloc(index2->m, 8);
319 fread(index2->offset, index2->n, 8, fp);
321 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
326 bam_index_t *bam_index_load_local(const char *_fn)
331 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
334 for (p = _fn + l - 1; p >= _fn; --p)
335 if (*p == '/') break;
337 } else fn = strdup(_fn);
338 fnidx = (char*)calloc(strlen(fn) + 5, 1);
339 strcpy(fnidx, fn); strcat(fnidx, ".bai");
340 fp = fopen(fnidx, "r");
341 if (fp == 0) { // try "{base}.bai"
342 char *s = strstr(fn, "bam");
343 if (s == fn + strlen(fn) - 3) {
345 fnidx[strlen(fn)-1] = 'i';
346 fp = fopen(fnidx, "r");
349 free(fnidx); free(fn);
351 bam_index_t *idx = bam_index_load_core(fp);
358 static void download_from_remote(const char *url)
360 const int buf_size = 1 * 1024 * 1024;
366 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
368 for (fn = (char*)url + l - 1; fn >= url; --fn)
369 if (*fn == '/') break;
370 ++fn; // fn now points to the file name
371 fp_remote = knet_open(url, "r");
372 if (fp_remote == 0) {
373 fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
376 if ((fp = fopen(fn, "w")) == 0) {
377 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
378 knet_close(fp_remote);
381 buf = (uint8_t*)calloc(buf_size, 1);
382 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
383 fwrite(buf, 1, l, fp);
386 knet_close(fp_remote);
389 static void download_from_remote(const char *url)
395 bam_index_t *bam_index_load(const char *fn)
398 idx = bam_index_load_local(fn);
399 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
400 char *fnidx = calloc(strlen(fn) + 5, 1);
401 strcat(strcpy(fnidx, fn), ".bai");
402 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
403 download_from_remote(fnidx);
404 idx = bam_index_load_local(fn);
406 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
410 int bam_index_build2(const char *fn, const char *_fnidx)
416 if ((fp = bam_open(fn, "r")) == 0) {
417 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
420 idx = bam_index_core(fp);
423 fnidx = (char*)calloc(strlen(fn) + 5, 1);
424 strcpy(fnidx, fn); strcat(fnidx, ".bai");
425 } else fnidx = strdup(_fnidx);
426 fpidx = fopen(fnidx, "w");
428 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
432 bam_index_save(idx, fpidx);
433 bam_index_destroy(idx);
439 int bam_index_build(const char *fn)
441 return bam_index_build2(fn, 0);
444 int bam_index(int argc, char *argv[])
447 fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
450 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
451 else bam_index_build(argv[1]);
455 #define MAX_BIN 37450 // =(8^6-1)/7+1
457 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
462 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
463 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
464 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
465 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
466 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
470 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
472 uint32_t rbeg = b->core.pos;
473 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
474 return (rend > beg && rbeg < end);
477 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
480 int i, n_bins, n_off;
486 bins = (uint16_t*)calloc(MAX_BIN, 2);
487 n_bins = reg2bins(beg, end, bins);
488 index = idx->index[tid];
489 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
490 for (i = n_off = 0; i < n_bins; ++i) {
491 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
492 n_off += kh_value(index, k).n;
495 free(bins); return 0;
497 off = (pair64_t*)calloc(n_off, 16);
498 for (i = n_off = 0; i < n_bins; ++i) {
499 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
501 bam_binlist_t *p = &kh_value(index, k);
502 for (j = 0; j < p->n; ++j)
503 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
511 b = (bam1_t*)calloc(1, sizeof(bam1_t));
512 ks_introsort(off, n_off, off);
513 // resolve completely contained adjacent blocks
514 for (i = 1, l = 0; i < n_off; ++i)
515 if (off[l].v < off[i].v)
518 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
519 for (i = 1; i < n_off; ++i)
520 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
521 { // merge adjacent blocks
522 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
523 for (i = 1, l = 0; i < n_off; ++i) {
524 #ifdef BAM_TRUE_OFFSET
525 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
527 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
529 else off[++l] = off[i];
534 // retrive alignments
535 n_seeks = 0; i = -1; curr_off = 0;
537 if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
538 if (i == n_off - 1) break; // no more chunks
539 if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
540 if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
541 bam_seek(fp, off[i+1].u, SEEK_SET);
542 curr_off = bam_tell(fp);
547 if ((ret = bam_read1(fp, b)) > 0) {
548 curr_off = bam_tell(fp);
549 if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
550 else if (is_overlap(beg, end, b)) func(b, data);
551 } else break; // end of file
553 // fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);