5 #include "bam_endian.h"
10 Alignment indexing. Before indexing, BAM must be sorted based on the
11 leftmost coordinate of alignments. In indexing, BAM uses two indices:
12 a UCSC binning index and a simple linear index. The binning index is
13 efficient for alignments spanning long distances, while the auxiliary
14 linear index helps to reduce unnecessary seek calls especially for
17 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
18 Stein and is explained by Kent et al. (2002). In this scheme, each bin
19 represents a contiguous genomic region which can be fully contained in
20 another bin; each alignment is associated with a bin which represents
21 the smallest region containing the entire alignment. The binning
22 scheme is essentially another representation of an R-tree. A distinct bin
23 uniquely corresponds to a distinct internal node in an R-tree. Bin A is
24 a child of Bin B if region A is contained in B.
26 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
27 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
28 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
29 find the alignments overlapped with a region [rbeg,rend), we need to
30 calculate the list of bins that may overlap the region and test
31 the alignments in the bins to confirm the overlaps. If the specified
32 region is short, typically only a few alignments in six bins need to
33 be retrieved. The overlapping alignments can be quickly fetched.
37 #define BAM_MIN_CHUNK_GAP 32768 // merge threshold used in the BAM_TRUE_OFFSET branches: chunks closer than this are coalesced
38 // 1<<14 is the size of minimum bin.
39 #define BAM_LIDX_SHIFT 14 // linear-index granularity: one slot per 1<<14 (16Kbp) window
45 #define pair64_lt(a,b) ((a).u < (b).u) // order chunks by start offset (.u)
46 KSORT_INIT(off, pair64_t, pair64_lt) // instantiates ks_introsort(off, ...) used in bam_fetch
58 KHASH_MAP_INIT_INT(i, bam_binlist_t) // binning index: hash from bin number to its chunk list
60 struct __bam_index_t {
66 // requirement: len <= LEN_MASK
67 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end) // append chunk [beg,end) to bin's chunk list, growing the list as needed
72 k = kh_put(i, h, bin, &ret); // insert-or-find the bin's slot; ret != 0 means newly inserted
74 if (ret) { // not present
76 l->list = (pair64_t*)calloc(l->m, 16); // first allocation for a new bin; 16 is presumably sizeof(pair64_t) — TODO confirm
80 l->list = (pair64_t*)realloc(l->list, l->m * 16); // NOTE(review): realloc result overwrites l->list directly — leaks on failure
82 l->list[l->n].u = beg; l->list[l->n++].v = end; // record the chunk's begin/end file offsets
85 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset) // linear index: record `offset` for each 16Kbp window the alignment spans
88 beg = b->core.pos >> BAM_LIDX_SHIFT; // first window touched by the alignment
89 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT; // last window; end coordinate computed from the CIGAR
90 if (index2->m < end + 1) { // grow the offset array to cover window `end`
91 int old_m = index2->m;
93 kroundup32(index2->m); // round capacity up to the next power of two
94 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); // NOTE(review): realloc overwrite leaks on failure; 8 == sizeof(uint64_t)
95 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); // zero new slots; 0 means "no offset recorded yet"
97 for (i = beg + 1; i <= end; ++i) // window `beg` itself is presumably handled on a line missing from this extract — TODO confirm
98 if (index2->offset[i] == 0) index2->offset[i] = offset; // keep the first-seen (smallest) offset per window
102 static void merge_chunks(bam_index_t *idx) // coalesce adjacent/near-adjacent chunks inside every bin's list to reduce later seeks
104 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
108 for (i = 0; i < idx->n; ++i) { // for each target sequence
109 index = idx->index[i];
110 for (k = kh_begin(index); k != kh_end(index); ++k) { // for each bin present in the hash
112 if (!kh_exist(index, k)) continue;
113 p = &kh_value(index, k);
115 for (l = 1; l < p->n; ++l) { // `m` (declared on a line missing from this extract) tracks the last kept chunk
116 #ifdef BAM_TRUE_OFFSET
117 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v; // gap small enough: extend previous chunk
119 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v; // same compressed block (high bits of virtual offset match): merge
121 else p->list[++m] = p->list[l]; // too far apart: keep as a separate chunk
126 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
129 bam_index_t *bam_index_core(bamFile fp) // build binning + linear indices with one pass over a coordinate-sorted BAM; caller owns the returned index
135 uint32_t last_bin, save_bin;
136 int32_t last_coor, last_tid, save_tid;
138 uint64_t save_off, last_off;
140 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
141 b = (bam1_t*)calloc(1, sizeof(bam1_t));
142 h = bam_header_read(fp); // header is only needed for the target count
145 idx->n = h->n_targets;
146 bam_header_destroy(h);
147 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
148 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i); // one binning hash per target
149 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t)); // one linear index per target
151 save_bin = save_tid = last_tid = last_bin = 0xffffffffu; // sentinel: no record seen yet
152 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu; // offsets start right after the header
153 while ((ret = bam_read1(fp, b)) >= 0) {
154 if (last_tid != c->tid) { // change of chromosomes
156 last_bin = 0xffffffffu;
157 } else if (last_coor > c->pos) { // enforce sortedness on the leftmost coordinate
158 fprintf(stderr, "[bam_index_core] the alignment is not sorted. Abort!\n");
161 if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off); // bins below 4681 are larger than 16Kbp; their records feed the linear index
162 if (c->bin != last_bin) { // then possibly write the binning index
163 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
164 insert_offset(idx->index[save_tid], save_bin, save_off, last_off); // flush the previous bin's chunk
166 save_bin = last_bin = c->bin;
168 if (save_tid < 0) break; // presumably unmapped reads at the end of file — TODO confirm against full source
170 if (bam_tell(fp) <= last_off) { // file offsets must strictly increase between records
171 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
172 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
175 last_off = bam_tell(fp);
176 last_coor = b->core.pos;
178 if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); // flush the final chunk after the loop
180 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret); // ret < -1 signals a truncated read
181 free(b->data); free(b);
185 void bam_index_destroy(bam_index_t *idx) // free every chunk list, every per-target hash, and both top-level arrays
189 if (idx == 0) return; // NULL-safe, like free()
190 for (i = 0; i < idx->n; ++i) {
191 khash_t(i) *index = idx->index[i];
192 bam_lidx_t *index2 = idx->index2 + i;
193 for (k = kh_begin(index); k != kh_end(index); ++k) { // free each bin's chunk list before destroying the hash
194 if (kh_exist(index, k))
195 free(kh_value(index, k).list);
197 kh_destroy(i, index);
198 free(index2->offset); // linear-index array
200 free(idx->index); free(idx->index2); // free(idx) itself presumably follows on a line missing from this extract
204 void bam_index_save(const bam_index_t *idx, FILE *fp) // serialize the index in BAI format; NOTE(review): fwrite return values are never checked
208 fwrite("BAI\1", 1, 4, fp); // 4-byte magic
211 fwrite(bam_swap_endian_4p(&x), 4, 1, fp); // big-endian host: write the byte-swapped copy
212 } else fwrite(&idx->n, 4, 1, fp); // number of targets
213 for (i = 0; i < idx->n; ++i) {
214 khash_t(i) *index = idx->index[i];
215 bam_lidx_t *index2 = idx->index2 + i;
216 // write binning index
217 size = kh_size(index); // number of distinct bins for this target
218 if (bam_is_be) { // big endian
220 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
221 } else fwrite(&size, 4, 1, fp);
222 for (k = kh_begin(index); k != kh_end(index); ++k) {
223 if (kh_exist(index, k)) {
224 bam_binlist_t *p = &kh_value(index, k);
225 if (bam_is_be) { // big endian
227 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp); // bin number
228 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp); // chunk count
229 for (x = 0; (int)x < p->n; ++x) { // swap chunks in place before writing...
230 bam_swap_endian_8p(&p->list[x].u);
231 bam_swap_endian_8p(&p->list[x].v);
233 fwrite(p->list, 16, p->n, fp);
234 for (x = 0; (int)x < p->n; ++x) { // ...then swap back so the in-memory index stays usable
235 bam_swap_endian_8p(&p->list[x].u);
236 bam_swap_endian_8p(&p->list[x].v);
239 fwrite(&kh_key(index, k), 4, 1, fp); // little-endian host: write fields directly
240 fwrite(&p->n, 4, 1, fp);
241 fwrite(p->list, 16, p->n, fp); // 16 bytes per chunk (two uint64_t offsets)
245 // write linear index (index2)
248 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
249 } else fwrite(&index2->n, 4, 1, fp); // number of 16Kbp windows
250 if (bam_is_be) { // big endian
252 for (x = 0; (int)x < index2->n; ++x) // same swap/write/swap-back dance as above
253 bam_swap_endian_8p(&index2->offset[x]);
254 fwrite(index2->offset, 8, index2->n, fp);
255 for (x = 0; (int)x < index2->n; ++x)
256 bam_swap_endian_8p(&index2->offset[x]);
257 } else fwrite(index2->offset, 8, index2->n, fp);
262 bam_index_t *bam_index_load(const char *fn) // read <fn>.bai into a freshly allocated bam_index_t; NOTE(review): fread return values are never checked
267 char *fnidx, magic[4];
269 fnidx = (char*)calloc(strlen(fn) + 5, 1); // +5: ".bai" plus NUL
270 strcpy(fnidx, fn); strcat(fnidx, ".bai");
271 if ((fp = fopen(fnidx, "r")) == 0) { // NOTE(review): binary file opened in text mode — "rb" would be safer on Windows
272 fprintf(stderr, "[bam_index_load] the alignment is not indexed. Please run `index' command first. Abort!\n");
277 fread(magic, 1, 4, fp);
278 if (strncmp(magic, "BAI\1", 4)) { // must match the magic written by bam_index_save
279 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
283 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
284 fread(&idx->n, 4, 1, fp); // number of targets
285 if (bam_is_be) bam_swap_endian_4p(&idx->n); // file is little-endian; swap on big-endian hosts
286 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
287 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
288 for (i = 0; i < idx->n; ++i) {
290 bam_lidx_t *index2 = idx->index2 + i;
295 index = idx->index[i] = kh_init(i);
296 // load binning index
297 fread(&size, 4, 1, fp); // number of distinct bins for this target
298 if (bam_is_be) bam_swap_endian_4p(&size);
299 for (j = 0; j < (int)size; ++j) {
300 fread(&key, 4, 1, fp); // bin number
301 if (bam_is_be) bam_swap_endian_4p(&key);
302 k = kh_put(i, index, key, &ret);
303 p = &kh_value(index, k);
304 fread(&p->n, 4, 1, fp); // chunk count for this bin
305 if (bam_is_be) bam_swap_endian_4p(&p->n);
307 p->list = (pair64_t*)malloc(p->m * 16); // 16 bytes per chunk, matching bam_index_save
308 fread(p->list, 16, p->n, fp);
311 for (x = 0; x < p->n; ++x) { // swap each chunk's offsets on big-endian hosts
312 bam_swap_endian_8p(&p->list[x].u);
313 bam_swap_endian_8p(&p->list[x].v);
318 fread(&index2->n, 4, 1, fp); // linear index: window count
319 if (bam_is_be) bam_swap_endian_4p(&index2->n);
320 index2->m = index2->n;
321 index2->offset = (uint64_t*)calloc(index2->m, 8);
322 fread(index2->offset, index2->n, 8, fp); // NOTE(review): size/nmemb swapped vs. every other call (same total bytes, but inconsistent and changes the return semantics)
324 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
330 int bam_index_build(const char *fn) // index BAM file `fn` and write the result to <fn>.bai
336 assert(fp = bam_open(fn, "r")); // NOTE(review): side effect inside assert — the open is skipped entirely under NDEBUG
337 idx = bam_index_core(fp);
339 fnidx = (char*)calloc(strlen(fn) + 5, 1); // +5: ".bai" plus NUL
340 strcpy(fnidx, fn); strcat(fnidx, ".bai");
341 assert(fpidx = fopen(fnidx, "w")); // NOTE(review): same side-effect-in-assert issue; also text mode rather than "wb"
342 bam_index_save(idx, fpidx);
343 bam_index_destroy(idx);
349 int bam_index(int argc, char *argv[]) // CLI entry point for `samtools index`
352 fprintf(stderr, "Usage: samtools index <in.bam>\n"); // printed when arguments are missing (guard condition not visible in this extract)
355 bam_index_build(argv[1]);
359 #define MAX_BIN 37450 // =(8^6-1)/7+1; total number of bins across all six levels
361 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN]) // fill `list` with every bin that could overlap [beg,end]; returns the count (return and bin-0 handling are on lines missing from this extract)
366 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; // level 1: 64Mbp bins (1-8)
367 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k; // level 2: 8Mbp bins (9-72)
368 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k; // level 3: 1Mbp bins (73-584)
369 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k; // level 4: 128Kbp bins (585-4680)
370 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k; // level 5: 16Kbp bins (4681-37449)
374 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) // does alignment b intersect the half-open interval [beg,end)?
376 uint32_t rbeg = b->core.pos;
377 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1; // no CIGAR: treat the record as 1bp long
378 return (rend > beg && rbeg < end); // standard half-open interval overlap test
381 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) // call func(b, data) for every alignment on target tid overlapping [beg,end)
384 int i, n_bins, n_off;
390 bins = (uint16_t*)calloc(MAX_BIN, 2); // 2 == sizeof(uint16_t)
391 n_bins = reg2bins(beg, end, bins); // candidate bins for the query region
392 index = idx->index[tid];
393 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT]; // linear index: lowest offset worth reading for this region
394 for (i = n_off = 0; i < n_bins; ++i) { // first pass: count chunks so `off` can be sized
395 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
396 n_off += kh_value(index, k).n;
399 free(bins); return 0; // early-out path (guard condition not visible in this extract), presumably when n_off == 0
401 off = (pair64_t*)calloc(n_off, 16); // 16 == sizeof(pair64_t)
402 for (i = n_off = 0; i < n_bins; ++i) { // second pass: collect chunks, dropping those entirely before min_off
403 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
405 bam_binlist_t *p = &kh_value(index, k);
406 for (j = 0; j < p->n; ++j)
407 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
415 b = (bam1_t*)calloc(1, sizeof(bam1_t));
416 ks_introsort(off, n_off, off); // sort chunks by start offset (pair64_lt)
417 // resolve completely contained adjacent blocks
418 for (i = 1, l = 0; i < n_off; ++i)
419 if (off[l].v < off[i].v)
422 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
423 for (i = 1; i < n_off; ++i)
424 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; // clip the previous chunk's end to the next chunk's start
425 { // merge adjacent blocks
426 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
427 for (i = 1, l = 0; i < n_off; ++i) {
428 #ifdef BAM_TRUE_OFFSET
429 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v; // close enough: extend the kept chunk
431 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; // same compressed block: merge
433 else off[++l] = off[i];
438 // retrieve alignments
439 n_seeks = 0; i = -1; curr_off = 0; // i indexes the current chunk; curr_off tracks the read position
441 if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
442 if (i == n_off - 1) break; // no more chunks
443 if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
444 if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
445 bam_seek(fp, off[i+1].u, SEEK_SET);
446 curr_off = bam_tell(fp);
451 if ((ret = bam_read1(fp, b)) > 0) {
452 curr_off = bam_tell(fp);
453 if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
454 else if (is_overlap(beg, end, b)) func(b, data); // only invoke the callback on true overlaps
455 } else break; // end of file
457 // fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);