X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bam_index.c;h=6fffd75e1ed5c58c363847abd42fed04e638c5b0;hb=a958954399757774ee26bfcf0b5b95e9ec9b62f4;hp=725d053b7f44a667a178aff67473e0c6376e1444;hpb=0a5854b381b30670ba749b8575bc85548a10578d;p=samtools.git diff --git a/bam_index.c b/bam_index.c index 725d053..6fffd75 100644 --- a/bam_index.c +++ b/bam_index.c @@ -4,7 +4,9 @@ #include "khash.h" #include "ksort.h" #include "bam_endian.h" +#ifdef _USE_KNETFILE #include "knetfile.h" +#endif /*! @header @@ -96,8 +98,12 @@ static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); } - for (i = beg + 1; i <= end; ++i) - if (index2->offset[i] == 0) index2->offset[i] = offset; + if (beg == end) { + if (index2->offset[beg] == 0) index2->offset[beg] = offset; + } else { + for (i = beg; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + } index2->n = end + 1; } @@ -128,6 +134,17 @@ static void merge_chunks(bam_index_t *idx) #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF) } +static void fill_missing(bam_index_t *idx) +{ + int i, j; + for (i = 0; i < idx->n; ++i) { + bam_lidx_t *idx2 = &idx->index2[i]; + for (j = 1; j < idx2->n; ++j) + if (idx2->offset[j] == 0) + idx2->offset[j] = idx2->offset[j-1]; + } +} + bam_index_t *bam_index_core(bamFile fp) { bam1_t *b; @@ -161,7 +178,7 @@ bam_index_t *bam_index_core(bamFile fp) bam1_qname(b), last_coor, c->pos, c->tid+1); exit(1); } - if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off); + insert_offset2(&idx->index2[b->core.tid], b, last_off); if (c->bin != last_bin) { // then possibly write the binning index if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record insert_offset(idx->index[save_tid], save_bin, save_off, last_off); @@ -180,6 +197,7 @@ bam_index_t *bam_index_core(bamFile fp) } if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); merge_chunks(idx); + fill_missing(idx); if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret); free(b->data); free(b); return idx; @@ -337,13 +355,13 @@ bam_index_t *bam_index_load_local(const char *_fn) } else fn = strdup(_fn); fnidx = (char*)calloc(strlen(fn) + 5, 1); strcpy(fnidx, fn); strcat(fnidx, ".bai"); - fp = fopen(fnidx, "r"); + fp = fopen(fnidx, "rb"); if (fp == 0) { // try "{base}.bai" char *s = strstr(fn, "bam"); if (s == fn + strlen(fn) - 3) { strcpy(fnidx, fn); fnidx[strlen(fn)-1] = 'i'; - fp = fopen(fnidx, "r"); + fp = fopen(fnidx, "rb"); } } free(fnidx); free(fn); @@ -354,6 +372,7 @@ bam_index_t *bam_index_load_local(const char *_fn) } else return 0; } +#ifdef _USE_KNETFILE static void download_from_remote(const char *url) { const int buf_size = 1 * 1024 * 1024; @@ -372,7 +391,7 @@ static void download_from_remote(const char *url) fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); return; } - if ((fp = fopen(fn, "w")) == 0) { + if ((fp = fopen(fn, "wb")) == 0) { fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); knet_close(fp_remote); return; @@ -384,6 +403,12 @@ static void download_from_remote(const char *url) fclose(fp); knet_close(fp_remote); } +#else +static void download_from_remote(const char *url) +{ + return; +} +#endif bam_index_t *bam_index_load(const char *fn) { @@ -416,7 +441,7 @@ int bam_index_build2(const char *fn, const char *_fnidx) fnidx = (char*)calloc(strlen(fn) + 5, 1); strcpy(fnidx, fn); strcat(fnidx, ".bai"); } else fnidx = strdup(_fnidx); - fpidx = fopen(fnidx, "w"); + fpidx = fopen(fnidx, "wb"); if (fpidx == 0) { fprintf(stderr, "[bam_index_build2] fail to create the index file.\n"); free(fnidx); @@ -450,6 +475,8 @@ int bam_index(int argc, char *argv[]) static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN]) { int i = 0, k; + if (beg >= end) return 0; + if (end >= 1u<<29) end = 1u<<29; --end; list[i++] = 0; for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; @@ -467,7 +494,15 @@ static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) return (rend > beg && rbeg < end); } -int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) +struct __bam_iter_t { + int from_first; // read from the first record; no random access + int tid, beg, end, n_off, i, finished; + uint64_t curr_off; + pair64_t *off; +}; + +// bam_fetch helper function retrieves +bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end) { uint16_t *bins; int i, n_bins, n_off; @@ -475,17 +510,34 @@ int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, voi khint_t k; khash_t(i) *index; uint64_t min_off; - + bam_iter_t iter = 0; + + if (beg < 0) beg = 0; + if (end < beg) return 0; + // initialize iter + iter = calloc(1, sizeof(struct __bam_iter_t)); + iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; + // bins = (uint16_t*)calloc(MAX_BIN, 2); n_bins = reg2bins(beg, end, bins); index = idx->index[tid]; - min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT]; + if (idx->index2[tid].n > 0) { + min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1] + : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT]; + if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4 + int n = beg>>BAM_LIDX_SHIFT; + if (n > idx->index2[tid].n) n = idx->index2[tid].n; + for (i = n - 1; i >= 0; --i) + if (idx->index2[tid].offset[i] != 0) break; + if (i >= 0) min_off = idx->index2[tid].offset[i]; + } + } else min_off = 0; // tabix 0.1.2 may produce such index files for (i = n_off = 0; i < n_bins; ++i) { if ((k = kh_get(i, index, bins[i])) != kh_end(index)) n_off += kh_value(index, k).n; } if (n_off == 0) { - free(bins); return 0; + free(bins); return iter; } off = (pair64_t*)calloc(n_off, 16); for (i = n_off = 0; i < n_bins; ++i) { @@ -498,10 +550,8 @@ int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, voi } free(bins); { - bam1_t *b; - int l, ret, n_seeks; - uint64_t curr_off; - b = (bam1_t*)calloc(1, sizeof(bam1_t)); + bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t)); + int l; ks_introsort(off, n_off, off); // resolve completely contained adjacent blocks for (i = 1, l = 0; i < n_off; ++i) @@ -524,28 +574,64 @@ int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, voi n_off = l + 1; #endif } - // retrive alignments - n_seeks = 0; i = -1; curr_off = 0; - for (;;) { - if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk - if (i == n_off - 1) break; // no more chunks - if (i >= 0) assert(curr_off == off[i].v); // otherwise bug - if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek - bam_seek(fp, off[i+1].u, SEEK_SET); - curr_off = bam_tell(fp); - ++n_seeks; - } - ++i; + bam_destroy1(b); + } + iter->n_off = n_off; iter->off = off; + return iter; +} + +pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off) +{ // for pysam compatibility + bam_iter_t iter; + pair64_t *off; + iter = bam_iter_query(idx, tid, beg, end); + off = iter->off; *cnt_off = iter->n_off; + free(iter); + return off; +} + +void bam_iter_destroy(bam_iter_t iter) +{ + if (iter) { free(iter->off); free(iter); } +} + +int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b) +{ + if (iter->finished) return -1; + if (iter->from_first) { + int ret = bam_read1(fp, b); + if (ret < 0) iter->finished = 1; + return ret; + } + if (iter->off == 0) return -1; + for (;;) { + int ret; + if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk + if (iter->i == iter->n_off - 1) break; // no more chunks + if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug + if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek + bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET); + iter->curr_off = bam_tell(fp); } - if ((ret = bam_read1(fp, b)) > 0) { - curr_off = bam_tell(fp); - if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed - else if (is_overlap(beg, end, b)) func(b, data); - } else break; // end of file + ++iter->i; } -// fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks); - bam_destroy1(b); + if ((ret = bam_read1(fp, b)) > 0) { + iter->curr_off = bam_tell(fp); + if (b->core.tid != iter->tid || b->core.pos >= iter->end) break; // no need to proceed + else if (is_overlap(iter->beg, iter->end, b)) return ret; + } else break; // end of file } - free(off); + iter->finished = 1; + return -1; +} + +int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) +{ + bam_iter_t iter; + bam1_t *b; + b = bam_init1(); + iter = bam_iter_query(idx, tid, beg, end); + while (bam_iter_read(fp, iter, b) >= 0) func(b, data); + bam_destroy1(b); return 0; }