X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bam_index.c;h=f250907039156c5559d92bfff9254ef0d8c401c8;hb=be1abba58b415b449a7a9230f2221fd3960781db;hp=3fa950d5235297c36c598258ce20b80183b3f04b;hpb=adefe520aeefbaf64ffdc947bbe35db4bfe9d811;p=samtools.git diff --git a/bam_index.c b/bam_index.c index 3fa950d..f250907 100644 --- a/bam_index.c +++ b/bam_index.c @@ -120,7 +120,7 @@ static void merge_chunks(bam_index_t *idx) index = idx->index[i]; for (k = kh_begin(index); k != kh_end(index); ++k) { bam_binlist_t *p; - if (!kh_exist(index, k)) continue; + if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue; p = &kh_value(index, k); m = 0; for (l = 1; l < p->n; ++l) { @@ -154,37 +154,54 @@ bam_index_t *bam_index_core(bamFile fp) bam_header_t *h; int i, ret; bam_index_t *idx; - uint32_t last_bin, save_bin; + uint32_t last_bin, save_bin, recalculated_bin; int32_t last_coor, last_tid, save_tid; bam1_core_t *c; uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor; + h = bam_header_read(fp); + if(h == 0) { + fprintf(stderr, "[bam_index_core] Invalid BAM header."); + return NULL; + } + idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); b = (bam1_t*)calloc(1, sizeof(bam1_t)); - h = bam_header_read(fp); c = &b->core; idx->n = h->n_targets; - bam_header_destroy(h); idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i); idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t)); save_bin = save_tid = last_tid = last_bin = 0xffffffffu; save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu; - n_mapped = n_unmapped = n_no_coor = off_end = 0; + n_mapped = n_unmapped = n_no_coor = off_end = 0; off_beg = off_end = bam_tell(fp); while ((ret = bam_read1(fp, b)) >= 0) { if (c->tid < 0) ++n_no_coor; - if (last_tid != c->tid) { // change of chromosomes + if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes last_tid = c->tid; last_bin = 0xffffffffu; - } else if (last_coor > c->pos) { + } else if ((uint32_t)last_tid > (uint32_t)c->tid) { + fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n", + bam1_qname(b), last_tid+1, c->tid+1); + return NULL; + } else if ((int32_t)c->tid >= 0 && last_coor > c->pos) { fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n", bam1_qname(b), last_coor, c->pos, c->tid+1); - exit(1); + return NULL; } - if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off); + if (c->tid >= 0) { + recalculated_bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b))); + if (c->bin != recalculated_bin) { + fprintf(stderr, "[bam_index_core] read '%s' mapped to '%s' at POS %d to %d has BIN %d but should be %d\n", + bam1_qname(b), h->target_name[c->tid], c->pos + 1, bam_calend(c, bam1_cigar(b)), c->bin, recalculated_bin); + fprintf(stderr, "[bam_index_core] Fix it by using BAM->SAM->BAM to force a recalculation of the BIN field\n"); + return NULL; + } + } + if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off); if (c->bin != last_bin) { // then possibly write the binning index if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record insert_offset(idx->index[save_tid], save_bin, save_off, last_off); @@ -203,22 +220,30 @@ bam_index_t *bam_index_core(bamFile fp) if (bam_tell(fp) <= last_off) { fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n", (unsigned long long)bam_tell(fp), (unsigned long long)last_off); - exit(1); + return NULL; } if (c->flag & BAM_FUNMAP) ++n_unmapped; else ++n_mapped; last_off = bam_tell(fp); last_coor = b->core.pos; } + bam_header_destroy(h); if (save_tid >= 0) { insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); - insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end); + insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp)); insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped); } merge_chunks(idx); fill_missing(idx); - if (ret >= 0) - while ((ret = bam_read1(fp, b)) >= 0) ++n_no_coor; + if (ret >= 0) { + while ((ret = bam_read1(fp, b)) >= 0) { + ++n_no_coor; + if (c->tid >= 0 && n_no_coor) { + fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n"); + return NULL; + } + } + } if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret); free(b->data); free(b); idx->n_no_coor = n_no_coor; @@ -448,6 +473,7 @@ bam_index_t *bam_index_load(const char *fn) strcat(strcpy(fnidx, fn), ".bai"); fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n"); download_from_remote(fnidx); + free(fnidx); idx = bam_index_load_local(fn); } if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n"); @@ -466,6 +492,10 @@ int bam_index_build2(const char *fn, const char *_fnidx) } idx = bam_index_core(fp); bam_close(fp); + if(idx == 0) { + fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n"); + return -1; + } if (_fnidx == 0) { fnidx = (char*)calloc(strlen(fn) + 5, 1); strcpy(fnidx, fn); strcat(fnidx, ".bai"); @@ -474,6 +504,7 @@ int bam_index_build2(const char *fn, const char *_fnidx) if (fpidx == 0) { fprintf(stderr, "[bam_index_build2] fail to create the index file.\n"); free(fnidx); + bam_index_destroy(idx); return -1; } bam_index_save(idx, fpidx); @@ -504,7 +535,7 @@ int bam_idxstats(int argc, char *argv[]) bam_index_t *idx; bam_header_t *header; bamFile fp; - int i, no_stats = 0; + int i; if (argc < 2) { fprintf(stderr, "Usage: samtools idxstats \n"); return 1; @@ -522,12 +553,10 @@ int bam_idxstats(int argc, char *argv[]) k = kh_get(i, h, BAM_MAX_BIN); if (k != kh_end(h)) printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v); - else no_stats = 1; + else printf("\t0\t0"); putchar('\n'); } - printf("*\t0"); - if (!no_stats) printf("\t0\t%llu", (long long)idx->n_no_coor); - putchar('\n'); + printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor); bam_header_destroy(header); bam_index_destroy(idx); return 0; @@ -610,6 +639,9 @@ bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end) } } free(bins); + if (n_off == 0) { + free(off); return iter; + } { bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t)); int l; @@ -658,17 +690,17 @@ void bam_iter_destroy(bam_iter_t iter) int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b) { - if (iter->finished) return -1; - if (iter->from_first) { - int ret = bam_read1(fp, b); - if (ret < 0) iter->finished = 1; + int ret; + if (iter && iter->finished) return -1; + if (iter == 0 || iter->from_first) { + ret = bam_read1(fp, b); + if (ret < 0 && iter) iter->finished = 1; return ret; } if (iter->off == 0) return -1; for (;;) { - int ret; if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk - if (iter->i == iter->n_off - 1) break; // no more chunks + if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET); @@ -676,23 +708,28 @@ int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b) } ++iter->i; } - if ((ret = bam_read1(fp, b)) > 0) { + if ((ret = bam_read1(fp, b)) >= 0) { iter->curr_off = bam_tell(fp); - if (b->core.tid != iter->tid || b->core.pos >= iter->end) break; // no need to proceed + if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed + ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error + break; + } else if (is_overlap(iter->beg, iter->end, b)) return ret; - } else break; // end of file + } else break; // end of file or error } iter->finished = 1; - return -1; + return ret; } int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) { + int ret; bam_iter_t iter; bam1_t *b; b = bam_init1(); iter = bam_iter_query(idx, tid, beg, end); - while (bam_iter_read(fp, iter, b) >= 0) func(b, data); + while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data); + bam_iter_destroy(iter); bam_destroy1(b); - return 0; + return (ret == -1)? 0 : ret; }