]> git.donarmstrong.com Git - samtools.git/commitdiff
* samtools-0.1.7-10 (r587)
authorHeng Li <lh3@live.co.uk>
Fri, 11 Jun 2010 21:33:16 +0000 (21:33 +0000)
committerHeng Li <lh3@live.co.uk>
Fri, 11 Jun 2010 21:33:16 +0000 (21:33 +0000)
 * added iterator interface for bam_fetch (ported back from tabix)

bam.h
bam_index.c
bamtk.c

diff --git a/bam.h b/bam.h
index 8d6c43142c3a3b837c392b1a36b10081dae0acbc..86939b2e85d48edb0c55e0d3c2a2b35234f69a7b 100644 (file)
--- a/bam.h
+++ b/bam.h
@@ -190,6 +190,8 @@ typedef struct {
        uint8_t *data;
 } bam1_t;
 
+typedef struct __bam_iterf_t *bam_iterf_t;
+
 #define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
 #define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
 
@@ -582,6 +584,10 @@ extern "C" {
         */
        int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
 
+       bam_iterf_t bam_iterf_query(const bam_index_t *idx, int tid, int beg, int end);
+       int bam_iterf_read(bamFile fp, bam_iterf_t iter, bam1_t *b);
+       void bam_iterf_destroy(bam_iterf_t iter);
+
        /*!
          @abstract       Parse a region in the format: "chr2:100,000-200,000".
          @discussion     bam_header_t::hash will be initialized if empty.
index 853ac705fc9bef5a653ab9aae9eb9100ec551e12..570d64585af202fd334251034fa773f388e9aff3 100644 (file)
@@ -476,8 +476,16 @@ static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
        return (rend > beg && rbeg < end);
 }
 
+struct __bam_iterf_t {
+       int from_first; // read from the first record; no random access
+       int tid, beg, end, n_off, i, finished;
+       uint64_t curr_off;
+       const bam_index_t *idx;
+       pair64_t *off;
+};
+
 // bam_fetch helper function retrieves 
-pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off)
+bam_iterf_t bam_iterf_query(const bam_index_t *idx, int tid, int beg, int end)
 {
        uint16_t *bins;
        int i, n_bins, n_off;
@@ -485,7 +493,14 @@ pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int e
        khint_t k;
        khash_t(i) *index;
        uint64_t min_off;
-
+       bam_iterf_t iter = 0;
+
+       if (beg < 0) beg = 0;
+       if (end < beg) return 0;
+       // initialize iter
+       iter = calloc(1, sizeof(struct __bam_iterf_t));
+       iter->idx = idx; iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
+       //
        bins = (uint16_t*)calloc(MAX_BIN, 2);
        n_bins = reg2bins(beg, end, bins);
        index = idx->index[tid];
@@ -495,7 +510,7 @@ pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int e
                        n_off += kh_value(index, k).n;
        }
        if (n_off == 0) {
-               free(bins); return 0;
+               free(bins); return iter;
        }
        off = (pair64_t*)calloc(n_off, 16);
        for (i = n_off = 0; i < n_bins; ++i) {
@@ -534,41 +549,62 @@ pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int e
                }
                bam_destroy1(b);
        }
-       *cnt_off = n_off;
+       iter->n_off = n_off; iter->off = off;
+       return iter;
+}
+
+pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
+{ // for pysam compatibility
+       bam_iterf_t iter;
+       pair64_t *off;
+       iter = bam_iterf_query(idx, tid, beg, end);
+       off = iter->off; *cnt_off = iter->n_off;
+       free(iter);
        return off;
 }
 
-int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+void bam_iterf_destroy(bam_iterf_t iter)
 {
-       int n_off;
-       pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off);
-       if (off == 0) return 0;
-       {
-               // retrive alignments
-               uint64_t curr_off;
-               int i, ret, n_seeks;
-               n_seeks = 0; i = -1; curr_off = 0;
-               bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
-               for (;;) {
-                       if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
-                               if (i == n_off - 1) break; // no more chunks
-                               if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
-                               if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
-                                       bam_seek(fp, off[i+1].u, SEEK_SET);
-                                       curr_off = bam_tell(fp);
-                                       ++n_seeks;
-                               }
-                               ++i;
+       if (iter) { free(iter->off); free(iter); }
+}
+
+int bam_iterf_read(bamFile fp, bam_iterf_t iter, bam1_t *b)
+{
+       if (iter->finished) return -1;
+       if (iter->from_first) {
+               int ret = bam_read1(fp, b);
+               if (ret < 0) iter->finished = 1;
+               return ret;
+       }
+       if (iter->off == 0) return -1;
+       for (;;) {
+               int ret;
+               if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
+                       if (iter->i == iter->n_off - 1) break; // no more chunks
+                       if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
+                       if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
+                               bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
+                               iter->curr_off = bam_tell(fp);
                        }
-                       if ((ret = bam_read1(fp, b)) > 0) {
-                               curr_off = bam_tell(fp);
-                               if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
-                               else if (is_overlap(beg, end, b)) func(b, data);
-                       } else break; // end of file
+                       ++iter->i;
                }
-//             fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);
-               bam_destroy1(b);
+               if ((ret = bam_read1(fp, b)) > 0) {
+                       iter->curr_off = bam_tell(fp);
+                       if (b->core.tid != iter->tid || b->core.pos >= iter->end) break; // no need to proceed
+                       else if (is_overlap(iter->beg, iter->end, b)) return ret;
+               } else break; // end of file
        }
-       free(off);
+       iter->finished = 1;
+       return -1;
+}
+
+int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+{
+       bam_iterf_t iter;
+       bam1_t *b;
+       b = bam_init1();
+       iter = bam_iterf_query(idx, tid, beg, end);
+       while (bam_iterf_read(fp, iter, b) >= 0) func(b, data);
+       bam_destroy1(b);
        return 0;
 }
diff --git a/bamtk.c b/bamtk.c
index 9edb6cb8402032cfa1cdbc7a97ce6b8c853914f1..405b4a3346842a196dd5597017dbe28ce85e776c 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -9,7 +9,7 @@
 #endif
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.7-9 (r586)"
+#define PACKAGE_VERSION "0.1.7-10 (r587)"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);