From 78b65b0169e509b47f2e0bdf70a7b8def0b17434 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 11 Jun 2010 16:00:41 +0000 Subject: [PATCH] * samtools-0.1.7-7 (r584) * ported tabix BGZF to samtools * flush BGZF after writing the BAM header and between alignment boundaries --- bam.c | 2 ++ bamtk.c | 2 +- bgzf.c | 58 +++++++++++++++++++++------------------------- bgzf.h | 25 +++++++++++++++++++- kstring.h | 17 ++++++++++++++ misc/wgsim_eval.pl | 2 +- 6 files changed, 71 insertions(+), 35 deletions(-) diff --git a/bam.c b/bam.c index 35e5863..86cf3f3 100644 --- a/bam.c +++ b/bam.c @@ -141,6 +141,7 @@ int bam_header_write(bamFile fp, const bam_header_t *header) bam_write(fp, &x, 4); } else bam_write(fp, &header->target_len[i], 4); } + bgzf_flush(fp); return 0; } @@ -208,6 +209,7 @@ inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8 x[5] = c->mtid; x[6] = c->mpos; x[7] = c->isize; + bgzf_flush_try(fp, 4 + block_len); if (bam_is_be) { for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); y = block_len; diff --git a/bamtk.c b/bamtk.c index 902bd80..22c5648 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.7-6 (r530)" +#define PACKAGE_VERSION "0.1.7-7 (r584)" #endif int bam_taf2baf(int argc, char *argv[]); diff --git a/bgzf.c b/bgzf.c index 59f902f..27dfe3d 100644 --- a/bgzf.c +++ b/bgzf.c @@ -429,9 +429,8 @@ static void cache_block(BGZF *fp, int size) memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); } -static int -read_block(BGZF* fp) +bgzf_read_block(BGZF* fp) { bgzf_byte_t header[BLOCK_HEADER_LENGTH]; int size = 0; @@ -501,7 +500,7 @@ bgzf_read(BGZF* fp, void* data, int length) while (bytes_read < length) { int available = fp->block_length - fp->block_offset; if (available <= 0) { - if (read_block(fp) != 0) { + if (bgzf_read_block(fp) != 0) { return -1; } available = fp->block_length - fp->block_offset; @@ -528,19 +527,16 @@ bgzf_read(BGZF* fp, void* data, int length) return bytes_read; } -static -int -flush_block(BGZF* fp) +int bgzf_flush(BGZF* fp) { while (fp->block_offset > 0) { - int block_length = deflate_block(fp, fp->block_offset); - if (block_length < 0) { - return -1; - } + int count, block_length; + block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) return -1; #ifdef _USE_KNETFILE - int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); + count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); #else - int count = fwrite(fp->compressed_block, 1, block_length, fp->file); + count = fwrite(fp->compressed_block, 1, block_length, fp->file); #endif if (count != block_length) { report_error(fp, "write failed"); @@ -551,17 +547,22 @@ flush_block(BGZF* fp) return 0; } -int -bgzf_write(BGZF* fp, const void* data, int length) +int bgzf_flush_try(BGZF *fp, int size) +{ + if (fp->block_offset + size > fp->uncompressed_block_size) + return bgzf_flush(fp); + return -1; +} + +int bgzf_write(BGZF* fp, const void* data, int length) { if (fp->open_mode != 'w') { report_error(fp, "file not open for writing"); return -1; } - if (fp->uncompressed_block == NULL) { + if (fp->uncompressed_block == NULL) fp->uncompressed_block = malloc(fp->uncompressed_block_size); - } const bgzf_byte_t* input = data; int block_length = fp->uncompressed_block_size; @@ -574,7 +575,7 @@ bgzf_write(BGZF* fp, const void* data, int length) input += copy_length; bytes_written += copy_length; if (fp->block_offset == block_length) { - if (flush_block(fp) != 0) { + if (bgzf_flush(fp) != 0) { break; } } @@ -582,13 +583,10 @@ bgzf_write(BGZF* fp, const void* data, int length) return bytes_written; } -int -bgzf_close(BGZF* fp) +int bgzf_close(BGZF* fp) { if (fp->open_mode == 'w') { - if (flush_block(fp) != 0) { - return -1; - } + if (bgzf_flush(fp) != 0) return -1; { // add an empty block int count, block_length = deflate_block(fp, 0); #ifdef _USE_KNETFILE @@ -625,12 +623,6 @@ bgzf_close(BGZF* fp) return 0; } -int64_t -bgzf_tell(BGZF* fp) -{ - return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)); -} - void bgzf_set_cache_size(BGZF *fp, int cache_size) { if (fp) fp->cache_size = cache_size; @@ -655,9 +647,11 @@ int bgzf_check_EOF(BGZF *fp) return (memcmp(magic, buf, 28) == 0)? 1 : 0; } -int64_t -bgzf_seek(BGZF* fp, int64_t pos, int where) +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) { + int block_offset; + int64_t block_address; + if (fp->open_mode != 'r') { report_error(fp, "file not open for read"); return -1; @@ -666,8 +660,8 @@ bgzf_seek(BGZF* fp, int64_t pos, int where) report_error(fp, "unimplemented seek option"); return -1; } - int block_offset = pos & 0xFFFF; - int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; + block_offset = pos & 0xFFFF; + block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; #ifdef _USE_KNETFILE if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { #else diff --git a/bgzf.h b/bgzf.h index 91b3317..099ae9a 100644 --- a/bgzf.h +++ b/bgzf.h @@ -106,7 +106,7 @@ int bgzf_write(BGZF* fp, const void* data, int length); * Return value is non-negative on success. * Returns -1 on error. */ -int64_t bgzf_tell(BGZF* fp); +#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) /* * Set the file to read from the location specified by pos, which must @@ -126,9 +126,32 @@ int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); void bgzf_set_cache_size(BGZF *fp, int cache_size); int bgzf_check_EOF(BGZF *fp); +int bgzf_read_block(BGZF* fp); +int bgzf_flush(BGZF* fp); +int bgzf_flush_try(BGZF *fp, int size); #ifdef __cplusplus } #endif +static inline int bgzf_getc(BGZF *fp) +{ + int c; + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) return -2; /* error */ + if (fp->block_length == 0) return -1; /* end-of-file */ + } + c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return c; +} + #endif diff --git a/kstring.h b/kstring.h index f4e5a99..918b020 100644 --- a/kstring.h +++ b/kstring.h @@ -58,6 +58,23 @@ static inline int kputc(int c, kstring_t *s) return c; } +static inline int kputw(int c, kstring_t *s) +{ + char buf[16]; + int l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + static inline int *ksplit(kstring_t *s, int delimiter, int *n) { int max = 0, *offsets = 0; diff --git a/misc/wgsim_eval.pl b/misc/wgsim_eval.pl index d3d9c61..f919a06 100755 --- a/misc/wgsim_eval.pl +++ b/misc/wgsim_eval.pl @@ -78,7 +78,7 @@ sub wgsim_eval { $c0[$i] = 0 unless (defined $c0[$i]); $c1[$i] = 0 unless (defined $c1[$i]); $cc0 += $c0[$i]; $cc1 += $c1[$i]; - printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0); + printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0) if ($cc0); } } else { for (reverse(sort {$a<=>$b} (keys %fnfp))) { -- 2.39.2