3 * SOFTWARE COPYRIGHT NOTICE AGREEMENT
4 * This software and its documentation are copyright 2008 by the
5 * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
7 * This software is supplied without any warranty or guaranteed support whatsoever.
8 * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
13 2009-06-29 by lh3: cache recent uncompressed blocks.
14 2009-06-25 by lh3: optionally use my knetfile library to access files over FTP.
15 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
22 #include <sys/types.h>
/* Instantiate a khash map named `cache` with 64-bit integer keys and cache_t
 * values; cache_block() and load_block_from_cache() key it by block address. */
32 KHASH_MAP_INIT_INT64(cache, cache_t)
/* On Windows/MSVC builds, map ftello/fseeko onto ftell/fseek. */
34 #if defined(_WIN32) || defined(_MSC_VER)
35 #define ftello(fp) ftell(fp)
36 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
/* Explicit prototypes for ftello/fseeko.
 * NOTE(review): no #else/#endif is visible in this listing -- confirm these
 * declarations belong to the non-Windows branch. */
38 extern off_t ftello(FILE *stream);
39 extern int fseeko(FILE *stream, off_t offset, int whence);
/* Byte type for block buffers. It is signed, which is why check_header() casts
 * GZIP_ID2 (139) before comparing. */
42 typedef int8_t bgzf_byte_t;
/* DEFAULT_BLOCK_SIZE: uncompressed buffer size used by open_write().
 * MAX_BLOCK_SIZE: size of the compressed buffer, of the reader's uncompressed
 * buffer, and of every cached block copy. */
44 static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
45 static const int MAX_BLOCK_SIZE = 64 * 1024;
/* Bytes of framing around the deflate data in each block: deflate_block()
 * writes the payload after an 18-byte header and ends the block with 8 bytes
 * (CRC32 + input length). */
47 static const int BLOCK_HEADER_LENGTH = 18;
48 static const int BLOCK_FOOTER_LENGTH = 8;
/* Values written into the block header by deflate_block() and verified by
 * check_header(). Names presumably follow the gzip header fields of
 * RFC 1952 -- confirm. */
50 static const int GZIP_ID1 = 31;
51 static const int GZIP_ID2 = 139;
52 static const int CM_DEFLATE = 8;
53 static const int FLG_FEXTRA = 4;
54 static const int OS_UNKNOWN = 255;
55 static const int BGZF_ID1 = 66; // 'B'
56 static const int BGZF_ID2 = 67; // 'C'
57 static const int BGZF_LEN = 2;
58 static const int BGZF_XLEN = 6; // BGZF_LEN+4
/* Arguments handed to deflateInit2()/inflateInit2(). */
60 static const int GZIP_WINDOW_BITS = -15; // no zlib header
61 static const int Z_DEFAULT_MEM_LEVEL = 8;
/*
 * Store a 16-bit value into buffer[0..1] in little-endian order
 * (least-significant byte first).
 *
 * buffer: destination; must have room for at least 2 bytes.
 * value:  value to store.
 */
static inline
void
packInt16(uint8_t* buffer, uint16_t value)
{
    buffer[0] = (uint8_t)(value & 0xFF);         // low byte
    buffer[1] = (uint8_t)((value >> 8) & 0xFF);  // high byte
}
/*
 * Read a 16-bit little-endian value from buffer[0..1].
 *
 * buffer: source; must hold at least 2 readable bytes.
 * Returns the value in the range 0..65535.
 */
static inline
int
unpackInt16(const uint8_t* buffer)
{
    const int low = buffer[0];
    const int high = buffer[1];
    return low | (high << 8);
}
/*
 * Store a 32-bit value into buffer[0..3] in little-endian order
 * (least-significant byte first).
 *
 * buffer: destination; must have room for at least 4 bytes.
 * value:  value to store.
 */
static inline
void
packInt32(uint8_t* buffer, uint32_t value)
{
    for (int i = 0; i < 4; ++i) {
        buffer[i] = (uint8_t)((value >> (8 * i)) & 0xFF);
    }
}
/*
 * Return the smaller of two ints (y when they are equal).
 */
static inline
int
bgzf_min(int x, int y)
{
    if (x < y) {
        return x;
    }
    return y;
}
/*
 * Error-reporting helper. It receives the BGZF handle and a message; every
 * call in this file passes a string literal from a failure path.
 * NOTE(review): only the opening line is visible in this listing -- how the
 * message is recorded or emitted is not shown here; confirm against the body.
 */
98 report_error(BGZF* fp, const char* message) {
102 static BGZF *bgzf_read_init()
105 fp = calloc(1, sizeof(BGZF));
106 fp->uncompressed_block_size = MAX_BLOCK_SIZE;
107 fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
108 fp->compressed_block_size = MAX_BLOCK_SIZE;
109 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
111 fp->cache = kh_init(cache);
/*
 * Wrap the descriptor fd in a read stream and attach it to a freshly
 * initialised reader handle.
 * NOTE(review): two alternative stream declarations appear (knetFile via
 * knet_dopen, FILE via fdopen); presumably a preprocessor conditional that is
 * not visible in this listing selects one of them -- confirm.
 */
120 knetFile *file = knet_dopen(fd, "r");
122 FILE* file = fdopen(fd, "r");
// The stream could not be created: hand a null handle back to the caller.
125 if (file == 0) return 0;
// NOTE(review): the result of bgzf_read_init() is dereferenced on the next
// line without a NULL check.
126 fp = bgzf_read_init();
127 fp->file_descriptor = fd;
/*
 * Build a write-mode BGZF handle on top of the descriptor fd.
 *
 * fd:              descriptor to wrap with fdopen(fd, "w").
 * is_uncompressed: stored in the handle; deflate_block() uses it to choose
 *                  compression level 0 instead of the zlib default.
 *
 * Yields 0 when the stream cannot be created.
 * NOTE(review): the lines that attach `file` to the handle and return it are
 * not visible in this listing.
 */
139 open_write(int fd, bool is_uncompressed)
141 FILE* file = fdopen(fd, "w");
143 if (file == 0) return 0;
// NOTE(review): malloc() is unchecked and does not zero the struct, so any
// field not assigned below starts with an indeterminate value.
144 fp = malloc(sizeof(BGZF));
145 fp->file_descriptor = fd;
// owned_file is cleared here; the close path consults it before closing the
// underlying stream.
147 fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
153 fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
// Left NULL on purpose: bgzf_write() allocates this buffer on first use.
154 fp->uncompressed_block = NULL;
155 fp->compressed_block_size = MAX_BLOCK_SIZE;
156 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
// Start with an empty block at file offset 0.
157 fp->block_address = 0;
158 fp->block_offset = 0;
159 fp->block_length = 0;
/*
 * Open the file at `path` as a BGZF stream.
 *
 * mode: the first character selects the direction -- 'r'/'R' for reading,
 *       'w'/'W' for writing. In write mode a 'u' anywhere in the string
 *       requests uncompressed output.
 *
 * Yields 0 when the underlying open fails.
 * NOTE(review): the read branch shows two ways of opening the file
 * (knet_open and open()); presumably they are alternatives under a
 * preprocessor conditional not visible in this listing -- confirm.
 */
165 bgzf_open(const char* __restrict path, const char* __restrict mode)
168 if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
170 knetFile *file = knet_open(path, mode);
171 if (file == 0) return 0;
172 fp = bgzf_read_init();
// No descriptor is associated with a knetFile-backed handle.
173 fp->file_descriptor = -1;
177 int fd, oflag = O_RDONLY;
181 fd = open(path, oflag);
182 if (fd == -1) return 0;
185 } else if (mode[0] == 'w' || mode[0] == 'W') {
// Create or truncate the output file.
186 int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
// 0644: permission bits applied if the file is created.
190 fd = open(path, oflag, 0644);
191 if (fd == -1) return 0;
192 fp = open_write(fd, strstr(mode, "u")? 1 : 0);
201 bgzf_fdopen(int fd, const char * __restrict mode)
203 if (fd == -1) return 0;
204 if (mode[0] == 'r' || mode[0] == 'R') {
205 return open_read(fd);
206 } else if (mode[0] == 'w' || mode[0] == 'W') {
207 return open_write(fd, strstr(mode, "u")? 1 : 0);
/*
 * Compress the first block_length bytes of fp->uncompressed_block into
 * fp->compressed_block, framed by an 18-byte header and an 8-byte footer.
 *
 * Returns the total length of the framed block. On return fp->block_offset
 * holds the number of input bytes that were NOT consumed; those bytes are
 * moved to the front of fp->uncompressed_block.
 * NOTE(review): the loop construct, the z_stream declaration and the error
 * returns are not visible in this listing.
 */
215 deflate_block(BGZF* fp, int block_length)
217 // Deflate the block in fp->uncompressed_block into fp->compressed_block.
218 // Also adds an extra field that stores the compressed block length.
220 bgzf_byte_t* buffer = fp->compressed_block;
221 int buffer_size = fp->compressed_block_size;
// Fixed header bytes; check_header() verifies these same positions on read.
224 buffer[0] = GZIP_ID1;
225 buffer[1] = GZIP_ID2;
226 buffer[2] = CM_DEFLATE;
227 buffer[3] = FLG_FEXTRA;
228 buffer[4] = 0; // mtime
233 buffer[9] = OS_UNKNOWN;
234 buffer[10] = BGZF_XLEN;
236 buffer[12] = BGZF_ID1;
237 buffer[13] = BGZF_ID2;
238 buffer[14] = BGZF_LEN;
240 buffer[16] = 0; // placeholder for block length
243 // loop to retry for blocks that do not compress enough
244 int input_length = block_length;
245 int compressed_length = 0;
// Level 0 when the handle was opened with the 'u' (uncompressed) flag.
247 int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
251 zs.next_in = fp->uncompressed_block;
252 zs.avail_in = input_length;
// Output starts right after the header; space for the footer is held back.
253 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
254 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
256 int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
257 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
258 if (status != Z_OK) {
259 report_error(fp, "deflate init failed");
// Z_FINISH: the whole input is expected to be compressed in this one call.
262 status = deflate(&zs, Z_FINISH);
263 if (status != Z_STREAM_END) {
265 if (status == Z_OK) {
266 // Not enough space in buffer.
267 // Can happen in the rare case the input doesn't compress enough.
268 // Reduce the amount of input until it fits.
269 input_length -= 1024;
270 if (input_length <= 0) {
271 // should never happen
272 report_error(fp, "input reduction failed");
277 report_error(fp, "deflate failed");
280 status = deflateEnd(&zs);
281 if (status != Z_OK) {
282 report_error(fp, "deflate end failed");
// Total block size = deflate output plus header and footer.
285 compressed_length = zs.total_out;
286 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
287 if (compressed_length > MAX_BLOCK_SIZE) {
288 // should never happen
289 report_error(fp, "deflate overflow");
// The header stores (block length - 1) at offset 16; read_block() adds the 1
// back when it parses the field.
295 packInt16((uint8_t*)&buffer[16], compressed_length-1);
// Footer: CRC32 of the input actually consumed, then its length.
296 uint32_t crc = crc32(0L, NULL, 0L);
297 crc = crc32(crc, fp->uncompressed_block, input_length);
298 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
299 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
// Input that did not fit in this block (non-zero only after a retry).
301 int remaining = block_length - input_length;
303 if (remaining > input_length) {
304 // should never happen (check so we can use memcpy)
305 report_error(fp, "remainder too large");
// Move the unconsumed tail to the front of the buffer for the next block.
308 memcpy(fp->uncompressed_block,
309 fp->uncompressed_block + input_length,
312 fp->block_offset = remaining;
313 return compressed_length;
/*
 * Decompress the block held in fp->compressed_block into
 * fp->uncompressed_block.
 *
 * block_length: total length of the compressed block, header included.
 *
 * read_block() stores the result of this call in fp->block_length.
 * NOTE(review): the z_stream declaration, the error returns and the final
 * return are not visible in this listing.
 */
318 inflate_block(BGZF* fp, int block_length)
320 // Inflate the block in fp->compressed_block into fp->uncompressed_block
// Skip the 18-byte block header (same value as BLOCK_HEADER_LENGTH).
325 zs.next_in = fp->compressed_block + 18;
// NOTE(review): the input starts 18 bytes in but only 16 are subtracted, so
// avail_in reaches 2 bytes past the end of the block -- confirm this is
// intended.
326 zs.avail_in = block_length - 16;
327 zs.next_out = fp->uncompressed_block;
328 zs.avail_out = fp->uncompressed_block_size;
330 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
331 if (status != Z_OK) {
332 report_error(fp, "inflate init failed");
// Z_FINISH: the whole block is expected to decompress in a single call.
335 status = inflate(&zs, Z_FINISH);
336 if (status != Z_STREAM_END) {
338 report_error(fp, "inflate failed");
// A failing inflateEnd() reports the same message as a failing inflate().
341 status = inflateEnd(&zs);
342 if (status != Z_OK) {
343 report_error(fp, "inflate failed");
351 check_header(const bgzf_byte_t* header)
353 return (header[0] == GZIP_ID1 &&
354 header[1] == (bgzf_byte_t) GZIP_ID2 &&
355 header[2] == Z_DEFLATED &&
356 (header[3] & FLG_FEXTRA) != 0 &&
357 unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
358 header[12] == BGZF_ID1 &&
359 header[13] == BGZF_ID2 &&
360 unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
363 static void free_cache(BGZF *fp)
366 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
367 if (fp->open_mode != 'r') return;
368 for (k = kh_begin(h); k < kh_end(h); ++k)
369 if (kh_exist(h, k)) free(kh_val(h, k).block);
370 kh_destroy(cache, h);
/*
 * Try to satisfy a block read from the in-memory cache.
 *
 * block_address: file offset of the wanted block; used as the cache key.
 *
 * Returns 0 on a cache miss. On a hit the cached data is copied into
 * fp->uncompressed_block, the handle's block fields are updated and the
 * underlying stream is moved to the offset recorded as the end of that block;
 * read_block() treats a non-zero result as "block loaded".
 * NOTE(review): the assignment of `p` and the hit-path return are not visible
 * in this listing.
 */
373 static int load_block_from_cache(BGZF *fp, int64_t block_address)
377 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
378 k = kh_get(cache, h, block_address);
// kh_end(h) is the "not found" result.
379 if (k == kh_end(h)) return 0;
// Same rule as read_block(): the offset is left alone when block_length is 0,
// which is the state bgzf_seek() leaves behind.
381 if (fp->block_length != 0) fp->block_offset = 0;
382 fp->block_address = block_address;
383 fp->block_length = p->size;
// Cached copies are always MAX_BLOCK_SIZE bytes (see cache_block()).
384 memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE);
// Position the stream just past this block, as if it had been read from disk.
386 knet_seek(fp->x.fpr, p->end_offset, SEEK_SET);
388 fseeko(fp->file, p->end_offset, SEEK_SET);
/*
 * Store a copy of the block currently in fp->uncompressed_block in the cache,
 * keyed by fp->block_address.
 *
 * size: number of bytes the block occupies in the file; added to the block
 *       address to record where the following block starts.
 *
 * NOTE(review): local declarations, the lines that complete the eviction and
 * the assignment of `p` are not visible in this listing.
 */
393 static void cache_block(BGZF *fp, int size)
398 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
// Caching is off unless the budget can hold more than one block.
399 if (MAX_BLOCK_SIZE >= fp->cache_size) return;
// Adding one more block would exceed the budget: make room first.
400 if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
401 /* A better way would be to remove the oldest block in the
402 * cache, but here we remove a random one for simplicity. This
403 * should not have a big impact on performance. */
// In practice: the first occupied slot found while scanning the table.
404 for (k = kh_begin(h); k < kh_end(h); ++k)
405 if (kh_exist(h, k)) break;
407 free(kh_val(h, k).block);
411 k = kh_put(cache, h, fp->block_address, &ret);
412 if (ret == 0) return; // if this happens, a bug!
414 p->size = fp->block_length;
415 p->end_offset = fp->block_address + size;
// NOTE(review): the malloc() result is used by memcpy() without a NULL check.
416 p->block = malloc(MAX_BLOCK_SIZE);
417 memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
/*
 * Load the block at the current stream position into
 * fp->uncompressed_block: try the cache first, otherwise read the header,
 * validate it, read the rest of the block and inflate it.
 * NOTE(review): the function header, the error returns and the declaration of
 * `size` are not visible in this listing. The knet_* and stdio statements
 * appear side by side; presumably a preprocessor conditional selects one set.
 */
424 bgzf_byte_t header[BLOCK_HEADER_LENGTH];
// Remember where this block starts; the address is also the cache key.
427 int64_t block_address = knet_tell(fp->x.fpr);
428 if (load_block_from_cache(fp, block_address)) return 0;
429 int count = knet_read(fp->x.fpr, header, sizeof(header));
431 int64_t block_address = ftello(fp->file);
432 if (load_block_from_cache(fp, block_address)) return 0;
433 int count = fread(header, 1, sizeof(header), fp->file);
// NOTE(review): the condition guarding this reset is not visible; presumably
// the end-of-file case -- confirm.
436 fp->block_length = 0;
// A short read of the header is an error.
440 if (count != sizeof(header)) {
441 report_error(fp, "read failed");
444 if (!check_header(header)) {
445 report_error(fp, "invalid block header");
// The header stores (block length - 1) at offset 16.
// NOTE(review): no lower-bound check on block_length is visible. A value
// below BLOCK_HEADER_LENGTH makes `remaining` negative before it is used as a
// read size -- confirm that the input is validated.
448 int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
449 bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
// Keep the header in front of the payload; inflate_block() skips over it.
450 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
451 int remaining = block_length - BLOCK_HEADER_LENGTH;
453 count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
455 count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
457 if (count != remaining) {
458 report_error(fp, "read failed");
// From here on `count` is the result of inflate_block().
462 count = inflate_block(fp, block_length);
466 if (fp->block_length != 0) {
467 // Do not reset offset if this read follows a seek.
468 fp->block_offset = 0;
470 fp->block_address = block_address;
471 fp->block_length = count;
// Keep a copy of the freshly inflated block for later seeks.
472 cache_block(fp, size);
/*
 * Copy up to `length` uncompressed bytes from the stream into `data`,
 * loading further blocks with read_block() as each one is used up.
 *
 * fp:     handle opened for reading; any other mode is reported as an error.
 * data:   destination buffer with room for `length` bytes.
 * length: number of bytes requested.
 *
 * NOTE(review): the return statements and the loop exits are not visible in
 * this listing.
 */
477 bgzf_read(BGZF* fp, void* data, int length)
482 if (fp->open_mode != 'r') {
483 report_error(fp, "file not open for reading");
488 bgzf_byte_t* output = data;
489 while (bytes_read < length) {
// Bytes still unread in the block currently held in memory.
490 int available = fp->block_length - fp->block_offset;
491 if (available <= 0) {
// Current block exhausted (or none loaded yet): fetch the next one.
492 if (read_block(fp) != 0) {
495 available = fp->block_length - fp->block_offset;
// Still nothing after a successful read_block(): no more data.
496 if (available <= 0) {
// Copy whichever is smaller: what the caller still wants or what is left.
500 int copy_length = bgzf_min(length-bytes_read, available);
501 bgzf_byte_t* buffer = fp->uncompressed_block;
502 memcpy(output, buffer + fp->block_offset, copy_length);
503 fp->block_offset += copy_length;
504 output += copy_length;
505 bytes_read += copy_length;
// Block fully consumed: point the handle at the start of the next block so
// that a position taken now refers to (next block, offset 0).
507 if (fp->block_offset == fp->block_length) {
509 fp->block_address = knet_tell(fp->x.fpr);
511 fp->block_address = ftello(fp->file);
513 fp->block_offset = 0;
514 fp->block_length = 0;
/*
 * Compress and write out everything pending in fp->uncompressed_block.
 *
 * Callers treat a non-zero result as failure.
 * NOTE(review): the return statements are not visible in this listing. The
 * two fwrite() lines target different stream members; presumably a
 * preprocessor conditional selects one.
 */
521 flush_block(BGZF* fp)
// deflate_block() leaves the count of unconsumed input in fp->block_offset,
// so more than one pass may be needed.
523 while (fp->block_offset > 0) {
524 int block_length = deflate_block(fp, fp->block_offset);
525 if (block_length < 0) {
529 int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
531 int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
// A short write is an error.
533 if (count != block_length) {
534 report_error(fp, "write failed");
// Track the file offset at which the next block will start.
537 fp->block_address += block_length;
/*
 * Append `length` bytes from `data` to the stream. Input is gathered in
 * fp->uncompressed_block and flushed as a compressed block each time the
 * buffer fills.
 *
 * fp:     handle opened for writing; any other mode is reported as an error.
 * data:   bytes to write.
 * length: number of bytes in `data`.
 *
 * Returns the number of bytes accepted.
 * NOTE(review): the error returns and the action taken when flush_block()
 * fails are not visible in this listing.
 */
543 bgzf_write(BGZF* fp, const void* data, int length)
545 if (fp->open_mode != 'w') {
546 report_error(fp, "file not open for writing");
// The write buffer is created lazily; open_write() leaves it NULL.
// NOTE(review): the malloc() result is not checked before use.
550 if (fp->uncompressed_block == NULL) {
551 fp->uncompressed_block = malloc(fp->uncompressed_block_size);
554 const bgzf_byte_t* input = data;
555 int block_length = fp->uncompressed_block_size;
556 int bytes_written = 0;
557 while (bytes_written < length) {
// Take as much as fits in the space left in the current block.
558 int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
559 bgzf_byte_t* buffer = fp->uncompressed_block;
560 memcpy(buffer + fp->block_offset, input, copy_length);
561 fp->block_offset += copy_length;
562 input += copy_length;
563 bytes_written += copy_length;
// Buffer full: compress it and write it out.
564 if (fp->block_offset == block_length) {
565 if (flush_block(fp) != 0) {
570 return bytes_written;
/*
 * Close path for a BGZF handle. In write mode: flush pending data, append one
 * empty block and flush the stream. Then close the underlying stream if the
 * handle owns it, and release the block buffers.
 * NOTE(review): the function header and the return statements are not visible
 * in this listing. The fp->x.* and fp->file statements appear side by side;
 * presumably a preprocessor conditional selects one set.
 */
576 if (fp->open_mode == 'w') {
577 if (flush_block(fp) != 0) {
580 { // add an empty block
// Compressing zero input bytes still yields a complete framed block.
// NOTE(review): no check of `count` against block_length is visible here.
581 int count, block_length = deflate_block(fp, 0);
583 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
585 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
589 if (fflush(fp->x.fpw) != 0) {
591 if (fflush(fp->file) != 0) {
593 report_error(fp, "flush failed");
// Streams not owned by the handle are left open.
597 if (fp->owned_file) {
// The write side is a stdio stream; the read side is a knetFile.
600 if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
601 else ret = knet_close(fp->x.fpr);
602 if (ret != 0) return -1;
604 if (fclose(fp->file) != 0) {
609 free(fp->uncompressed_block);
610 free(fp->compressed_block);
// Combine the current position into a single value: the block's file address
// shifted left by 16 bits, with the offset inside the uncompressed block in
// the low 16 bits. bgzf_seek() splits a position along the same boundary.
// NOTE(review): the enclosing definition is not visible in this listing.
619 return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF));
622 void bgzf_set_cache_size(BGZF *fp, int cache_size)
624 if (fp) fp->cache_size = cache_size;
/*
 * Test whether the file ends with a specific 28-byte marker.
 *
 * Returns 1 when the last 28 bytes equal `magic`, 0 when they differ, and -1
 * when seeking to 28 bytes before the end fails. The stream position is put
 * back to where it was before the check.
 * NOTE(review): the declarations of `buf` and `offset` are not visible in
 * this listing. The knet_* and stdio statements appear side by side;
 * presumably a preprocessor conditional selects one set.
 */
627 int bgzf_check_EOF(BGZF *fp)
// The marker starts with the header bytes that deflate_block() writes
// (31, 139, 8, 4, ..., 'B', 'C', 2) and its length field is 27, i.e. 28 - 1.
// Presumably this is the empty block appended when a writer is closed --
// confirm.
629 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
// Remember the current position so that it can be restored afterwards.
633 offset = knet_tell(fp->x.fpr);
634 if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
// NOTE(review): the results of the read and of the restoring seek are ignored.
635 knet_read(fp->x.fpr, buf, 28);
636 knet_seek(fp->x.fpr, offset, SEEK_SET);
638 offset = ftello(fp->file);
639 if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
// NOTE(review): the results of fread() and of the restoring seek are ignored.
640 fread(buf, 1, 28, fp->file);
641 fseeko(fp->file, offset, SEEK_SET);
643 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
/*
 * Move a read-mode handle to the position `pos`.
 *
 * fp:    handle opened for reading; any other mode is reported as an error.
 * pos:   packed position -- the low 16 bits are the offset inside the
 *        uncompressed block, the bits above are the block's file address.
 * where: only SEEK_SET is implemented.
 *
 * No block is loaded here: the stream is positioned at the block start and
 * the next read loads the block.
 * NOTE(review): the return statements are not visible in this listing. The
 * knet_seek and fseeko lines appear side by side; presumably a preprocessor
 * conditional selects one.
 */
647 bgzf_seek(BGZF* fp, int64_t pos, int where)
649 if (fp->open_mode != 'r') {
650 report_error(fp, "file not open for read");
653 if (where != SEEK_SET) {
654 report_error(fp, "unimplemented seek option");
// Split the packed position into its two parts.
657 int block_offset = pos & 0xFFFF;
658 int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
660 if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
662 if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
664 report_error(fp, "seek failed");
667 fp->block_length = 0; // indicates current block is not loaded
668 fp->block_address = block_address;
// Kept across the next block load: read_block() only resets the offset when
// block_length is non-zero.
669 fp->block_offset = block_offset;