3 Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 2009-06-29 by lh3: cache recent uncompressed blocks.
26 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
27 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
34 #include <sys/types.h>
44 KHASH_MAP_INIT_INT64(cache, cache_t)
46 #if defined(_WIN32) || defined(_MSC_VER)
47 #define ftello(fp) ftell(fp)
48 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
50 extern off_t ftello(FILE *stream);
51 extern int fseeko(FILE *stream, off_t offset, int whence);
54 typedef int8_t bgzf_byte_t;
56 static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
57 static const int MAX_BLOCK_SIZE = 64 * 1024;
59 static const int BLOCK_HEADER_LENGTH = 18;
60 static const int BLOCK_FOOTER_LENGTH = 8;
62 static const int GZIP_ID1 = 31;
63 static const int GZIP_ID2 = 139;
64 static const int CM_DEFLATE = 8;
65 static const int FLG_FEXTRA = 4;
66 static const int OS_UNKNOWN = 255;
67 static const int BGZF_ID1 = 66; // 'B'
68 static const int BGZF_ID2 = 67; // 'C'
69 static const int BGZF_LEN = 2;
70 static const int BGZF_XLEN = 6; // BGZF_LEN+4
72 static const int GZIP_WINDOW_BITS = -15; // no zlib header
73 static const int Z_DEFAULT_MEM_LEVEL = 8;
78 packInt16(uint8_t* buffer, uint16_t value)
81 buffer[1] = value >> 8;
86 unpackInt16(const uint8_t* buffer)
88 return (buffer[0] | (buffer[1] << 8));
93 packInt32(uint8_t* buffer, uint32_t value)
96 buffer[1] = value >> 8;
97 buffer[2] = value >> 16;
98 buffer[3] = value >> 24;
103 bgzf_min(int x, int y)
105 return (x < y) ? x : y;
110 report_error(BGZF* fp, const char* message) {
114 static BGZF *bgzf_read_init()
117 fp = calloc(1, sizeof(BGZF));
118 fp->uncompressed_block_size = MAX_BLOCK_SIZE;
119 fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
120 fp->compressed_block_size = MAX_BLOCK_SIZE;
121 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
123 fp->cache = kh_init(cache);
132 knetFile *file = knet_dopen(fd, "r");
134 FILE* file = fdopen(fd, "r");
137 if (file == 0) return 0;
138 fp = bgzf_read_init();
139 fp->file_descriptor = fd;
151 open_write(int fd, bool is_uncompressed)
153 FILE* file = fdopen(fd, "w");
155 if (file == 0) return 0;
156 fp = malloc(sizeof(BGZF));
157 fp->file_descriptor = fd;
159 fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
165 fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
166 fp->uncompressed_block = NULL;
167 fp->compressed_block_size = MAX_BLOCK_SIZE;
168 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
169 fp->block_address = 0;
170 fp->block_offset = 0;
171 fp->block_length = 0;
177 bgzf_open(const char* __restrict path, const char* __restrict mode)
180 if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
182 knetFile *file = knet_open(path, mode);
183 if (file == 0) return 0;
184 fp = bgzf_read_init();
185 fp->file_descriptor = -1;
189 int fd, oflag = O_RDONLY;
193 fd = open(path, oflag);
194 if (fd == -1) return 0;
197 } else if (mode[0] == 'w' || mode[0] == 'W') {
198 int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
202 fd = open(path, oflag, 0666);
203 if (fd == -1) return 0;
204 fp = open_write(fd, strstr(mode, "u")? 1 : 0);
206 if (fp != NULL) fp->owned_file = 1;
211 bgzf_fdopen(int fd, const char * __restrict mode)
213 if (fd == -1) return 0;
214 if (mode[0] == 'r' || mode[0] == 'R') {
215 return open_read(fd);
216 } else if (mode[0] == 'w' || mode[0] == 'W') {
217 return open_write(fd, strstr(mode, "u")? 1 : 0);
225 deflate_block(BGZF* fp, int block_length)
227 // Deflate the block in fp->uncompressed_block into fp->compressed_block.
228 // Also adds an extra field that stores the compressed block length.
230 bgzf_byte_t* buffer = fp->compressed_block;
231 int buffer_size = fp->compressed_block_size;
234 buffer[0] = GZIP_ID1;
235 buffer[1] = GZIP_ID2;
236 buffer[2] = CM_DEFLATE;
237 buffer[3] = FLG_FEXTRA;
238 buffer[4] = 0; // mtime
243 buffer[9] = OS_UNKNOWN;
244 buffer[10] = BGZF_XLEN;
246 buffer[12] = BGZF_ID1;
247 buffer[13] = BGZF_ID2;
248 buffer[14] = BGZF_LEN;
250 buffer[16] = 0; // placeholder for block length
253 // loop to retry for blocks that do not compress enough
254 int input_length = block_length;
255 int compressed_length = 0;
257 int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
261 zs.next_in = fp->uncompressed_block;
262 zs.avail_in = input_length;
263 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
264 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
266 int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
267 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
268 if (status != Z_OK) {
269 report_error(fp, "deflate init failed");
272 status = deflate(&zs, Z_FINISH);
273 if (status != Z_STREAM_END) {
275 if (status == Z_OK) {
276 // Not enough space in buffer.
277 // Can happen in the rare case the input doesn't compress enough.
278 // Reduce the amount of input until it fits.
279 input_length -= 1024;
280 if (input_length <= 0) {
281 // should never happen
282 report_error(fp, "input reduction failed");
287 report_error(fp, "deflate failed");
290 status = deflateEnd(&zs);
291 if (status != Z_OK) {
292 report_error(fp, "deflate end failed");
295 compressed_length = zs.total_out;
296 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
297 if (compressed_length > MAX_BLOCK_SIZE) {
298 // should never happen
299 report_error(fp, "deflate overflow");
305 packInt16((uint8_t*)&buffer[16], compressed_length-1);
306 uint32_t crc = crc32(0L, NULL, 0L);
307 crc = crc32(crc, fp->uncompressed_block, input_length);
308 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
309 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
311 int remaining = block_length - input_length;
313 if (remaining > input_length) {
314 // should never happen (check so we can use memcpy)
315 report_error(fp, "remainder too large");
318 memcpy(fp->uncompressed_block,
319 fp->uncompressed_block + input_length,
322 fp->block_offset = remaining;
323 return compressed_length;
328 inflate_block(BGZF* fp, int block_length)
330 // Inflate the block in fp->compressed_block into fp->uncompressed_block
335 zs.next_in = fp->compressed_block + 18;
336 zs.avail_in = block_length - 16;
337 zs.next_out = fp->uncompressed_block;
338 zs.avail_out = fp->uncompressed_block_size;
340 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
341 if (status != Z_OK) {
342 report_error(fp, "inflate init failed");
345 status = inflate(&zs, Z_FINISH);
346 if (status != Z_STREAM_END) {
348 report_error(fp, "inflate failed");
351 status = inflateEnd(&zs);
352 if (status != Z_OK) {
353 report_error(fp, "inflate failed");
361 check_header(const bgzf_byte_t* header)
363 return (header[0] == GZIP_ID1 &&
364 header[1] == (bgzf_byte_t) GZIP_ID2 &&
365 header[2] == Z_DEFLATED &&
366 (header[3] & FLG_FEXTRA) != 0 &&
367 unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
368 header[12] == BGZF_ID1 &&
369 header[13] == BGZF_ID2 &&
370 unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
373 static void free_cache(BGZF *fp)
376 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
377 if (fp->open_mode != 'r') return;
378 for (k = kh_begin(h); k < kh_end(h); ++k)
379 if (kh_exist(h, k)) free(kh_val(h, k).block);
380 kh_destroy(cache, h);
383 static int load_block_from_cache(BGZF *fp, int64_t block_address)
387 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
388 k = kh_get(cache, h, block_address);
389 if (k == kh_end(h)) return 0;
391 if (fp->block_length != 0) fp->block_offset = 0;
392 fp->block_address = block_address;
393 fp->block_length = p->size;
394 memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE);
396 knet_seek(fp->x.fpr, p->end_offset, SEEK_SET);
398 fseeko(fp->file, p->end_offset, SEEK_SET);
403 static void cache_block(BGZF *fp, int size)
408 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
409 if (MAX_BLOCK_SIZE >= fp->cache_size) return;
410 if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
411 /* A better way would be to remove the oldest block in the
412 * cache, but here we remove a random one for simplicity. This
413 * should not have a big impact on performance. */
414 for (k = kh_begin(h); k < kh_end(h); ++k)
415 if (kh_exist(h, k)) break;
417 free(kh_val(h, k).block);
421 k = kh_put(cache, h, fp->block_address, &ret);
422 if (ret == 0) return; // if this happens, a bug!
424 p->size = fp->block_length;
425 p->end_offset = fp->block_address + size;
426 p->block = malloc(MAX_BLOCK_SIZE);
427 memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
431 bgzf_read_block(BGZF* fp)
433 bgzf_byte_t header[BLOCK_HEADER_LENGTH];
436 int64_t block_address = knet_tell(fp->x.fpr);
437 if (load_block_from_cache(fp, block_address)) return 0;
438 count = knet_read(fp->x.fpr, header, sizeof(header));
440 int64_t block_address = ftello(fp->file);
441 if (load_block_from_cache(fp, block_address)) return 0;
442 count = fread(header, 1, sizeof(header), fp->file);
445 fp->block_length = 0;
449 if (count != sizeof(header)) {
450 report_error(fp, "read failed");
453 if (!check_header(header)) {
454 report_error(fp, "invalid block header");
457 int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
458 bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
459 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
460 int remaining = block_length - BLOCK_HEADER_LENGTH;
462 count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
464 count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
466 if (count != remaining) {
467 report_error(fp, "read failed");
471 count = inflate_block(fp, block_length);
472 if (count < 0) return -1;
473 if (fp->block_length != 0) {
474 // Do not reset offset if this read follows a seek.
475 fp->block_offset = 0;
477 fp->block_address = block_address;
478 fp->block_length = count;
479 cache_block(fp, size);
484 bgzf_read(BGZF* fp, void* data, int length)
489 if (fp->open_mode != 'r') {
490 report_error(fp, "file not open for reading");
495 bgzf_byte_t* output = data;
496 while (bytes_read < length) {
497 int available = fp->block_length - fp->block_offset;
498 if (available <= 0) {
499 if (bgzf_read_block(fp) != 0) {
502 available = fp->block_length - fp->block_offset;
503 if (available <= 0) {
507 int copy_length = bgzf_min(length-bytes_read, available);
508 bgzf_byte_t* buffer = fp->uncompressed_block;
509 memcpy(output, buffer + fp->block_offset, copy_length);
510 fp->block_offset += copy_length;
511 output += copy_length;
512 bytes_read += copy_length;
514 if (fp->block_offset == fp->block_length) {
516 fp->block_address = knet_tell(fp->x.fpr);
518 fp->block_address = ftello(fp->file);
520 fp->block_offset = 0;
521 fp->block_length = 0;
526 int bgzf_flush(BGZF* fp)
528 while (fp->block_offset > 0) {
529 int count, block_length;
530 block_length = deflate_block(fp, fp->block_offset);
531 if (block_length < 0) return -1;
533 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
535 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
537 if (count != block_length) {
538 report_error(fp, "write failed");
541 fp->block_address += block_length;
546 int bgzf_flush_try(BGZF *fp, int size)
548 if (fp->block_offset + size > fp->uncompressed_block_size)
549 return bgzf_flush(fp);
553 int bgzf_write(BGZF* fp, const void* data, int length)
555 if (fp->open_mode != 'w') {
556 report_error(fp, "file not open for writing");
560 if (fp->uncompressed_block == NULL)
561 fp->uncompressed_block = malloc(fp->uncompressed_block_size);
563 const bgzf_byte_t* input = data;
564 int block_length = fp->uncompressed_block_size;
565 int bytes_written = 0;
566 while (bytes_written < length) {
567 int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
568 bgzf_byte_t* buffer = fp->uncompressed_block;
569 memcpy(buffer + fp->block_offset, input, copy_length);
570 fp->block_offset += copy_length;
571 input += copy_length;
572 bytes_written += copy_length;
573 if (fp->block_offset == block_length) {
574 if (bgzf_flush(fp) != 0) {
579 return bytes_written;
582 int bgzf_close(BGZF* fp)
584 if (fp->open_mode == 'w') {
585 if (bgzf_flush(fp) != 0) return -1;
586 { // add an empty block
587 int count, block_length = deflate_block(fp, 0);
589 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
591 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
595 if (fflush(fp->x.fpw) != 0) {
597 if (fflush(fp->file) != 0) {
599 report_error(fp, "flush failed");
603 if (fp->owned_file) {
606 if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
607 else ret = knet_close(fp->x.fpr);
608 if (ret != 0) return -1;
610 if (fclose(fp->file) != 0) return -1;
613 free(fp->uncompressed_block);
614 free(fp->compressed_block);
620 void bgzf_set_cache_size(BGZF *fp, int cache_size)
622 if (fp) fp->cache_size = cache_size;
625 int bgzf_check_EOF(BGZF *fp)
627 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
631 offset = knet_tell(fp->x.fpr);
632 if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
633 knet_read(fp->x.fpr, buf, 28);
634 knet_seek(fp->x.fpr, offset, SEEK_SET);
636 offset = ftello(fp->file);
637 if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
638 fread(buf, 1, 28, fp->file);
639 fseeko(fp->file, offset, SEEK_SET);
641 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
644 int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
647 int64_t block_address;
649 if (fp->open_mode != 'r') {
650 report_error(fp, "file not open for read");
653 if (where != SEEK_SET) {
654 report_error(fp, "unimplemented seek option");
657 block_offset = pos & 0xFFFF;
658 block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
660 if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
662 if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
664 report_error(fp, "seek failed");
667 fp->block_length = 0; // indicates current block is not loaded
668 fp->block_address = block_address;
669 fp->block_offset = block_offset;