3 Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 2009-06-29 by lh3: cache recent uncompressed blocks.
26 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
27 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
34 #include <sys/types.h>
44 KHASH_MAP_INIT_INT64(cache, cache_t)
46 #if defined(_WIN32) || defined(_MSC_VER)
47 #define ftello(fp) ftell(fp)
48 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
50 extern off_t ftello(FILE *stream);
51 extern int fseeko(FILE *stream, off_t offset, int whence);
54 typedef int8_t bgzf_byte_t;
56 static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
57 static const int MAX_BLOCK_SIZE = 64 * 1024;
59 static const int BLOCK_HEADER_LENGTH = 18;
60 static const int BLOCK_FOOTER_LENGTH = 8;
62 static const int GZIP_ID1 = 31;
63 static const int GZIP_ID2 = 139;
64 static const int CM_DEFLATE = 8;
65 static const int FLG_FEXTRA = 4;
66 static const int OS_UNKNOWN = 255;
67 static const int BGZF_ID1 = 66; // 'B'
68 static const int BGZF_ID2 = 67; // 'C'
69 static const int BGZF_LEN = 2;
70 static const int BGZF_XLEN = 6; // BGZF_LEN+4
72 static const int GZIP_WINDOW_BITS = -15; // no zlib header
73 static const int Z_DEFAULT_MEM_LEVEL = 8;
78 packInt16(uint8_t* buffer, uint16_t value)
81 buffer[1] = value >> 8;
86 unpackInt16(const uint8_t* buffer)
88 return (buffer[0] | (buffer[1] << 8));
93 packInt32(uint8_t* buffer, uint32_t value)
96 buffer[1] = value >> 8;
97 buffer[2] = value >> 16;
98 buffer[3] = value >> 24;
103 bgzf_min(int x, int y)
105 return (x < y) ? x : y;
110 report_error(BGZF* fp, const char* message) {
114 int bgzf_check_bgzf(const char *fn)
117 uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377";
120 if ((fp = bgzf_open(fn, "r")) == 0)
122 fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn);
127 n = knet_read(fp->x.fpr, buf, 10);
129 n = fread(buf, 1, 10, fp->file);
136 if ( !memcmp(magic, buf, 10) ) return 1;
140 static BGZF *bgzf_read_init()
143 fp = calloc(1, sizeof(BGZF));
144 fp->uncompressed_block_size = MAX_BLOCK_SIZE;
145 fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
146 fp->compressed_block_size = MAX_BLOCK_SIZE;
147 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
149 fp->cache = kh_init(cache);
158 knetFile *file = knet_dopen(fd, "r");
160 FILE* file = fdopen(fd, "r");
163 if (file == 0) return 0;
164 fp = bgzf_read_init();
165 fp->file_descriptor = fd;
177 open_write(int fd, int compress_level) // compress_level==-1 for the default level
179 FILE* file = fdopen(fd, "w");
181 if (file == 0) return 0;
182 fp = malloc(sizeof(BGZF));
183 fp->file_descriptor = fd;
186 fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
187 if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
193 fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
194 fp->uncompressed_block = NULL;
195 fp->compressed_block_size = MAX_BLOCK_SIZE;
196 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
197 fp->block_address = 0;
198 fp->block_offset = 0;
199 fp->block_length = 0;
205 bgzf_open(const char* __restrict path, const char* __restrict mode)
208 if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */
210 knetFile *file = knet_open(path, mode);
211 if (file == 0) return 0;
212 fp = bgzf_read_init();
213 fp->file_descriptor = -1;
217 int fd, oflag = O_RDONLY;
221 fd = open(path, oflag);
222 if (fd == -1) return 0;
225 } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
226 int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC;
230 fd = open(path, oflag, 0666);
231 if (fd == -1) return 0;
232 { // set compress_level
234 for (i = 0; mode[i]; ++i)
235 if (mode[i] >= '0' && mode[i] <= '9') break;
236 if (mode[i]) compress_level = (int)mode[i] - '0';
237 if (strchr(mode, 'u')) compress_level = 0;
239 fp = open_write(fd, compress_level);
241 if (fp != NULL) fp->owned_file = 1;
246 bgzf_fdopen(int fd, const char * __restrict mode)
248 if (fd == -1) return 0;
249 if (mode[0] == 'r' || mode[0] == 'R') {
250 return open_read(fd);
251 } else if (mode[0] == 'w' || mode[0] == 'W') {
252 int i, compress_level = -1;
253 for (i = 0; mode[i]; ++i)
254 if (mode[i] >= '0' && mode[i] <= '9') break;
255 if (mode[i]) compress_level = (int)mode[i] - '0';
256 if (strchr(mode, 'u')) compress_level = 0;
257 return open_write(fd, compress_level);
265 deflate_block(BGZF* fp, int block_length)
267 // Deflate the block in fp->uncompressed_block into fp->compressed_block.
268 // Also adds an extra field that stores the compressed block length.
270 bgzf_byte_t* buffer = fp->compressed_block;
271 int buffer_size = fp->compressed_block_size;
274 buffer[0] = GZIP_ID1;
275 buffer[1] = GZIP_ID2;
276 buffer[2] = CM_DEFLATE;
277 buffer[3] = FLG_FEXTRA;
278 buffer[4] = 0; // mtime
283 buffer[9] = OS_UNKNOWN;
284 buffer[10] = BGZF_XLEN;
286 buffer[12] = BGZF_ID1;
287 buffer[13] = BGZF_ID2;
288 buffer[14] = BGZF_LEN;
290 buffer[16] = 0; // placeholder for block length
293 // loop to retry for blocks that do not compress enough
294 int input_length = block_length;
295 int compressed_length = 0;
300 zs.next_in = fp->uncompressed_block;
301 zs.avail_in = input_length;
302 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
303 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
305 int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED,
306 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
307 if (status != Z_OK) {
308 report_error(fp, "deflate init failed");
311 status = deflate(&zs, Z_FINISH);
312 if (status != Z_STREAM_END) {
314 if (status == Z_OK) {
315 // Not enough space in buffer.
316 // Can happen in the rare case the input doesn't compress enough.
317 // Reduce the amount of input until it fits.
318 input_length -= 1024;
319 if (input_length <= 0) {
320 // should never happen
321 report_error(fp, "input reduction failed");
326 report_error(fp, "deflate failed");
329 status = deflateEnd(&zs);
330 if (status != Z_OK) {
331 report_error(fp, "deflate end failed");
334 compressed_length = zs.total_out;
335 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
336 if (compressed_length > MAX_BLOCK_SIZE) {
337 // should never happen
338 report_error(fp, "deflate overflow");
344 packInt16((uint8_t*)&buffer[16], compressed_length-1);
345 uint32_t crc = crc32(0L, NULL, 0L);
346 crc = crc32(crc, fp->uncompressed_block, input_length);
347 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
348 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
350 int remaining = block_length - input_length;
352 if (remaining > input_length) {
353 // should never happen (check so we can use memcpy)
354 report_error(fp, "remainder too large");
357 memcpy(fp->uncompressed_block,
358 fp->uncompressed_block + input_length,
361 fp->block_offset = remaining;
362 return compressed_length;
367 inflate_block(BGZF* fp, int block_length)
369 // Inflate the block in fp->compressed_block into fp->uncompressed_block
375 zs.next_in = fp->compressed_block + 18;
376 zs.avail_in = block_length - 16;
377 zs.next_out = fp->uncompressed_block;
378 zs.avail_out = fp->uncompressed_block_size;
380 status = inflateInit2(&zs, GZIP_WINDOW_BITS);
381 if (status != Z_OK) {
382 report_error(fp, "inflate init failed");
385 status = inflate(&zs, Z_FINISH);
386 if (status != Z_STREAM_END) {
388 report_error(fp, "inflate failed");
391 status = inflateEnd(&zs);
392 if (status != Z_OK) {
393 report_error(fp, "inflate failed");
401 check_header(const bgzf_byte_t* header)
403 return (header[0] == GZIP_ID1 &&
404 header[1] == (bgzf_byte_t) GZIP_ID2 &&
405 header[2] == Z_DEFLATED &&
406 (header[3] & FLG_FEXTRA) != 0 &&
407 unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
408 header[12] == BGZF_ID1 &&
409 header[13] == BGZF_ID2 &&
410 unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
413 static void free_cache(BGZF *fp)
416 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
417 if (fp->open_mode != 'r') return;
418 for (k = kh_begin(h); k < kh_end(h); ++k)
419 if (kh_exist(h, k)) free(kh_val(h, k).block);
420 kh_destroy(cache, h);
423 static int load_block_from_cache(BGZF *fp, int64_t block_address)
427 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
428 k = kh_get(cache, h, block_address);
429 if (k == kh_end(h)) return 0;
431 if (fp->block_length != 0) fp->block_offset = 0;
432 fp->block_address = block_address;
433 fp->block_length = p->size;
434 memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE);
436 knet_seek(fp->x.fpr, p->end_offset, SEEK_SET);
438 fseeko(fp->file, p->end_offset, SEEK_SET);
443 static void cache_block(BGZF *fp, int size)
448 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
449 if (MAX_BLOCK_SIZE >= fp->cache_size) return;
450 if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
451 /* A better way would be to remove the oldest block in the
452 * cache, but here we remove a random one for simplicity. This
453 * should not have a big impact on performance. */
454 for (k = kh_begin(h); k < kh_end(h); ++k)
455 if (kh_exist(h, k)) break;
457 free(kh_val(h, k).block);
461 k = kh_put(cache, h, fp->block_address, &ret);
462 if (ret == 0) return; // if this happens, a bug!
464 p->size = fp->block_length;
465 p->end_offset = fp->block_address + size;
466 p->block = malloc(MAX_BLOCK_SIZE);
467 memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
471 bgzf_read_block(BGZF* fp)
473 bgzf_byte_t header[BLOCK_HEADER_LENGTH];
474 int count, size = 0, block_length, remaining;
476 int64_t block_address = knet_tell(fp->x.fpr);
477 if (load_block_from_cache(fp, block_address)) return 0;
478 count = knet_read(fp->x.fpr, header, sizeof(header));
480 int64_t block_address = ftello(fp->file);
481 if (load_block_from_cache(fp, block_address)) return 0;
482 count = fread(header, 1, sizeof(header), fp->file);
485 fp->block_length = 0;
489 if (count != sizeof(header)) {
490 report_error(fp, "read failed");
493 if (!check_header(header)) {
494 report_error(fp, "invalid block header");
497 block_length = unpackInt16((uint8_t*)&header[16]) + 1;
498 bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
499 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
500 remaining = block_length - BLOCK_HEADER_LENGTH;
502 count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
504 count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
506 if (count != remaining) {
507 report_error(fp, "read failed");
511 count = inflate_block(fp, block_length);
512 if (count < 0) return -1;
513 if (fp->block_length != 0) {
514 // Do not reset offset if this read follows a seek.
515 fp->block_offset = 0;
517 fp->block_address = block_address;
518 fp->block_length = count;
519 cache_block(fp, size);
524 bgzf_read(BGZF* fp, void* data, int length)
529 if (fp->open_mode != 'r') {
530 report_error(fp, "file not open for reading");
535 bgzf_byte_t* output = data;
536 while (bytes_read < length) {
537 int copy_length, available = fp->block_length - fp->block_offset;
539 if (available <= 0) {
540 if (bgzf_read_block(fp) != 0) {
543 available = fp->block_length - fp->block_offset;
544 if (available <= 0) {
548 copy_length = bgzf_min(length-bytes_read, available);
549 buffer = fp->uncompressed_block;
550 memcpy(output, buffer + fp->block_offset, copy_length);
551 fp->block_offset += copy_length;
552 output += copy_length;
553 bytes_read += copy_length;
555 if (fp->block_offset == fp->block_length) {
557 fp->block_address = knet_tell(fp->x.fpr);
559 fp->block_address = ftello(fp->file);
561 fp->block_offset = 0;
562 fp->block_length = 0;
567 int bgzf_flush(BGZF* fp)
569 while (fp->block_offset > 0) {
570 int count, block_length;
571 block_length = deflate_block(fp, fp->block_offset);
572 if (block_length < 0) return -1;
574 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
576 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
578 if (count != block_length) {
579 report_error(fp, "write failed");
582 fp->block_address += block_length;
587 int bgzf_flush_try(BGZF *fp, int size)
589 if (fp->block_offset + size > fp->uncompressed_block_size)
590 return bgzf_flush(fp);
594 int bgzf_write(BGZF* fp, const void* data, int length)
596 const bgzf_byte_t *input = data;
597 int block_length, bytes_written;
598 if (fp->open_mode != 'w') {
599 report_error(fp, "file not open for writing");
603 if (fp->uncompressed_block == NULL)
604 fp->uncompressed_block = malloc(fp->uncompressed_block_size);
607 block_length = fp->uncompressed_block_size;
609 while (bytes_written < length) {
610 int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
611 bgzf_byte_t* buffer = fp->uncompressed_block;
612 memcpy(buffer + fp->block_offset, input, copy_length);
613 fp->block_offset += copy_length;
614 input += copy_length;
615 bytes_written += copy_length;
616 if (fp->block_offset == block_length) {
617 if (bgzf_flush(fp) != 0) {
622 return bytes_written;
625 int bgzf_close(BGZF* fp)
627 if (fp->open_mode == 'w') {
628 if (bgzf_flush(fp) != 0) return -1;
629 { // add an empty block
630 int count, block_length = deflate_block(fp, 0);
632 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
634 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
638 if (fflush(fp->x.fpw) != 0) {
640 if (fflush(fp->file) != 0) {
642 report_error(fp, "flush failed");
646 if (fp->owned_file) {
649 if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
650 else ret = knet_close(fp->x.fpr);
651 if (ret != 0) return -1;
653 if (fclose(fp->file) != 0) return -1;
656 free(fp->uncompressed_block);
657 free(fp->compressed_block);
663 void bgzf_set_cache_size(BGZF *fp, int cache_size)
665 if (fp) fp->cache_size = cache_size;
668 int bgzf_check_EOF(BGZF *fp)
670 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
674 offset = knet_tell(fp->x.fpr);
675 if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
676 knet_read(fp->x.fpr, buf, 28);
677 knet_seek(fp->x.fpr, offset, SEEK_SET);
679 offset = ftello(fp->file);
680 if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
681 fread(buf, 1, 28, fp->file);
682 fseeko(fp->file, offset, SEEK_SET);
684 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
687 int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
690 int64_t block_address;
692 if (fp->open_mode != 'r') {
693 report_error(fp, "file not open for read");
696 if (where != SEEK_SET) {
697 report_error(fp, "unimplemented seek option");
700 block_offset = pos & 0xFFFF;
701 block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
703 if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
705 if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
707 report_error(fp, "seek failed");
710 fp->block_length = 0; // indicates current block is not loaded
711 fp->block_address = block_address;
712 fp->block_offset = block_offset;