/* The MIT License

   Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
                 2011 Attractive Chaos <attractor@live.co.uk>

   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
   in the Software without restriction, including without limitation the rights
   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   copies of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be included in
   all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
   THE SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include <sys/types.h>
#include "bgzf.h"
/*
 * File-access abstraction used by the readers below.
 *
 * With _USE_KNETFILE defined the _bgzf_* macros map onto the knetfile API;
 * otherwise they map onto stdio, with ftello/fseeko falling back to
 * ftell/fseek on Windows toolchains.
 */
#ifdef _USE_KNETFILE
#include "knetfile.h"
typedef knetFile *_bgzf_file_t;
#define _bgzf_open(fn, mode) knet_open(fn, mode)
#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
#define _bgzf_close(fp) knet_close(fp)
#define _bgzf_fileno(fp) ((fp)->fd)
#define _bgzf_tell(fp) knet_tell(fp)
#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
#else // ~defined(_USE_KNETFILE)
#if defined(_WIN32) || defined(_MSC_VER)
#define ftello(fp) ftell(fp)
#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
#else // ~defined(_WIN32)
extern off_t ftello(FILE *stream);
extern int fseeko(FILE *stream, off_t offset, int whence);
#endif // ~defined(_WIN32)
typedef FILE *_bgzf_file_t;
#define _bgzf_open(fn, mode) fopen(fn, mode)
#define _bgzf_dopen(fp, mode) fdopen(fp, mode)
#define _bgzf_close(fp) fclose(fp)
#define _bgzf_fileno(fp) fileno(fp)
#define _bgzf_tell(fp) ftello(fp)
#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
#endif // ~defined(_USE_KNETFILE)
#define BLOCK_HEADER_LENGTH 18
#define BLOCK_FOOTER_LENGTH 8


/* BGZF/GZIP header (specialized from RFC 1952; little endian):
 +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
 | 31|139|  8|  4|              0|  0|255|      6| 66| 67|      2|BLK_LEN|
 +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
*/
/* Template for the 18-byte block header; the last two header bytes (BLK_LEN)
 * are a placeholder. The array is 19 bytes long because of the string
 * literal's terminating NUL. */
static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
#ifdef BGZF_CACHE
/* One cached, already-inflated block. */
typedef struct {
	int size;           /* number of uncompressed bytes held in `block` */
	uint8_t *block;     /* heap copy of the uncompressed block */
	int64_t end_offset; /* file offset just past the compressed block */
} cache_t;
#include "khash.h"
/* Hash map keyed by a block's file offset. */
KHASH_MAP_INIT_INT64(cache, cache_t)
#endif
/* Store `value` at buffer[0..1] in little-endian byte order. */
static inline void packInt16(uint8_t *buffer, uint16_t value)
{
	buffer[0] = (uint8_t)value;
	buffer[1] = (uint8_t)(value >> 8);
}
/* Read a little-endian unsigned 16-bit value from buffer[0..1]; result is in 0..65535. */
static inline int unpackInt16(const uint8_t *buffer)
{
	return buffer[0] | (buffer[1] << 8);
}
/* Store `value` at buffer[0..3] in little-endian byte order. */
static inline void packInt32(uint8_t *buffer, uint32_t value)
{
	buffer[0] = (uint8_t)value;
	buffer[1] = (uint8_t)(value >> 8);
	buffer[2] = (uint8_t)(value >> 16);
	buffer[3] = (uint8_t)(value >> 24);
}
104 static BGZF *bgzf_read_init()
107 fp = calloc(1, sizeof(BGZF));
109 fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
110 fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
112 fp->cache = kh_init(cache);
117 static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
120 fp = calloc(1, sizeof(BGZF));
122 fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
123 fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
124 fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
125 if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
/*
 * Get the compression level from the mode string.
 *
 * The first decimal digit found in `mode` is the level (0-9). A 'u'
 * anywhere in `mode` forces level 0 regardless of any digit. Returns -1
 * when neither is present.
 */
static int mode2level(const char *__restrict mode)
{
	int i, compress_level = -1;
	for (i = 0; mode[i]; ++i)
		if (mode[i] >= '0' && mode[i] <= '9') break;
	if (mode[i]) compress_level = (int)mode[i] - '0';
	if (strchr(mode, 'u')) compress_level = 0;
	return compress_level;
}
139 BGZF *bgzf_open(const char *path, const char *mode)
142 assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
143 if (strchr(mode, 'r') || strchr(mode, 'R')) {
145 if ((fpr = _bgzf_open(path, "r")) == 0) return 0;
146 fp = bgzf_read_init();
148 } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
150 if ((fpw = fopen(path, "w")) == 0) return 0;
151 fp = bgzf_write_init(mode2level(mode));
157 BGZF *bgzf_dopen(int fd, const char *mode)
160 assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
161 if (strchr(mode, 'r') || strchr(mode, 'R')) {
163 if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0;
164 fp = bgzf_read_init();
166 } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
168 if ((fpw = fdopen(fd, "w")) == 0) return 0;
169 fp = bgzf_write_init(mode2level(mode));
175 static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level)
179 uint8_t *dst = (uint8_t*)_dst;
182 zs.zalloc = NULL; zs.zfree = NULL;
185 zs.next_out = dst + BLOCK_HEADER_LENGTH;
186 zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
187 if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer
188 if (deflate(&zs, Z_FINISH) != Z_STREAM_END) return -1;
189 if (deflateEnd(&zs) != Z_OK) return -1;
190 *dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
192 memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
193 packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes
195 crc = crc32(crc32(0L, NULL, 0L), src, slen);
196 packInt32((uint8_t*)&dst[*dlen - 8], crc);
197 packInt32((uint8_t*)&dst[*dlen - 4], slen);
201 // Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length.
202 static int deflate_block(BGZF *fp, int block_length)
204 int comp_size = BGZF_MAX_BLOCK_SIZE;
205 if (bgzf_compress(fp->compressed_block, &comp_size, fp->uncompressed_block, block_length, fp->compress_level) != 0) {
206 fp->errcode |= BGZF_ERR_ZLIB;
209 fp->block_offset = 0;
213 // Inflate the block in fp->compressed_block into fp->uncompressed_block
214 static int inflate_block(BGZF* fp, int block_length)
219 zs.next_in = fp->compressed_block + 18;
220 zs.avail_in = block_length - 16;
221 zs.next_out = fp->uncompressed_block;
222 zs.avail_out = BGZF_MAX_BLOCK_SIZE;
224 if (inflateInit2(&zs, -15) != Z_OK) {
225 fp->errcode |= BGZF_ERR_ZLIB;
228 if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
230 fp->errcode |= BGZF_ERR_ZLIB;
233 if (inflateEnd(&zs) != Z_OK) {
234 fp->errcode |= BGZF_ERR_ZLIB;
/*
 * Check that `header` (at least 16 bytes) looks like a BGZF block header:
 * gzip magic 31,139; method 8; the extra-field flag (bit 4 of byte 3) set;
 * extra length 6; subfield id "BC" with subfield length 2.
 *
 * Returns non-zero when every check passes, 0 otherwise.
 */
static int check_header(const uint8_t *header)
{
	return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0
			&& unpackInt16((uint8_t*)&header[10]) == 6
			&& header[12] == 'B' && header[13] == 'C'
			&& unpackInt16((uint8_t*)&header[14]) == 2);
}
249 static void free_cache(BGZF *fp)
252 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
253 if (fp->is_write) return;
254 for (k = kh_begin(h); k < kh_end(h); ++k)
255 if (kh_exist(h, k)) free(kh_val(h, k).block);
256 kh_destroy(cache, h);
259 static int load_block_from_cache(BGZF *fp, int64_t block_address)
263 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
264 k = kh_get(cache, h, block_address);
265 if (k == kh_end(h)) return 0;
267 if (fp->block_length != 0) fp->block_offset = 0;
268 fp->block_address = block_address;
269 fp->block_length = p->size;
270 memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE);
271 _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET);
275 static void cache_block(BGZF *fp, int size)
280 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
281 if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return;
282 if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > fp->cache_size) {
283 /* A better way would be to remove the oldest block in the
284 * cache, but here we remove a random one for simplicity. This
285 * should not have a big impact on performance. */
286 for (k = kh_begin(h); k < kh_end(h); ++k)
287 if (kh_exist(h, k)) break;
289 free(kh_val(h, k).block);
293 k = kh_put(cache, h, fp->block_address, &ret);
294 if (ret == 0) return; // if this happens, a bug!
296 p->size = fp->block_length;
297 p->end_offset = fp->block_address + size;
298 p->block = malloc(BGZF_MAX_BLOCK_SIZE);
299 memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
302 static void free_cache(BGZF *fp) {}
303 static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
304 static void cache_block(BGZF *fp, int size) {}
307 int bgzf_read_block(BGZF *fp)
309 uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
310 int count, size = 0, block_length, remaining;
311 int64_t block_address;
312 block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
313 if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0;
314 count = _bgzf_read(fp->fp, header, sizeof(header));
315 if (count == 0) { // no data read
316 fp->block_length = 0;
319 if (count != sizeof(header) || !check_header(header)) {
320 fp->errcode |= BGZF_ERR_HEADER;
324 block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
325 compressed_block = (uint8_t*)fp->compressed_block;
326 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
327 remaining = block_length - BLOCK_HEADER_LENGTH;
328 count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
329 if (count != remaining) {
330 fp->errcode |= BGZF_ERR_IO;
334 if ((count = inflate_block(fp, block_length)) < 0) return -1;
335 if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
336 fp->block_address = block_address;
337 fp->block_length = count;
338 cache_block(fp, size);
342 ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length)
344 ssize_t bytes_read = 0;
345 uint8_t *output = data;
346 if (length <= 0) return 0;
347 assert(fp->is_write == 0);
348 while (bytes_read < length) {
349 int copy_length, available = fp->block_length - fp->block_offset;
351 if (available <= 0) {
352 if (bgzf_read_block(fp) != 0) return -1;
353 available = fp->block_length - fp->block_offset;
354 if (available <= 0) break;
356 copy_length = length - bytes_read < available? length - bytes_read : available;
357 buffer = fp->uncompressed_block;
358 memcpy(output, buffer + fp->block_offset, copy_length);
359 fp->block_offset += copy_length;
360 output += copy_length;
361 bytes_read += copy_length;
363 if (fp->block_offset == fp->block_length) {
364 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
365 fp->block_offset = fp->block_length = 0;
370 /***** BEGIN: multi-threading *****/
379 typedef struct mtaux_t {
380 int n_threads, n_blks, curr;
388 int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
392 if (!fp->is_write || fp->mt || n_threads <= 1) return -1;
393 mt = calloc(1, sizeof(mtaux_t));
394 mt->n_threads = n_threads;
395 mt->n_blks = n_threads * n_sub_blks;
396 mt->len = calloc(mt->n_blks, sizeof(int));
397 mt->blk = calloc(mt->n_blks, sizeof(void*));
398 for (i = 0; i < mt->n_blks; ++i)
399 mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE);
400 mt->tid = calloc(mt->n_threads, sizeof(pthread_t));
401 mt->w = calloc(mt->n_threads, sizeof(worker_t));
402 for (i = 0; i < mt->n_threads; ++i) {
406 mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE);
408 pthread_attr_init(&mt->attr);
409 pthread_attr_setdetachstate(&mt->attr, PTHREAD_CREATE_JOINABLE);
414 static void mt_destroy(mtaux_t *mt)
417 for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]);
418 for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf);
419 free(mt->blk); free(mt->len); free(mt->w); free(mt->tid);
423 static void *mt_worker(void *data)
426 worker_t *w = (worker_t*)data;
428 for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) {
429 int clen = BGZF_MAX_BLOCK_SIZE;
430 if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0)
431 w->errcode |= BGZF_ERR_ZLIB;
432 memcpy(w->mt->blk[i], w->buf, clen);
433 w->mt->len[i] = clen;
438 static void mt_queue(BGZF *fp)
440 mtaux_t *mt = (mtaux_t*)fp->mt;
441 assert(mt->curr < mt->n_blks); // guaranteed by the caller
442 memcpy(mt->blk[mt->curr], fp->uncompressed_block, fp->block_offset);
443 mt->len[mt->curr] = fp->block_offset;
444 fp->block_offset = 0;
448 static int mt_flush(BGZF *fp)
451 mtaux_t *mt = (mtaux_t*)fp->mt;
452 if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail
453 for (i = 0; i < mt->n_threads; ++i) pthread_create(&mt->tid[i], &mt->attr, mt_worker, &mt->w[i]);
454 for (i = 0; i < mt->n_threads; ++i) {
455 pthread_join(mt->tid[i], 0);
456 fp->errcode |= mt->w[i].errcode;
458 for (i = 0; i < mt->curr; ++i)
459 if (fwrite(mt->blk[i], 1, mt->len[i], fp->fp) != mt->len[i])
460 fp->errcode |= BGZF_ERR_IO;
465 static int mt_lazy_flush(BGZF *fp)
467 mtaux_t *mt = (mtaux_t*)fp->mt;
469 if (mt->curr == mt->n_blks)
474 static ssize_t mt_write(BGZF *fp, const void *data, ssize_t length)
476 const uint8_t *input = data;
477 ssize_t rest = length;
479 int copy_length = BGZF_BLOCK_SIZE - fp->block_offset < rest? BGZF_BLOCK_SIZE - fp->block_offset : rest;
480 memcpy(fp->uncompressed_block + fp->block_offset, input, copy_length);
481 fp->block_offset += copy_length; input += copy_length; rest -= copy_length;
482 if (fp->block_offset == BGZF_BLOCK_SIZE) mt_lazy_flush(fp);
484 return length - rest;
487 /***** END: multi-threading *****/
489 int bgzf_flush(BGZF *fp)
491 if (!fp->is_write) return 0;
492 if (fp->mt) return mt_flush(fp);
493 while (fp->block_offset > 0) {
495 block_length = deflate_block(fp, fp->block_offset);
496 if (block_length < 0) return -1;
497 if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) {
498 fp->errcode |= BGZF_ERR_IO; // possibly truncated file
501 fp->block_address += block_length;
506 int bgzf_flush_try(BGZF *fp, ssize_t size)
508 if (fp->block_offset + size > BGZF_BLOCK_SIZE) {
509 if (fp->mt) return mt_lazy_flush(fp);
510 else return bgzf_flush(fp);
515 ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length)
517 const uint8_t *input = data;
518 int block_length = BGZF_BLOCK_SIZE, bytes_written = 0;
519 assert(fp->is_write);
520 if (fp->mt) return mt_write(fp, data, length);
521 while (bytes_written < length) {
522 uint8_t* buffer = fp->uncompressed_block;
523 int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written;
524 memcpy(buffer + fp->block_offset, input, copy_length);
525 fp->block_offset += copy_length;
526 input += copy_length;
527 bytes_written += copy_length;
528 if (fp->block_offset == block_length && bgzf_flush(fp)) break;
530 return bytes_written;
533 int bgzf_close(BGZF* fp)
535 int ret, count, block_length;
536 if (fp == 0) return -1;
538 if (bgzf_flush(fp) != 0) return -1;
539 fp->compress_level = -1;
540 block_length = deflate_block(fp, 0); // write an empty block
541 count = fwrite(fp->compressed_block, 1, block_length, fp->fp);
542 if (fflush(fp->fp) != 0) {
543 fp->errcode |= BGZF_ERR_IO;
546 if (fp->mt) mt_destroy(fp->mt);
548 ret = fp->is_write? fclose(fp->fp) : _bgzf_close(fp->fp);
549 if (ret != 0) return -1;
550 free(fp->uncompressed_block);
551 free(fp->compressed_block);
557 void bgzf_set_cache_size(BGZF *fp, int cache_size)
559 if (fp) fp->cache_size = cache_size;
562 int bgzf_check_EOF(BGZF *fp)
564 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
567 offset = _bgzf_tell((_bgzf_file_t)fp->fp);
568 if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0;
569 _bgzf_read(fp->fp, buf, 28);
570 _bgzf_seek(fp->fp, offset, SEEK_SET);
571 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
574 int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
577 int64_t block_address;
579 if (fp->is_write || where != SEEK_SET) {
580 fp->errcode |= BGZF_ERR_MISUSE;
583 block_offset = pos & 0xFFFF;
584 block_address = pos >> 16;
585 if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) {
586 fp->errcode |= BGZF_ERR_IO;
589 fp->block_length = 0; // indicates current block has not been loaded
590 fp->block_address = block_address;
591 fp->block_offset = block_offset;
595 int bgzf_is_bgzf(const char *fn)
600 if ((fp = _bgzf_open(fn, "r")) == 0) return 0;
601 n = _bgzf_read(fp, buf, 16);
603 if (n != 16) return 0;
604 return memcmp(g_magic, buf, 16) == 0? 1 : 0;
607 int bgzf_getc(BGZF *fp)
610 if (fp->block_offset >= fp->block_length) {
611 if (bgzf_read_block(fp) != 0) return -2; /* error */
612 if (fp->block_length == 0) return -1; /* end-of-file */
614 c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
615 if (fp->block_offset == fp->block_length) {
616 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
617 fp->block_offset = 0;
618 fp->block_length = 0;
/* Round x up in place to the next power of two (x unchanged if already one).
 * Guarded because other headers may already define it. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
627 int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
630 unsigned char *buf = (unsigned char*)fp->uncompressed_block;
633 if (fp->block_offset >= fp->block_length) {
634 if (bgzf_read_block(fp) != 0) { state = -2; break; }
635 if (fp->block_length == 0) { state = -1; break; }
637 for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
638 if (l < fp->block_length) state = 1;
639 l -= fp->block_offset;
640 if (str->l + l + 1 >= str->m) {
641 str->m = str->l + l + 2;
643 str->s = (char*)realloc(str->s, str->m);
645 memcpy(str->s + str->l, buf + fp->block_offset, l);
647 fp->block_offset += l + 1;
648 if (fp->block_offset >= fp->block_length) {
649 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
650 fp->block_offset = 0;
651 fp->block_length = 0;
653 } while (state == 0);
654 if (str->l == 0 && state < 0) return state;