3 * SOFTWARE COPYRIGHT NOTICE AGREEMENT
4 * This software and its documentation are copyright 2008 by the
5 * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
7 * This software is supplied without any warranty or guaranteed support whatsoever.
8 * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
17 #include <sys/types.h>
// Prototypes for the off_t-based stream positioning calls used later in this
// file: ftello to record block addresses, fseeko to reposition for a seek.
25 extern off_t ftello(FILE *stream);
26 extern int fseeko(FILE *stream, off_t offset, int whence);
// Buffer sizing: both the writer's uncompressed block and the compressed
// block are 64 * 1024 bytes.
31 static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
32 static const int MAX_BLOCK_SIZE = 64 * 1024;
// Bytes reserved before and after the deflate payload of every block.
// The footer holds a 32-bit CRC followed by the 32-bit input length.
34 static const int BLOCK_HEADER_LENGTH = 18;
35 static const int BLOCK_FOOTER_LENGTH = 8;
// Fixed gzip member header fields (values as defined by RFC 1952).
37 static const int GZIP_ID1 = 31;
38 static const int GZIP_ID2 = 139;
39 static const int CM_DEFLATE = 8;
40 static const int FLG_FEXTRA = 4;
41 static const int OS_UNKNOWN = 255;
// Identifier bytes and lengths of the extra subfield written into each
// block header; the subfield payload is the 16-bit block length.
42 static const int BGZF_ID1 = 66; // 'B'
43 static const int BGZF_ID2 = 67; // 'C'
44 static const int BGZF_LEN = 2;
45 static const int BGZF_XLEN = 6; // BGZF_LEN+4
// zlib parameters used when initialising the deflate and inflate streams.
47 static const int GZIP_WINDOW_BITS = -15; // no zlib header
48 static const int Z_DEFAULT_MEM_LEVEL = 8;
// Writes a 16-bit value into buffer. The visible store places the high byte
// at buffer[1], the layout unpackInt16 reads back (low byte first).
// NOTE(review): the store to buffer[0] is not shown in this excerpt.
53 packInt16(uint8_t* buffer, uint16_t value)
56 buffer[1] = value >> 8;
// Reads a 16-bit value stored low byte first: buffer[0] supplies the low
// eight bits and buffer[1] the high eight bits.
61 unpackInt16(const uint8_t* buffer)
63 return (buffer[0] | (buffer[1] << 8));
// Writes a 32-bit value into buffer with more significant bytes at higher
// offsets; the stores to buffer[1]..buffer[3] are shown here.
// NOTE(review): the store to buffer[0] is not shown in this excerpt.
68 packInt32(uint8_t* buffer, uint32_t value)
71 buffer[1] = value >> 8;
72 buffer[2] = value >> 16;
73 buffer[3] = value >> 24;
// Yields the smaller of x and y (y when the two are equal).
// NOTE(review): the enclosing signature is not shown here; presumably this is
// the min() helper called by the read and write loops -- confirm.
80 return (x < y) ? x : y;
// Error hook: failure paths in this file call it with the BGZF handle and a
// fixed message string. Its body is not shown in this excerpt.
85 report_error(BGZF* fp, const char* message) {
// Builds a BGZF handle for reading from an existing descriptor.
// NOTE(review): the signature is not shown here; presumably this is
// open_read(fd), which bgzf_fdopen calls for mode "r" -- confirm.
// NOTE(review): the fdopen and malloc results are used without a NULL check.
93 FILE* file = fdopen(fd, "r");
94 BGZF* fp = (BGZF*)malloc(sizeof(BGZF));
95 fp->file_descriptor = fd;
// A reader allocates both buffers up front at the maximum block size.
99 fp->uncompressed_block_size = MAX_BLOCK_SIZE;
100 fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
101 fp->compressed_block_size = MAX_BLOCK_SIZE;
102 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
// Start at file offset 0 with no block loaded.
103 fp->block_address = 0;
104 fp->block_offset = 0;
105 fp->block_length = 0;
// Builds a BGZF handle for writing to an existing descriptor.
// NOTE(review): the signature is not shown here; presumably this is
// open_write(fd), which bgzf_fdopen calls for mode "w" -- confirm.
// NOTE(review): the fdopen and malloc results are used without a NULL check.
114 FILE* file = fdopen(fd, "w");
115 BGZF* fp = (BGZF*)malloc(sizeof(BGZF));
116 fp->file_descriptor = fd;
// The uncompressed buffer is left NULL here; bgzf_write allocates it on
// first use. The compressed buffer is allocated immediately.
120 fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
121 fp->uncompressed_block = NULL;
122 fp->compressed_block_size = MAX_BLOCK_SIZE;
123 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
// Nothing written yet: address 0 and an empty pending block.
124 fp->block_address = 0;
125 fp->block_offset = 0;
126 fp->block_length = 0;
// Opens path and wraps the descriptor in a BGZF handle. mode is compared
// case-insensitively: "r" opens read-only, "w" opens write-only and
// creates/truncates the file with permissions 0644.
// NOTE(review): the statements following each open() call (error handling,
// handle construction, return) are not shown in this excerpt.
132 bgzf_open(const char* __restrict path, const char* __restrict mode)
135 if (strcasecmp(mode, "r") == 0) {
136 int oflag = O_RDONLY;
137 int fd = open(path, oflag);
139 } else if (strcasecmp(mode, "w") == 0) {
140 int oflag = O_WRONLY | O_CREAT | O_TRUNC;
141 int fd = open(path, oflag, 0644);
// Wraps an already-open descriptor in a BGZF handle. mode is compared
// case-insensitively: "r" dispatches to open_read, "w" to open_write.
// NOTE(review): the fallthrough for any other mode is not shown here.
151 bgzf_fdopen(int fd, const char * __restrict mode)
153 if (strcasecmp(mode, "r") == 0) {
154 return open_read(fd);
155 } else if (strcasecmp(mode, "w") == 0) {
156 return open_write(fd);
// Compresses the first block_length bytes of fp->uncompressed_block into
// fp->compressed_block as one self-contained gzip member.
// Returns the total compressed length including header and footer.
// On return fp->block_offset holds the number of input bytes that did not fit
// and were moved to the start of fp->uncompressed_block.
164 deflate_block(BGZF* fp, int block_length)
166 // Deflate the block in fp->uncompressed_block into fp->compressed_block.
167 // Also adds an extra field that stores the compressed block length.
169 byte* buffer = (byte*)fp->compressed_block;
170 int buffer_size = fp->compressed_block_size;
// Fixed gzip header fields, followed by the extra subfield identifiers.
173 buffer[0] = GZIP_ID1;
174 buffer[1] = GZIP_ID2;
175 buffer[2] = CM_DEFLATE;
176 buffer[3] = FLG_FEXTRA;
177 buffer[4] = 0; // mtime
182 buffer[9] = OS_UNKNOWN;
183 buffer[10] = BGZF_XLEN;
185 buffer[12] = BGZF_ID1;
186 buffer[13] = BGZF_ID2;
187 buffer[14] = BGZF_LEN;
189 buffer[16] = 0; // placeholder for block length
192 // loop to retry for blocks that do not compress enough
193 int input_length = block_length;
194 int compressed_length = 0;
// Output starts right after the header and leaves room for the footer.
200 zs.next_in = (Bytef*)fp->uncompressed_block;
201 zs.avail_in = input_length;
202 zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
203 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
205 int status = deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
206 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
207 if (status != Z_OK) {
208 report_error(fp, "deflate init failed");
// Z_FINISH asks zlib to compress the whole input in a single call.
211 status = deflate(&zs, Z_FINISH);
212 if (status != Z_STREAM_END) {
214 if (status == Z_OK) {
215 // Not enough space in buffer.
216 // Can happen in the rare case the input doesn't compress enough.
217 // Reduce the amount of input until it fits.
218 input_length -= 1024;
219 if (input_length <= 0) {
220 // should never happen
221 report_error(fp, "input reduction failed");
226 report_error(fp, "deflate failed");
229 status = deflateEnd(&zs);
230 if (status != Z_OK) {
231 report_error(fp, "deflate end failed");
234 compressed_length = zs.total_out;
235 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
236 if (compressed_length > MAX_BLOCK_SIZE) {
237 // should never happen
238 report_error(fp, "deflate overflow");
// The header stores the total block length minus one; the reader adds the
// one back after unpacking it.
244 packInt16((uint8_t*)&buffer[16], compressed_length-1);
// Footer: CRC32 of the consumed input, then the consumed input length.
245 uint32_t crc = crc32(0L, NULL, 0L);
246 crc = crc32(crc, (Bytef*)fp->uncompressed_block, input_length);
247 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
248 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
// Input bytes dropped by the retry loop remain pending for the next block.
250 int remaining = block_length - input_length;
252 if (remaining > input_length) {
253 // should never happen (check so we can use memcpy)
254 report_error(fp, "remainder too large");
// The check above ensures source and destination ranges do not overlap.
257 memcpy(fp->uncompressed_block,
258 (char*)fp->uncompressed_block + input_length,
261 fp->block_offset = remaining;
262 return compressed_length;
// Decompresses one block from fp->compressed_block into
// fp->uncompressed_block; block_length is the full length of the compressed
// block including header and footer.
// NOTE(review): the return statement is not shown here; read_block stores the
// result as the block's uncompressed length -- confirm.
267 inflate_block(BGZF* fp, int block_length)
269 // Inflate the block in fp->compressed_block into fp->uncompressed_block
// Input starts after the 18-byte block header.
// NOTE(review): header (18) plus footer (8) is 26 bytes, so avail_in of
// block_length - 16 is 10 bytes more than the deflate payload and extends
// past the end of the block. Presumably harmless because inflate stops at the
// end of the deflate stream -- confirm, especially for a 64 KiB block.
274 zs.next_in = (Bytef*)fp->compressed_block + 18;
275 zs.avail_in = block_length - 16;
276 zs.next_out = (Bytef*)fp->uncompressed_block;
277 zs.avail_out = fp->uncompressed_block_size;
279 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
280 if (status != Z_OK) {
281 report_error(fp, "inflate init failed");
// Z_FINISH: the whole block must decompress in one call.
284 status = inflate(&zs, Z_FINISH);
285 if (status != Z_STREAM_END) {
287 report_error(fp, "inflate failed");
290 status = inflateEnd(&zs);
291 if (status != Z_OK) {
292 report_error(fp, "inflate failed");
// Returns nonzero when header looks like a block header written by this file:
// gzip identification bytes, deflate compression method, the FEXTRA flag set,
// an extra-field length of BGZF_XLEN, and a first subfield tagged
// BGZF_ID1/BGZF_ID2 whose length is BGZF_LEN.
// header must provide at least 16 readable bytes (indices 0..15 are read).
// NOTE(review): the method byte is compared with zlib's Z_DEFLATED rather
// than this file's CM_DEFLATE; both are expected to be 8 -- confirm.
300 check_header(const byte* header)
302 return (header[0] == GZIP_ID1 &&
303 header[1] == (byte) GZIP_ID2 &&
304 header[2] == Z_DEFLATED &&
305 (header[3] & FLG_FEXTRA) != 0 &&
306 unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
307 header[12] == BGZF_ID1 &&
308 header[13] == BGZF_ID2 &&
309 unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
// Loads and decompresses the next block from fp->file.
// NOTE(review): the signature is not shown here; presumably this is
// read_block(fp), which bgzf_read calls when its block is exhausted -- confirm.
316 byte header[BLOCK_HEADER_LENGTH];
// Remember where this block starts in the file before consuming its header.
317 int64_t block_address = ftello(fp->file);
318 int count = fread(header, 1, sizeof(header), fp->file);
// NOTE(review): the condition guarding this reset is not shown here;
// presumably the end-of-file case -- confirm.
320 fp->block_length = 0;
323 if (count != sizeof(header)) {
324 report_error(fp, "read failed");
327 if (!check_header(header)) {
328 report_error(fp, "invalid block header");
// The header stores the total block length minus one.
// NOTE(review): block_length comes from the file and is not checked against
// BLOCK_HEADER_LENGTH, so remaining can be negative when passed to fread.
331 int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
332 byte* compressed_block = (byte*) fp->compressed_block;
333 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
334 int remaining = block_length - BLOCK_HEADER_LENGTH;
335 count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
336 if (count != remaining) {
337 report_error(fp, "read failed");
// From here count holds the value returned by inflate_block.
340 count = inflate_block(fp, block_length);
// bgzf_seek sets block_length to 0 and block_offset to the requested offset,
// so a zero block_length here means the offset must be preserved.
344 if (fp->block_length != 0) {
345 // Do not reset offset if this read follows a seek.
346 fp->block_offset = 0;
348 fp->block_address = block_address;
349 fp->block_length = count;
// Copies up to length uncompressed bytes from the stream into data, loading
// further blocks as the current one runs out. Requires a handle opened in
// mode 'r'.
// NOTE(review): the return statements are not shown here; presumably the
// number of bytes copied is returned -- confirm.
354 bgzf_read(BGZF* fp, void* data, int length)
359 if (fp->open_mode != 'r') {
360 report_error(fp, "file not open for reading");
365 byte* output = (byte*)data;
366 while (bytes_read < length) {
// Bytes still unread in the currently loaded block.
367 int available = fp->block_length - fp->block_offset;
368 if (available <= 0) {
369 if (read_block(fp) != 0) {
// Still nothing available after loading a block: stop reading.
372 available = fp->block_length - fp->block_offset;
373 if (available <= 0) {
377 int copy_length = min(length-bytes_read, available);
378 byte* buffer = (byte*)fp->uncompressed_block;
379 memcpy(output, buffer + fp->block_offset, copy_length);
380 fp->block_offset += copy_length;
381 output += copy_length;
382 bytes_read += copy_length;
// Block fully consumed: point the handle at the current file position
// with no block loaded.
384 if (fp->block_offset == fp->block_length) {
385 fp->block_address = ftello(fp->file);
386 fp->block_offset = 0;
387 fp->block_length = 0;
// Compresses and writes the pending uncompressed data to fp->file.
// It loops because deflate_block may consume only part of the input and
// leave the remainder counted in fp->block_offset.
// NOTE(review): the return statements are not shown here; callers treat a
// nonzero result as failure.
394 flush_block(BGZF* fp)
396 while (fp->block_offset > 0) {
397 int block_length = deflate_block(fp, fp->block_offset);
398 if (block_length < 0) {
401 int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
402 if (count != block_length) {
403 report_error(fp, "write failed");
// Advance by the number of compressed bytes just written.
406 fp->block_address += block_length;
// Appends length bytes from data to the stream, buffering them in
// fp->uncompressed_block and flushing a compressed block whenever the buffer
// fills. Requires a handle opened in mode 'w'.
// Returns the number of bytes accepted.
412 bgzf_write(BGZF* fp, const void* data, int length)
414 if (fp->open_mode != 'w') {
415 report_error(fp, "file not open for writing");
// The writer's uncompressed buffer is allocated on first use.
// NOTE(review): the malloc result is not checked in the visible lines.
419 if (fp->uncompressed_block == NULL) {
420 fp->uncompressed_block = malloc(fp->uncompressed_block_size);
423 const byte* input = (byte*)data;
424 int block_length = fp->uncompressed_block_size;
425 int bytes_written = 0;
426 while (bytes_written < length) {
// Copy as much as fits in the space left in the current block.
427 int copy_length = min(block_length - fp->block_offset, length - bytes_written);
428 byte* buffer = (byte*)fp->uncompressed_block;
429 memcpy(buffer + fp->block_offset, input, copy_length);
430 fp->block_offset += copy_length;
431 input += copy_length;
432 bytes_written += copy_length;
// Buffer full: compress and write it out before accepting more input.
433 if (fp->block_offset == block_length) {
434 if (flush_block(fp) != 0) {
439 return bytes_written;
// Shuts the handle down: a writer first flushes its pending block and the
// stdio stream, the FILE is closed when the handle owns it, and both block
// buffers are freed.
// NOTE(review): the signature and return statements are not shown here;
// presumably this is the body of bgzf_close -- confirm.
445 if (fp->open_mode == 'w') {
446 if (flush_block(fp) != 0) {
449 if (fflush(fp->file) != 0) {
450 report_error(fp, "flush failed");
// Only close the stream if this handle owns it.
454 if (fp->owned_file) {
455 if (fclose(fp->file) != 0) {
459 free(fp->uncompressed_block);
460 free(fp->compressed_block);
// Packs the current position into one integer: the block's file address in
// the bits above 16 and the offset within the block in the low 16 bits.
// bgzf_seek decodes the same layout.
// NOTE(review): the enclosing signature is not shown here; presumably this is
// bgzf_tell -- confirm.
468 return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF));
// Repositions a read-mode handle to pos, where the low 16 bits of pos are the
// offset within the uncompressed block and the next 48 bits are the block's
// file address. Only SEEK_SET is supported.
// The target block is not loaded here; block_length is set to 0 so the next
// block load keeps the requested block_offset.
// NOTE(review): the return statements are not shown in this excerpt.
472 bgzf_seek(BGZF* fp, int64_t pos, int where)
474 if (fp->open_mode != 'r') {
475 report_error(fp, "file not open for read");
478 if (where != SEEK_SET) {
479 report_error(fp, "unimplemented seek option");
// Split pos into its offset-in-block and block-address parts.
482 int block_offset = pos & 0xFFFF;
483 int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
484 if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
485 report_error(fp, "seek failed");
488 fp->block_length = 0; // indicates current block is not loaded
489 fp->block_address = block_address;
490 fp->block_offset = block_offset;