2 * RAZF : Random Access compressed(Z) File
4 * Release Date: 2008-10-27
6 * Copyright 2008, Jue Ruan <ruanjue@gmail.com>, Heng Li <lh3@sanger.ac.uk>
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * To compile razf.c, zlib-1.2.3(or greater) is required.
/*
 * Reverse the byte order of a 32-bit value
 * (e.g. 0x11223344 becomes 0x44332211).
 */
static inline uint32_t byte_swap_4(uint32_t v){
	/* Swap the two 16-bit halves first... */
	v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
	/* ...then swap the two bytes inside each half. */
	return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
}
/*
 * Reverse the byte order of a 64-bit value
 * (e.g. 0x0102030405060708 becomes 0x0807060504030201).
 */
static inline uint64_t byte_swap_8(uint64_t v){
	/* Swap 32-bit halves, then 16-bit quarters, then adjacent bytes. */
	v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
	v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
	return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
}
/*
 * Report the host byte order.
 *
 * Stores the integer 1 and inspects its first byte in memory: on a
 * little-endian host that byte is 0x01, on a big-endian host it is 0x00.
 *
 * Returns non-zero when the host is big-endian, 0 otherwise.
 */
static inline int is_big_endian(void){
	const int x = 0x01;
	const unsigned char *c = (const unsigned char*)&x;
	return (c[0] != 0x01);
}
/*
 * Record the compressed offset `out` as the next entry of rz->index.
 *
 * When the index is full (size == cap) the capacity grows to cap * 1.5 + 2
 * and both offset arrays are reallocated. Every RZ_BIN_SIZE-th entry also
 * stores `out` as the base offset of its bin; each cell then stores `out`
 * relative to the base offset of the bin it belongs to.
 *
 * NOTE(review): the realloc() results overwrite the original pointers and
 * are not checked for NULL. The parameter `in` is not used by any visible
 * line. The embedded line numbers skip inside this function, so some
 * statements (presumably a closing brace and the size increment) are missing
 * from this listing -- verify against the pristine file.
 */
58 static void add_zindex(RAZF *rz, int64_t in, int64_t out){
59 if(rz->index->size == rz->index->cap){
60 rz->index->cap = rz->index->cap * 1.5 + 2;
61 rz->index->cell_offsets = realloc(rz->index->cell_offsets, sizeof(int) * rz->index->cap);
62 rz->index->bin_offsets = realloc(rz->index->bin_offsets, sizeof(int64_t) * (rz->index->cap/RZ_BIN_SIZE + 1));
/* First entry of a bin: remember the absolute offset as the bin base. */
64 if(rz->index->size % RZ_BIN_SIZE == 0) rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE] = out;
/* Cell offsets are stored relative to their bin base. */
65 rz->index->cell_offsets[rz->index->size] = out - rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE];
/*
 * Write the block index of `rz` to file descriptor `fd`.
 *
 * Output order: the entry count (32 bits), then size / RZ_BIN_SIZE + 1
 * 64-bit bin offsets, then `size` 32-bit cell offsets. On a big-endian host
 * the count is written as-is; the byte-swapped count is written otherwise
 * (the `else` joining the two is on a line missing from this listing).
 *
 * NOTE(review): the two swap loops rewrite the index arrays in place; they
 * are presumably guarded by a host-endianness test on a missing line --
 * confirm. None of the write() return values are checked.
 */
69 static void save_zindex(RAZF *rz, int fd){
72 is_be = is_big_endian();
73 if(is_be) write(fd, &rz->index->size, sizeof(int));
75 v32 = byte_swap_4((uint32_t)rz->index->size);
76 write(fd, &v32, sizeof(uint32_t));
/* Number of bins covering `size` cells. */
78 v32 = rz->index->size / RZ_BIN_SIZE + 1;
80 for(i=0;i<v32;i++) rz->index->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]);
81 for(i=0;i<rz->index->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]);
83 write(fd, rz->index->bin_offsets, sizeof(int64_t) * v32);
84 write(fd, rz->index->cell_offsets, sizeof(int32_t) * rz->index->size);
/*
 * Read the block index from `fd` into rz->index.
 *
 * Does nothing unless rz->load_index is set. Allocates rz->index if it is
 * NULL, reads the 32-bit entry count (byte-swapped when the host is not
 * big-endian), sets cap to that count, then allocates and reads
 * size / RZ_BIN_SIZE + 1 bin offsets followed by `size` cell offsets.
 *
 * NOTE(review): the trailing swap loops are presumably guarded by a
 * host-endianness test on a line missing from this listing -- confirm.
 * The read() and malloc() results are not checked, and the count read from
 * the file sizes the allocations without validation.
 */
87 static void load_zindex(RAZF *rz, int fd){
90 if(!rz->load_index) return;
91 if(rz->index == NULL) rz->index = malloc(sizeof(ZBlockIndex));
92 is_be = is_big_endian();
93 read(fd, &rz->index->size, sizeof(int));
94 if(!is_be) rz->index->size = byte_swap_4((uint32_t)rz->index->size);
95 rz->index->cap = rz->index->size;
/* Number of bins covering `size` cells. */
96 v32 = rz->index->size / RZ_BIN_SIZE + 1;
97 rz->index->bin_offsets = malloc(sizeof(int64_t) * v32);
98 read(fd, rz->index->bin_offsets, sizeof(int64_t) * v32);
99 rz->index->cell_offsets = malloc(sizeof(int) * rz->index->size);
100 read(fd, rz->index->cell_offsets, sizeof(int) * rz->index->size);
102 for(i=0;i<v32;i++) rz->index->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]);
103 for(i=0;i<rz->index->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]);
/*
 * Create a RAZF handle for writing.
 *
 * Allocates a zeroed RAZF, a zeroed z_stream, input and output buffers of
 * RZ_BUFFER_SIZE bytes and a zeroed block index, then initialises deflate
 * with windowBits = WINDOW_BITS + 16 (per the zlib manual, adding 16 selects
 * the gzip wrapper). A gz_header is attached whose 7-byte "extra" field is:
 *   bytes 0-3  the tag "RAZF" (no terminating NUL is stored)
 *   byte  4    the value 1 (marked obsolete)
 *   bytes 5-6  RZ_BLOCK_SIZE, most significant byte first
 *
 * NOTE(review): no allocation result or zlib return code is checked in the
 * visible lines. The embedded line numbers skip, so the declaration of `rz`,
 * any use of `fd`, and the return statement are missing from this listing.
 */
107 static RAZF* razf_open_w(int fd){
109 rz = calloc(1, sizeof(RAZF));
112 rz->stream = calloc(sizeof(z_stream), 1);
113 rz->inbuf = malloc(RZ_BUFFER_SIZE);
114 rz->outbuf = malloc(RZ_BUFFER_SIZE);
115 rz->index = calloc(sizeof(ZBlockIndex), 1);
116 deflateInit2(rz->stream, RZ_COMPRESS_LEVEL, Z_DEFLATED, WINDOW_BITS + 16, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);
117 rz->stream->avail_out = RZ_BUFFER_SIZE;
118 rz->stream->next_out = rz->outbuf;
119 rz->header = calloc(sizeof(gz_header), 1);
120 rz->header->os = 0x03; //Unix
121 rz->header->text = 0;
122 rz->header->time = 0;
123 rz->header->extra = malloc(7);
124 strncpy((char*)rz->header->extra, "RAZF", 4);
125 rz->header->extra[4] = 1; // obsolete field
126 // block size = RZ_BLOCK_SIZE, Big-Endian
127 rz->header->extra[5] = RZ_BLOCK_SIZE >> 8;
128 rz->header->extra[6] = RZ_BLOCK_SIZE & 0xFF;
129 rz->header->extra_len = 7;
130 rz->header->name = rz->header->comment = 0;
131 rz->header->hcrc = 0;
/* Register the header so deflate emits it at the start of the stream. */
132 deflateSetHeader(rz->stream, rz->header);
133 rz->block_pos = rz->block_off = 0;
/*
 * Compress `size` bytes at `data` with deflate(Z_NO_FLUSH).
 *
 * Each pass records avail_out, runs deflate, and adds the number of bytes
 * produced to rz->out. If the output buffer still has room the pass sequence
 * stops; otherwise the (full) buffer is written to rz->filedes and reset,
 * and the sequence stops once all input has been consumed. Finally the
 * number of input bytes consumed is added to rz->in and rz->block_off.
 *
 * NOTE(review): the loop construct and the declaration of `tout` are on
 * lines missing from this listing. write() and deflate() results are not
 * checked.
 */
137 static void _razf_write(RAZF* rz, const void *data, int size){
139 rz->stream->avail_in = size;
140 rz->stream->next_in = (void*)data;
142 tout = rz->stream->avail_out;
143 deflate(rz->stream, Z_NO_FLUSH);
144 rz->out += tout - rz->stream->avail_out;
/* Output buffer not full yet: nothing to write to the file. */
145 if(rz->stream->avail_out) break;
146 write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
147 rz->stream->avail_out = RZ_BUFFER_SIZE;
148 rz->stream->next_out = rz->outbuf;
149 if(rz->stream->avail_in == 0) break;
151 rz->in += size - rz->stream->avail_in;
152 rz->block_off += size - rz->stream->avail_in;
/*
 * Finish the current block with a full flush.
 *
 * Passes the buffered input to _razf_write and clears the buffer counters,
 * writes the pending bytes of the output buffer to rz->filedes, then calls
 * deflate(Z_FULL_FLUSH), adding the bytes produced to rz->out and writing
 * the output buffer whenever it fills up. Afterwards rz->block_pos is set to
 * rz->out, i.e. the compressed offset where the next block starts.
 *
 * NOTE(review): the conditions around the first _razf_write call and the
 * loop around the deflate call are on lines missing from this listing.
 * write() and deflate() results are not checked.
 */
155 static void razf_flush(RAZF *rz){
158 _razf_write(rz, rz->inbuf, rz->buf_len);
159 rz->buf_off = rz->buf_len = 0;
161 if(rz->stream->avail_out){
162 write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
163 rz->stream->avail_out = RZ_BUFFER_SIZE;
164 rz->stream->next_out = rz->outbuf;
167 tout = rz->stream->avail_out;
168 deflate(rz->stream, Z_FULL_FLUSH);
169 rz->out += tout - rz->stream->avail_out;
170 if(rz->stream->avail_out == 0){
171 write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
172 rz->stream->avail_out = RZ_BUFFER_SIZE;
173 rz->stream->next_out = rz->outbuf;
176 rz->block_pos = rz->out;
/*
 * Terminate the deflate stream.
 *
 * Passes the buffered input to _razf_write and clears the buffer counters,
 * then calls deflate(Z_FINISH), adding the bytes produced to rz->out and
 * writing the output buffer to rz->filedes whenever it holds any data
 * (avail_out < RZ_BUFFER_SIZE).
 *
 * NOTE(review): the condition around the _razf_write call and the loop
 * around the deflate call are on lines missing from this listing. write()
 * and deflate() results are not checked.
 */
180 static void razf_end_flush(RAZF *rz){
183 _razf_write(rz, rz->inbuf, rz->buf_len);
184 rz->buf_off = rz->buf_len = 0;
187 tout = rz->stream->avail_out;
188 deflate(rz->stream, Z_FINISH);
189 rz->out += tout - rz->stream->avail_out;
190 if(rz->stream->avail_out < RZ_BUFFER_SIZE){
191 write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
192 rz->stream->avail_out = RZ_BUFFER_SIZE;
193 rz->stream->next_out = rz->outbuf;
/*
 * Append `size` bytes from `data` to the input buffer rz->inbuf.
 *
 * When the buffer is exactly full it is handed to _razf_write. If the new
 * data fits (size + buf_len < RZ_BUFFER_SIZE) all of it is copied behind the
 * buffered bytes; otherwise only n = RZ_BUFFER_SIZE - buf_len bytes are
 * copied, which fills the buffer.
 *
 * NOTE(review): the loop construct, the updates of buf_len / data / size and
 * the local declarations are on lines missing from this listing.
 */
198 static void _razf_buffered_write(RAZF *rz, const void *data, int size){
201 if(rz->buf_len == RZ_BUFFER_SIZE){
202 _razf_write(rz, rz->inbuf, rz->buf_len);
205 if(size + rz->buf_len < RZ_BUFFER_SIZE){
206 for(i=0;i<size;i++) ((char*)rz->inbuf + rz->buf_len)[i] = ((char*)data)[i];
/* Not everything fits: copy just enough to fill the buffer. */
210 n = RZ_BUFFER_SIZE - rz->buf_len;
211 for(i=0;i<n;i++) ((char*)rz->inbuf + rz->buf_len)[i] = ((char*)data)[i];
/*
 * Write `size` bytes from `data`, splitting the input at RZ_BLOCK_SIZE
 * boundaries of the uncompressed stream.
 *
 * next_block is the first multiple of RZ_BLOCK_SIZE above rz->in. While the
 * bytes already consumed plus the buffered and pending bytes reach that
 * boundary, the n bytes that complete the block are buffered, an index entry
 * for (rz->in, rz->out) is added and next_block is recomputed. Whatever
 * remains is buffered at the end.
 *
 * NOTE(review): the statements between the buffered write and add_zindex
 * (presumably advancing data / size and flushing the block) and the return
 * statement are on lines missing from this listing -- confirm.
 */
219 int razf_write(RAZF* rz, const void *data, int size){
223 next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
224 while(rz->in + rz->buf_len + size >= next_block){
/* Bytes still needed to complete the current block. */
225 n = next_block - rz->in - rz->buf_len;
226 _razf_buffered_write(rz, data, n);
230 add_zindex(rz, rz->in, rz->out);
231 next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
233 _razf_buffered_write(rz, data, size);
/*
 * Bit masks for the flags byte of a gzip header (data[3] of the file); see
 * RFC 1952 for the header layout. _read_gz_header tests EXTRA_FIELD,
 * ORIG_NAME, COMMENT, HEAD_CRC and RESERVED against that byte.
 */
238 #define ASCII_FLAG 0x01 /* bit 0 set: file probably ascii text */
239 #define HEAD_CRC 0x02 /* bit 1 set: header CRC present */
240 #define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
241 #define ORIG_NAME 0x08 /* bit 3 set: original file name present */
242 #define COMMENT 0x10 /* bit 4 set: file comment present */
243 #define RESERVED 0xE0 /* bits 5..7: reserved */
/*
 * Parse a gzip header held in data[0..size).
 *
 * Returns 0 when the buffer is too short, the magic bytes 0x1f 0x8b are
 * absent, the method is not Z_DEFLATED, or a reserved flag bit is set.
 * Parsing starts after the 10 fixed header bytes (4 + 6 skipped). When the
 * EXTRA_FIELD flag is set, a 16-bit little-endian length is read and
 * *extra_len is set to the number of bytes walked from *extra_off. The
 * NUL-terminated file name and comment are skipped when flagged, and a
 * header CRC requires two more bytes to be present.
 *
 * NOTE(review): the assignments of `method`, `flags` and *extra_off, the
 * walk over the extra field, and the final return are on lines missing from
 * this listing. The success value is presumably the header length in bytes
 * (razf_open_r stores it in header_size) -- confirm.
 */
245 static int _read_gz_header(unsigned char *data, int size, int *extra_off, int *extra_len){
246 int method, flags, n, len;
247 if(size < 2) return 0;
248 if(data[0] != 0x1f || data[1] != 0x8b) return 0;
249 if(size < 4) return 0;
252 if(method != Z_DEFLATED || (flags & RESERVED)) return 0;
253 n = 4 + 6; // Skip 6 bytes
256 if(flags & EXTRA_FIELD){
257 if(size < n + 2) return 0;
/* Extra-field length: two bytes, least significant first. */
258 len = ((int)data[n + 1] << 8) | data[n];
262 if(n >= size) return 0;
266 *extra_len = n - (*extra_off);
/* Skip the NUL-terminated strings, stopping at the end of the buffer. */
268 if(flags & ORIG_NAME) while(n < size && data[n++]);
269 if(flags & COMMENT) while(n < size && data[n++]);
270 if(flags & HEAD_CRC){
271 if(n + 2 > size) return 0;
/*
 * Create a RAZF handle for reading and classify the input as plain, gzip or
 * RAZF.
 *
 * The first RZ_BUFFER_SIZE bytes are read and handed to _read_gz_header.
 *  - Plain path: file_type is FILE_TYPE_PLAIN and the bytes already read are
 *    copied into the output buffer.
 *  - Gzip path: header_size is recorded, inflate is initialised with
 *    -WINDOW_BITS (per the zlib manual a negative windowBits means raw
 *    deflate, which fits because the header was parsed here), and the stream
 *    is pointed at the bytes following the header.
 *  - RAZF path: requires an extra field of at least 7 bytes starting with
 *    "RAZF" whose bytes 5-6 (big-endian) equal RZ_BLOCK_SIZE. Two 64-bit
 *    values are read 16 bytes before end of file: src_end first, then end.
 *    They are byte-swapped on little-endian hosts.
 *
 * NOTE(review): many lines are missing from this listing (local
 * declarations, the PLAIN_FILE label, the branches after the lseek/compare
 * tests, the index loading and the return statements), so the exact control
 * flow between the visible statements must be checked against the pristine
 * file. Results of calloc/malloc/read are not checked in the visible lines.
 */
277 static RAZF* razf_open_r(int fd, int _load_index){
279 int ext_off, ext_len;
282 unsigned char c[] = "RAZF";
283 rz = calloc(1, sizeof(RAZF));
286 rz->stream = calloc(sizeof(z_stream), 1);
287 rz->inbuf = malloc(RZ_BUFFER_SIZE);
288 rz->outbuf = malloc(RZ_BUFFER_SIZE);
/* INT64 maximum serves as the "end not known" sentinel. */
289 rz->end = rz->src_end = 0x7FFFFFFFFFFFFFFFLL;
290 n = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
291 ret = _read_gz_header(rz->inbuf, n, &ext_off, &ext_len);
295 rz->file_type = FILE_TYPE_PLAIN;
296 memcpy(rz->outbuf, rz->inbuf, n);
302 rz->header_size = ret;
303 ret = inflateInit2(rz->stream, -WINDOW_BITS);
304 if(ret != Z_OK){ inflateEnd(rz->stream); goto PLAIN_FILE;}
305 rz->stream->avail_in = n - rz->header_size;
306 rz->stream->next_in = rz->inbuf + rz->header_size;
307 rz->stream->avail_out = RZ_BUFFER_SIZE;
308 rz->stream->next_out = rz->outbuf;
309 rz->file_type = FILE_TYPE_GZ;
310 rz->in = rz->header_size;
311 rz->block_pos = rz->header_size;
312 rz->next_block_pos = rz->header_size;
/* No RAZF tag in the extra field: keep treating the file as plain gzip. */
314 if(ext_len < 7 || memcmp(rz->inbuf + ext_off, c, 4) != 0) return rz;
315 if(((((unsigned char*)rz->inbuf)[ext_off + 5] << 8) | ((unsigned char*)rz->inbuf)[ext_off + 6]) != RZ_BLOCK_SIZE){
316 fprintf(stderr, " -- WARNING: RZ_BLOCK_SIZE is not %d, treat source as gz file. in %s -- %s:%d --\n", RZ_BLOCK_SIZE, __FUNCTION__, __FILE__, __LINE__);
319 rz->load_index = _load_index;
320 rz->file_type = FILE_TYPE_RZ;
/* The last 16 bytes of the file hold two 64-bit sizes. */
321 if(lseek(fd, -16, SEEK_END) == -1){
325 rz->src_end = rz->end = 0x7FFFFFFFFFFFFFFFLL;
327 is_be = is_big_endian();
329 read(fd, &end, sizeof(int64_t));
330 if(!is_be) rz->src_end = (int64_t)byte_swap_8((uint64_t)end);
331 else rz->src_end = end;
332 read(fd, &end, sizeof(int64_t));
333 if(!is_be) rz->end = (int64_t)byte_swap_8((uint64_t)end);
336 rz->stream->avail_in -= n - rz->end;
339 if(rz->end > rz->src_end){
340 lseek(fd, rz->in, SEEK_SET);
343 if(lseek(fd, rz->end, SEEK_SET) != rz->end){
344 lseek(fd, rz->in, SEEK_SET);
/* Return the descriptor to the position just after the initial read. */
348 lseek(fd, n, SEEK_SET);
/*
 * Wrap an already-open file descriptor in a RAZF handle.
 * `mode` is compared case-insensitively: "r" opens for reading with index
 * loading enabled (second argument 1), "w" opens for writing.
 * NOTE(review): the handling of any other mode is on a line missing from
 * this listing.
 */
353 RAZF* razf_dopen(int fd, const char *mode){
354 if(strcasecmp(mode, "r") == 0) return razf_open_r(fd, 1);
355 else if(strcasecmp(mode, "w") == 0) return razf_open_w(fd);
/*
 * Wrap an already-open file descriptor in a RAZF handle without loading the
 * index: "r" (case-insensitive) calls razf_open_r with _load_index = 0,
 * "w" opens for writing.
 * NOTE(review): the opening brace and the handling of any other mode are on
 * lines missing from this listing.
 */
359 RAZF* razf_dopen2(int fd, const char *mode)
361 if(strcasecmp(mode, "r") == 0) return razf_open_r(fd, 0);
362 else if(strcasecmp(mode, "w") == 0) return razf_open_w(fd);
/*
 * Open `filename` and build a RAZF handle on the resulting descriptor.
 * Mode "r" (case-insensitive) opens read-only and forwards `_load_index` to
 * razf_open_r; mode "w" creates or truncates the file with permissions 0644
 * and calls razf_open_w.
 * NOTE(review): the open() result is not checked in the visible lines. The
 * local declarations, the handling of other modes and the return statement
 * are on lines missing from this listing.
 */
366 static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){
369 if(strcasecmp(mode, "r") == 0){
370 fd = open(filename, O_RDONLY);
371 rz = razf_open_r(fd, _load_index);
372 } else if(strcasecmp(mode, "w") == 0){
373 fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
374 rz = razf_open_w(fd);
379 RAZF* razf_open(const char *filename, const char *mode){
380 return _razf_open(filename, mode, 1);
383 RAZF* razf_open2(const char *filename, const char *mode){
384 return _razf_open(filename, mode, 0);
/*
 * Report the uncompressed size (*u_size) and compressed size (*c_size) of
 * the data behind a read handle.
 *
 * Returns 0 when the handle was not opened for reading. For a plain file the
 * size is found on first use by seeking to the end of the descriptor and
 * restoring the previous position; both sizes are then that value.
 *
 * NOTE(review): the case labels other than FILE_TYPE_PLAIN, the *c_size
 * assignment for them, and the success return values are on lines missing
 * from this listing. The src_end == end test presumably detects that the
 * sizes were never read from the file (both still hold the sentinel) --
 * confirm.
 */
387 int razf_get_data_size(RAZF *rz, int64_t *u_size, int64_t *c_size){
389 if(rz->mode != 'r' && rz->mode != 'R') return 0;
390 switch(rz->file_type){
391 case FILE_TYPE_PLAIN:
/* Sentinel value: the end of the file has not been determined yet. */
392 if(rz->end == 0x7fffffffffffffffLL){
393 if((n = lseek(rz->filedes, 0, SEEK_CUR)) == -1) return 0;
394 rz->end = lseek(rz->filedes, 0, SEEK_END);
395 lseek(rz->filedes, n, SEEK_SET);
397 *u_size = *c_size = rz->end;
402 if(rz->src_end == rz->end) return 0;
403 *u_size = rz->src_end;
/*
 * Produce up to `size` bytes of uncompressed data into `data`.
 *
 * Returns 0 immediately once end-of-file or an error has been flagged. A
 * plain file is read straight from the descriptor (a zero-length read sets
 * z_eof). Otherwise inflate() is run with Z_BLOCK until the output is full:
 * when the input buffer is empty it is refilled from the file, never reading
 * past rz->end, and z_eof is set once rz->in has reached rz->end. rz->in
 * advances by the number of compressed bytes consumed. The inflate path
 * returns the number of bytes produced.
 *
 * Per the zlib inflate() documentation, after a Z_BLOCK call data_type bit 7
 * (128) flags a stop at a deflate block boundary and bit 6 (64) flags the
 * last block of the stream; when the former is set and the latter is not,
 * rz->in is saved as next_block_pos.
 *
 * NOTE(review): several lines are missing from this listing (local
 * declarations, the plain-path return, the bodies of the error and
 * stream-end branches, closing braces). A failing read() returns -1, which
 * would be stored into the unsigned avail_in; no visible line handles that.
 */
411 static int _razf_read(RAZF* rz, void *data, int size){
413 if(rz->z_eof || rz->z_err) return 0;
414 if (rz->file_type == FILE_TYPE_PLAIN) {
415 ret = read(rz->filedes, data, size);
416 if (ret == 0) rz->z_eof = 1;
419 rz->stream->avail_out = size;
420 rz->stream->next_out = data;
421 while(rz->stream->avail_out){
422 if(rz->stream->avail_in == 0){
423 if(rz->in >= rz->end){ rz->z_eof = 1; break; }
/* Never read compressed bytes beyond rz->end. */
424 if(rz->end - rz->in < RZ_BUFFER_SIZE){
425 rz->stream->avail_in = read(rz->filedes, rz->inbuf, rz->end -rz->in);
427 rz->stream->avail_in = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
429 if(rz->stream->avail_in == 0){
433 rz->stream->next_in = rz->inbuf;
435 tin = rz->stream->avail_in;
436 ret = inflate(rz->stream, Z_BLOCK);
437 rz->in += tin - rz->stream->avail_in;
438 if(ret == Z_NEED_DICT || ret == Z_MEM_ERROR || ret == Z_DATA_ERROR){
439 fprintf(stderr, "[_razf_read] inflate error: %d (at %s:%d)\n", ret, __FILE__, __LINE__);
443 if(ret == Z_STREAM_END){
447 if ((rz->stream->data_type&128) && !(rz->stream->data_type&64)){
449 rz->next_block_pos = rz->in;
453 return size - rz->stream->avail_out;
/*
 * Read up to `size` uncompressed bytes into `data`.
 *
 * Bytes are served from the decoded buffer rz->outbuf starting at buf_off.
 * If the request is smaller than the buffered amount only `size` bytes are
 * copied; otherwise the whole buffer is copied. block_off advances by the
 * number of bytes handed out, and block_pos is moved to next_block_pos in
 * the branches shown. When the buffer needs refilling it is reloaded with
 * _razf_read, and reading stops once end-of-file is flagged and nothing was
 * decoded. rz->out advances by the number of bytes delivered, which is also
 * the return value.
 *
 * NOTE(review): the loop construct, the updates of size / data / buf_off /
 * buf_len, the initialisation of ori_size and the handling of buf_flush are
 * on lines missing from this listing.
 */
456 int razf_read(RAZF *rz, void *data, int size){
461 if(size < rz->buf_len){
462 for(i=0;i<size;i++) ((char*)data)[i] = ((char*)rz->outbuf + rz->buf_off)[i];
466 rz->block_off += size;
470 for(i=0;i<rz->buf_len;i++) ((char*)data)[i] = ((char*)rz->outbuf + rz->buf_off)[i];
473 rz->block_off += rz->buf_len;
477 rz->block_pos = rz->next_block_pos;
482 } else if(rz->buf_flush){
483 rz->block_pos = rz->next_block_pos;
487 if(rz->buf_flush) continue;
488 rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
489 if(rz->z_eof && rz->buf_len == 0) break;
491 rz->out += ori_size - size;
492 return ori_size - size;
/*
 * Advance the read position by up to `size` uncompressed bytes without
 * copying them out.
 *
 * The visible lines do the same bookkeeping as razf_read: block_off advances
 * by the number of bytes consumed from the decoded buffer, block_pos moves
 * to next_block_pos in the branches shown, and the buffer is refilled with
 * _razf_read. rz->out advances by the number of bytes skipped, which is also
 * the return value.
 *
 * NOTE(review): the loop construct, the updates of size / buf_off / buf_len,
 * the initialisation of ori_size and the end-of-file test are on lines
 * missing from this listing.
 */
495 int razf_skip(RAZF* rz, int size){
500 if(size < rz->buf_len){
503 rz->block_off += size;
510 rz->block_off += rz->buf_len;
512 rz->block_pos = rz->next_block_pos;
517 } else if(rz->buf_flush){
518 rz->block_pos = rz->next_block_pos;
522 if(rz->buf_flush) continue;
523 rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
526 rz->out += ori_size - size;
527 return ori_size - size;
/*
 * Restart decoding at compressed offset `in`.
 *
 * Seeks the descriptor to `in`, records `in` as next_block_pos, clears the
 * end-of-file and error flags, resets the inflate state and empties both the
 * compressed input (avail_in) and the decoded buffer.
 *
 * NOTE(review): the statements that use `out` (and presumably set rz->in,
 * rz->out and the block position fields) are on lines missing from this
 * listing -- confirm. The lseek() result is not checked.
 */
530 static void _razf_reset_read(RAZF *rz, int64_t in, int64_t out){
531 lseek(rz->filedes, in, SEEK_SET);
535 rz->next_block_pos = in;
538 rz->z_eof = rz->z_err = 0;
539 inflateReset(rz->stream);
540 rz->stream->avail_in = 0;
541 rz->buf_off = rz->buf_len = 0;
/*
 * Position the reader at (`block_start`, `block_offset`): the compressed
 * offset of a block plus an uncompressed offset inside that block.
 *
 * For a plain file the two are added and used as a byte offset for lseek().
 * Otherwise, if the reader is already in that block at or before the
 * requested offset, only the remaining distance is skipped without resetting
 * inflate. In the general case decoding restarts at block_start (a value of
 * 0 is replaced by header_size) and block_offset bytes are skipped. The
 * visible return value is rz->block_off.
 *
 * NOTE(review): the plain-file return, the SKIP label, the declaration of
 * `pos` and the closing braces are on lines missing from this listing.
 */
544 int64_t razf_jump(RAZF *rz, int64_t block_start, int block_offset){
547 if(rz->file_type == FILE_TYPE_PLAIN){
548 rz->buf_off = rz->buf_len = 0;
549 pos = block_start + block_offset;
550 pos = lseek(rz->filedes, pos, SEEK_SET);
551 rz->out = rz->in = pos;
554 if(block_start == rz->block_pos && block_offset >= rz->block_off) {
555 block_offset -= rz->block_off;
556 goto SKIP; // Needn't reset inflate
558 if(block_start == 0) block_start = rz->header_size; // Automatically correct a wrong block_start
559 _razf_reset_read(rz, block_start, 0);
561 if(block_offset) razf_skip(rz, block_offset);
562 return rz->block_off;
/*
 * Seek to uncompressed position `pos`, interpreted according to `where`
 * (SEEK_CUR is relative to rz->out, SEEK_END to rz->src_end).
 *
 * A plain file is repositioned with lseek(). A gzip file can only move
 * forward cheaply: a target at or beyond the current position is reached by
 * skipping. For the remaining case the block index is consulted:
 * idx = pos / RZ_BLOCK_SIZE - 1 selects the indexed block start at or before
 * `pos` (idx < 0 means the first block, which starts at header_size). Its
 * compressed offset is cell_offsets[idx] plus the base of its bin. Decoding
 * restarts there unless the current position already lies between that block
 * start and the target, and the remaining distance is skipped.
 *
 * NOTE(review): the SKIP label, the declaration of `idx`, the backward-seek
 * handling for gzip / non-seekable input, and all final return statements
 * are on lines missing from this listing. The distance passed to razf_skip
 * is narrowed to int.
 */
565 int64_t razf_seek(RAZF* rz, int64_t pos, int where){
567 int64_t seek_pos, new_out;
569 if (where == SEEK_CUR) pos += rz->out;
570 else if (where == SEEK_END) pos += rz->src_end;
571 if(rz->file_type == FILE_TYPE_PLAIN){
572 seek_pos = lseek(rz->filedes, pos, SEEK_SET);
573 rz->buf_off = rz->buf_len = 0;
574 rz->out = rz->in = seek_pos;
576 } else if(rz->file_type == FILE_TYPE_GZ){
577 if(pos >= rz->out) goto SKIP;
580 if(pos == rz->out) return pos;
581 if(pos > rz->src_end) return rz->out;
582 if(!rz->seekable || !rz->load_index){
583 if(pos >= rz->out) goto SKIP;
585 idx = pos / RZ_BLOCK_SIZE - 1;
586 seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);
/* Uncompressed position at which the selected block begins. */
587 new_out = (idx + 1) * RZ_BLOCK_SIZE;
/* Already past that block start and before the target: just skip ahead. */
588 if(pos > rz->out && new_out <= rz->out) goto SKIP;
589 _razf_reset_read(rz, seek_pos, new_out);
591 razf_skip(rz, (int)(pos - rz->out));
/*
 * Return the current position as a virtual offset: the compressed block
 * start (block_pos) in the upper bits and the low 16 bits of the offset
 * inside the block (block_off) in the lower 16 bits.
 *
 * When the index is loaded, the block start derived from the index for the
 * current uncompressed position is compared with block_pos / block_off and a
 * diagnostic is printed to stderr on mismatch.
 *
 * NOTE(review): in the fprintf arguments the (long long) cast binds to
 * rz->out before the % operator is applied. The opening brace and closing
 * braces are on lines missing from this listing.
 */
595 uint64_t razf_tell2(RAZF *rz)
598 if (rz->load_index) {
599 int64_t idx, seek_pos;
600 idx = rz->out / RZ_BLOCK_SIZE - 1;
601 seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);
602 if (seek_pos != rz->block_pos || rz->out%RZ_BLOCK_SIZE != rz->block_off)
603 fprintf(stderr, "[razf_tell2] inconsistent block offset: (%lld, %lld) != (%lld, %lld)\n",
604 (long long)seek_pos, (long long)rz->out%RZ_BLOCK_SIZE, (long long)rz->block_pos, (long long) rz->block_off);
607 return (uint64_t)rz->block_pos<<16 | (rz->block_off&0xffff);
610 int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where)
612 if (where != SEEK_SET) return -1;
613 return razf_jump(rz, voffset>>16, voffset&0xffff);
/*
 * Release a RAZF handle.
 *
 * Write path: the deflate state is released, the block index is appended to
 * the file, and rz->in and rz->out are written as two 64-bit values, either
 * directly or byte-swapped (presumably chosen by host endianness so the
 * on-disk form is big-endian, matching the swap razf_open_r applies when
 * reading them back -- confirm). Read path (mode 'r'): the inflate state is
 * released. The visible lines then free the I/O buffers, the header's
 * extra / name / comment fields and the two index arrays.
 *
 * NOTE(review): the mode test for the write path, the endianness branch, the
 * guards around the header and index frees, and the frees of the containing
 * structures are on lines missing from this listing. The `if(ptr)` guards
 * before free() are redundant. write() results are not checked.
 */
616 void razf_close(RAZF *rz){
620 deflateEnd(rz->stream);
621 save_zindex(rz, rz->filedes);
623 write(rz->filedes, &rz->in, sizeof(int64_t));
624 write(rz->filedes, &rz->out, sizeof(int64_t));
626 v64 = byte_swap_8((uint64_t)rz->in);
627 write(rz->filedes, &v64, sizeof(int64_t));
628 v64 = byte_swap_8((uint64_t)rz->out);
629 write(rz->filedes, &v64, sizeof(int64_t));
631 } else if(rz->mode == 'r'){
632 if(rz->stream) inflateEnd(rz->stream);
634 if(rz->inbuf) free(rz->inbuf);
635 if(rz->outbuf) free(rz->outbuf);
637 free(rz->header->extra);
638 free(rz->header->name);
639 free(rz->header->comment);
643 free(rz->index->bin_offsets);
644 free(rz->index->cell_offsets);