1 // ***************************************************************************
\r
2 // BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 16 August 2010 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // BGZF routines were adapted from the bgzf.c code developed at the Broad
\r
10 // ---------------------------------------------------------------------------
\r
11 // Provides the basic functionality for reading & writing BGZF files
\r
12 // ***************************************************************************
\r
14 #include <algorithm>
\r
16 using namespace BamTools;
\r
20 BgzfData::BgzfData(bool writeUncompressed)
\r
21 : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
\r
22 , CompressedBlockSize(MAX_BLOCK_SIZE)
\r
27 , IsWriteOnly(false)
\r
28 , IsWriteUncompressed(writeUncompressed)
\r
30 , UncompressedBlock(NULL)
\r
31 , CompressedBlock(NULL)
\r
34 CompressedBlock = new char[CompressedBlockSize];
\r
35 UncompressedBlock = new char[UncompressedBlockSize];
\r
36 } catch( std::bad_alloc& ba ) {
\r
37 printf("BGZF ERROR: unable to allocate memory for our BGZF object.\n");
\r
43 BgzfData::~BgzfData(void) {
\r
44 if( CompressedBlock ) delete[] CompressedBlock;
\r
45 if( UncompressedBlock ) delete[] UncompressedBlock;
\r
49 void BgzfData::Close(void) {
\r
51 // skip if file not open, otherwise set flag
\r
52 if ( !IsOpen ) return;
\r
54 // if writing to file, flush the current BGZF block,
\r
55 // then write an empty block (as EOF marker)
\r
56 if ( IsWriteOnly ) {
\r
58 int blockLength = DeflateBlock();
\r
59 fwrite(CompressedBlock, 1, blockLength, Stream);
\r
68 // compresses the current block
\r
69 int BgzfData::DeflateBlock(void) {
\r
71 // initialize the gzip header
\r
72 char* buffer = CompressedBlock;
\r
73 memset(buffer, 0, 18);
\r
74 buffer[0] = GZIP_ID1;
\r
75 buffer[1] = (char)GZIP_ID2;
\r
76 buffer[2] = CM_DEFLATE;
\r
77 buffer[3] = FLG_FEXTRA;
\r
78 buffer[9] = (char)OS_UNKNOWN;
\r
79 buffer[10] = BGZF_XLEN;
\r
80 buffer[12] = BGZF_ID1;
\r
81 buffer[13] = BGZF_ID2;
\r
82 buffer[14] = BGZF_LEN;
\r
84 // set compression level
\r
85 const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION );
\r
87 // loop to retry for blocks that do not compress enough
\r
88 int inputLength = BlockOffset;
\r
89 int compressedLength = 0;
\r
90 unsigned int bufferSize = CompressedBlockSize;
\r
94 // initialize zstream values
\r
98 zs.next_in = (Bytef*)UncompressedBlock;
\r
99 zs.avail_in = inputLength;
\r
100 zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
\r
101 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
\r
103 // initialize the zlib compression algorithm
\r
104 if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) {
\r
105 printf("BGZF ERROR: zlib deflate initialization failed.\n");
\r
109 // compress the data
\r
110 int status = deflate(&zs, Z_FINISH);
\r
111 if ( status != Z_STREAM_END ) {
\r
115 // reduce the input length and try again
\r
116 if ( status == Z_OK ) {
\r
117 inputLength -= 1024;
\r
118 if( inputLength < 0 ) {
\r
119 printf("BGZF ERROR: input reduction failed.\n");
\r
125 printf("BGZF ERROR: zlib::deflateEnd() failed.\n");
\r
129 // finalize the compression routine
\r
130 if ( deflateEnd(&zs) != Z_OK ) {
\r
131 printf("BGZF ERROR: zlib::deflateEnd() failed.\n");
\r
135 compressedLength = zs.total_out;
\r
136 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
\r
137 if ( compressedLength > MAX_BLOCK_SIZE ) {
\r
138 printf("BGZF ERROR: deflate overflow.\n");
\r
145 // store the compressed length
\r
146 BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
\r
148 // store the CRC32 checksum
\r
149 unsigned int crc = crc32(0, NULL, 0);
\r
150 crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);
\r
151 BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);
\r
152 BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
\r
154 // ensure that we have less than a block of data left
\r
155 int remaining = BlockOffset - inputLength;
\r
156 if ( remaining > 0 ) {
\r
157 if ( remaining > inputLength ) {
\r
158 printf("BGZF ERROR: after deflate, remainder too large.\n");
\r
161 memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);
\r
164 BlockOffset = remaining;
\r
165 return compressedLength;
\r
168 // flushes the data in the BGZF block
\r
169 void BgzfData::FlushBlock(void) {
\r
171 // flush all of the remaining blocks
\r
172 while ( BlockOffset > 0 ) {
\r
174 // compress the data block
\r
175 int blockLength = DeflateBlock();
\r
177 // flush the data to our output stream
\r
178 int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);
\r
180 if ( numBytesWritten != blockLength ) {
\r
181 printf("BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
\r
185 BlockAddress += blockLength;
\r
189 // de-compresses the current block
\r
190 int BgzfData::InflateBlock(const int& blockLength) {
\r
192 // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
\r
196 zs.next_in = (Bytef*)CompressedBlock + 18;
\r
197 zs.avail_in = blockLength - 16;
\r
198 zs.next_out = (Bytef*)UncompressedBlock;
\r
199 zs.avail_out = UncompressedBlockSize;
\r
201 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
\r
202 if ( status != Z_OK ) {
\r
203 printf("BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n");
\r
207 status = inflate(&zs, Z_FINISH);
\r
208 if ( status != Z_STREAM_END ) {
\r
210 printf("BGZF ERROR: could not decompress block - zlib::inflate() failed\n");
\r
214 status = inflateEnd(&zs);
\r
215 if ( status != Z_OK ) {
\r
216 printf("BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n");
\r
220 return zs.total_out;
\r
223 // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)
\r
224 bool BgzfData::Open(const string& filename, const char* mode) {
\r
226 // determine open mode
\r
227 if ( strcmp(mode, "rb") == 0 )
\r
228 IsWriteOnly = false;
\r
229 else if ( strcmp(mode, "wb") == 0)
\r
230 IsWriteOnly = true;
\r
232 printf("BGZF ERROR: unknown file mode: %s\n", mode);
\r
236 // ----------------------------------------------------------------
\r
237 // open Stream to read to/write from file, stdin, or stdout
\r
238 // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03)
\r
240 // read/write BGZF data to/from a file
\r
241 if ( (filename != "stdin") && (filename != "stdout") )
\r
242 Stream = fopen(filename.c_str(), mode);
\r
244 // read BGZF data from stdin
\r
245 else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) )
\r
246 Stream = freopen(NULL, mode, stdin);
\r
248 // write BGZF data to stdout
\r
249 else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) )
\r
250 Stream = freopen(NULL, mode, stdout);
\r
253 printf("BGZF ERROR: unable to open file %s\n", filename.c_str() );
\r
257 // set flag, return success
\r
262 // reads BGZF data into a byte buffer
\r
263 int BgzfData::Read(char* data, const unsigned int dataLength) {
\r
265 if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0;
\r
267 char* output = data;
\r
268 unsigned int numBytesRead = 0;
\r
269 while ( numBytesRead < dataLength ) {
\r
271 int bytesAvailable = BlockLength - BlockOffset;
\r
272 if ( bytesAvailable <= 0 ) {
\r
273 if ( !ReadBlock() ) return -1;
\r
274 bytesAvailable = BlockLength - BlockOffset;
\r
275 if ( bytesAvailable <= 0 ) break;
\r
278 char* buffer = UncompressedBlock;
\r
279 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
\r
280 memcpy(output, buffer + BlockOffset, copyLength);
\r
282 BlockOffset += copyLength;
\r
283 output += copyLength;
\r
284 numBytesRead += copyLength;
\r
287 if ( BlockOffset == BlockLength ) {
\r
288 BlockAddress = ftell64(Stream);
\r
293 return numBytesRead;
\r
296 // reads a BGZF block
\r
297 bool BgzfData::ReadBlock(void) {
\r
299 char header[BLOCK_HEADER_LENGTH];
\r
300 int64_t blockAddress = ftell64(Stream);
\r
302 int count = fread(header, 1, sizeof(header), Stream);
\r
303 if ( count == 0 ) {
\r
308 if ( count != sizeof(header) ) {
\r
309 printf("BGZF ERROR: read block failed - could not read block header\n");
\r
313 if ( !BgzfData::CheckBlockHeader(header) ) {
\r
314 printf("BGZF ERROR: read block failed - invalid block header\n");
\r
318 int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;
\r
319 char* compressedBlock = CompressedBlock;
\r
320 memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
\r
321 int remaining = blockLength - BLOCK_HEADER_LENGTH;
\r
323 count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);
\r
324 if ( count != remaining ) {
\r
325 printf("BGZF ERROR: read block failed - could not read data from block\n");
\r
329 count = InflateBlock(blockLength);
\r
330 if ( count < 0 ) {
\r
331 printf("BGZF ERROR: read block failed - could not decompress block data\n");
\r
335 if ( BlockLength != 0 )
\r
338 BlockAddress = blockAddress;
\r
339 BlockLength = count;
\r
343 // seek to position in BGZF file
\r
344 bool BgzfData::Seek(int64_t position) {
\r
346 if ( !IsOpen ) return false;
\r
348 int blockOffset = (position & 0xFFFF);
\r
349 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
\r
351 if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) {
\r
352 printf("BGZF ERROR: unable to seek in file\n");
\r
357 BlockAddress = blockAddress;
\r
358 BlockOffset = blockOffset;
\r
362 // get file position in BGZF file
\r
363 int64_t BgzfData::Tell(void) {
\r
367 return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
\r
370 // writes the supplied data into the BGZF buffer
\r
371 unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {
\r
373 if ( !IsOpen || !IsWriteOnly ) return false;
\r
376 unsigned int numBytesWritten = 0;
\r
377 const char* input = data;
\r
378 unsigned int blockLength = UncompressedBlockSize;
\r
380 // copy the data to the buffer
\r
381 while ( numBytesWritten < dataLen ) {
\r
383 unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);
\r
384 char* buffer = UncompressedBlock;
\r
385 memcpy(buffer + BlockOffset, input, copyLength);
\r
387 BlockOffset += copyLength;
\r
388 input += copyLength;
\r
389 numBytesWritten += copyLength;
\r
391 if ( BlockOffset == blockLength )
\r
395 return numBytesWritten;
\r