1 // ***************************************************************************
2 // BgzfStream_p.cpp (c) 2011 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 6 October 2011(DB)
6 // ---------------------------------------------------------------------------
7 // Based on BGZF routines developed at the Broad Institute.
8 // Provides the basic functionality for reading & writing BGZF files
9 // Replaces the old BGZF.* files to avoid clashing with other toolkits
10 // ***************************************************************************
12 #include <api/internal/BamException_p.h>
13 #include <api/internal/BgzfStream_p.h>
14 using namespace BamTools;
15 using namespace BamTools::Internal;
22 // ----------------------------
23 // RaiiWrapper implementation
24 // ----------------------------
26 BgzfStream::RaiiWrapper::RaiiWrapper(void)
29 CompressedBlock = new char[Constants::BGZF_MAX_BLOCK_SIZE];
30 UncompressedBlock = new char[Constants::BGZF_DEFAULT_BLOCK_SIZE];
33 BgzfStream::RaiiWrapper::~RaiiWrapper(void) {
36 delete[] CompressedBlock;
37 delete[] UncompressedBlock;
39 UncompressedBlock = 0;
48 // ---------------------------
49 // BgzfStream implementation
50 // ---------------------------
53 BgzfStream::BgzfStream(void)
59 , IsWriteCompressed(true)
63 BgzfStream::~BgzfStream(void) {
67 // checks BGZF block header
68 bool BgzfStream::CheckBlockHeader(char* header) {
69 return (header[0] == Constants::GZIP_ID1 &&
70 header[1] == Constants::GZIP_ID2 &&
71 header[2] == Z_DEFLATED &&
72 (header[3] & Constants::FLG_FEXTRA) != 0 &&
73 BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN &&
74 header[12] == Constants::BGZF_ID1 &&
75 header[13] == Constants::BGZF_ID2 &&
76 BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN );
80 void BgzfStream::Close(void) {
82 // skip if file not open
86 // if writing to file, flush the current BGZF block,
87 // then write an empty block (as EOF marker)
90 const size_t blockLength = DeflateBlock();
91 fwrite(Resources.CompressedBlock, 1, blockLength, Resources.Stream);
94 // flush and close stream
95 fflush(Resources.Stream);
96 fclose(Resources.Stream);
99 // reset initial state
105 IsWriteCompressed = true;
108 // compresses the current block
109 size_t BgzfStream::DeflateBlock(void) {
111 // initialize the gzip header
112 char* buffer = Resources.CompressedBlock;
113 memset(buffer, 0, 18);
114 buffer[0] = Constants::GZIP_ID1;
115 buffer[1] = Constants::GZIP_ID2;
116 buffer[2] = Constants::CM_DEFLATE;
117 buffer[3] = Constants::FLG_FEXTRA;
118 buffer[9] = Constants::OS_UNKNOWN;
119 buffer[10] = Constants::BGZF_XLEN;
120 buffer[12] = Constants::BGZF_ID1;
121 buffer[13] = Constants::BGZF_ID2;
122 buffer[14] = Constants::BGZF_LEN;
124 // set compression level
125 const int compressionLevel = ( IsWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
127 // loop to retry for blocks that do not compress enough
128 int inputLength = BlockOffset;
129 size_t compressedLength = 0;
130 const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE;
134 // initialize zstream values
138 zs.next_in = (Bytef*)Resources.UncompressedBlock;
139 zs.avail_in = inputLength;
140 zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH];
141 zs.avail_out = bufferSize -
142 Constants::BGZF_BLOCK_HEADER_LENGTH -
143 Constants::BGZF_BLOCK_FOOTER_LENGTH;
145 // initialize the zlib compression algorithm
146 int status = deflateInit2(&zs,
149 Constants::GZIP_WINDOW_BITS,
150 Constants::Z_DEFAULT_MEM_LEVEL,
152 if ( status != Z_OK )
153 throw BamException("BgzfStream::DeflateBlock", "zlib deflateInit2 failed");
156 status = deflate(&zs, Z_FINISH);
158 // if not at stream end
159 if ( status != Z_STREAM_END ) {
164 if ( status != Z_OK )
165 throw BamException("BgzfStream::DeflateBlock", "zlib deflate failed");
167 // not enough space available in buffer
168 // try to reduce the input length & re-start loop
170 if ( inputLength <= 0 )
171 throw BamException("BgzfStream::DeflateBlock", "input reduction failed");
175 // finalize the compression routine
176 status = deflateEnd(&zs);
177 if ( status != Z_OK )
178 throw BamException("BgzfStream::DeflateBlock", "zlib deflateEnd failed");
180 // update compressedLength
181 compressedLength = zs.total_out +
182 Constants::BGZF_BLOCK_HEADER_LENGTH +
183 Constants::BGZF_BLOCK_FOOTER_LENGTH;
184 if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE )
185 throw BamException("BgzfStream::DeflateBlock", "deflate overflow");
191 // store the compressed length
192 BamTools::PackUnsignedShort(&buffer[16], static_cast<uint16_t>(compressedLength - 1));
194 // store the CRC32 checksum
195 uint32_t crc = crc32(0, NULL, 0);
196 crc = crc32(crc, (Bytef*)Resources.UncompressedBlock, inputLength);
197 BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc);
198 BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
200 // ensure that we have less than a block of data left
201 int remaining = BlockOffset - inputLength;
202 if ( remaining > 0 ) {
203 if ( remaining > inputLength )
204 throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large");
205 memcpy(Resources.UncompressedBlock, Resources.UncompressedBlock + inputLength, remaining);
208 // update block data & return compressedlength
209 BlockOffset = remaining;
210 return compressedLength;
213 // flushes the data in the BGZF block
214 void BgzfStream::FlushBlock(void) {
216 // flush all of the remaining blocks
217 while ( BlockOffset > 0 ) {
219 // compress the data block
220 const size_t blockLength = DeflateBlock();
222 // flush the data to our output stream
223 const size_t numBytesWritten = fwrite(Resources.CompressedBlock, 1, blockLength, Resources.Stream);
224 if ( numBytesWritten != blockLength ) {
226 s << "expected to write " << blockLength
227 << " bytes during flushing, but wrote " << numBytesWritten;
228 throw BamException("BgzfStream::FlushBlock", s.str());
232 BlockAddress += blockLength;
236 // decompresses the current block
237 size_t BgzfStream::InflateBlock(const size_t& blockLength) {
239 // setup zlib stream object
243 zs.next_in = (Bytef*)Resources.CompressedBlock + 18;
244 zs.avail_in = blockLength - 16;
245 zs.next_out = (Bytef*)Resources.UncompressedBlock;
246 zs.avail_out = Constants::BGZF_DEFAULT_BLOCK_SIZE;
249 int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
250 if ( status != Z_OK )
251 throw BamException("BgzfStream::InflateBlock", "zlib inflateInit failed");
254 status = inflate(&zs, Z_FINISH);
255 if ( status != Z_STREAM_END ) {
257 throw BamException("BgzfStream::InflateBlock", "zlib inflate failed");
261 status = inflateEnd(&zs);
262 if ( status != Z_OK ) {
264 throw BamException("BgzfStream::InflateBlock", "zlib inflateEnd failed");
271 // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)
272 void BgzfStream::Open(const string& filename, const char* mode) {
274 // make sure we're starting with fresh state
278 // determine open mode
279 if ( strcmp(mode, "rb") == 0 )
281 else if ( strcmp(mode, "wb") == 0)
284 const string message = string("unknown file mode: ") + mode;
285 throw BamException("BgzfStream::Open", message);
288 // open BGZF stream on a file
289 if ( (filename != "stdin") && (filename != "stdout") && (filename != "-"))
290 Resources.Stream = fopen(filename.c_str(), mode);
292 // open BGZF stream on stdin
293 else if ( (filename == "stdin" || filename == "-") && (strcmp(mode, "rb") == 0 ) )
294 Resources.Stream = freopen(NULL, mode, stdin);
296 // open BGZF stream on stdout
297 else if ( (filename == "stdout" || filename == "-") && (strcmp(mode, "wb") == 0) )
298 Resources.Stream = freopen(NULL, mode, stdout);
300 // ensure valid Stream
301 if ( !Resources.Stream ) {
302 const string message = string("unable to open file: ") + filename;
303 throw BamException("BgzfStream::Open", message);
310 // reads BGZF data into a byte buffer
311 size_t BgzfStream::Read(char* data, const size_t dataLength) {
313 // if stream not open for reading (or empty request)
314 if ( !IsOpen || IsWriteOnly || dataLength == 0 )
317 // read blocks as needed until desired data length is retrieved
318 size_t numBytesRead = 0;
319 while ( numBytesRead < dataLength ) {
321 // determine bytes available in current block
322 int bytesAvailable = BlockLength - BlockOffset;
324 // read (and decompress) next block if needed
325 if ( bytesAvailable <= 0 ) {
327 bytesAvailable = BlockLength - BlockOffset;
328 if ( bytesAvailable <= 0 )
332 // copy data from uncompressed source buffer into data destination buffer
333 const size_t copyLength = min( (dataLength-numBytesRead), (size_t)bytesAvailable );
334 memcpy(data, Resources.UncompressedBlock + BlockOffset, copyLength);
337 BlockOffset += copyLength;
339 numBytesRead += copyLength;
343 if ( BlockOffset == BlockLength ) {
344 BlockAddress = ftell64(Resources.Stream);
349 // return actual number of bytes read
353 // reads a BGZF block
354 void BgzfStream::ReadBlock(void) {
357 int64_t blockAddress = ftell64(Resources.Stream);
359 // read block header from file
360 char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
361 size_t count = fread(header, 1, Constants::BGZF_BLOCK_HEADER_LENGTH, Resources.Stream);
363 // if block header empty, set marker & skip rest of method
369 // if block header invalid size
370 if ( count != sizeof(header) )
371 throw BamException("BgzfStream::ReadBlock", "invalid block header size");
373 // validate block header contents
374 if ( !BgzfStream::CheckBlockHeader(header) )
375 throw BamException("BgzfStream::ReadBlock", "invalid block header contents");
377 // copy header contents to compressed buffer
378 const size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
379 memcpy(Resources.CompressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
381 // read remainder of block
382 const size_t remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
383 count = fread(&Resources.CompressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], 1, remaining, Resources.Stream);
384 if ( count != remaining )
385 throw BamException("BgzfStream::ReadBlock", "could not read data from block");
387 // decompress block data
388 count = InflateBlock(blockLength);
390 // update block metadata
391 if ( BlockLength != 0 )
393 BlockAddress = blockAddress;
397 // seek to position in BGZF file
398 void BgzfStream::Seek(const int64_t& position) {
400 // determine adjusted offset & address
401 int blockOffset = (position & 0xFFFF);
402 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
404 // attempt seek in file
405 if ( fseek64(Resources.Stream, blockAddress, SEEK_SET) != 0 ) {
407 s << "unable to seek to position: " << position;
408 throw BamException("BgzfStream::Seek", s.str());
411 // if successful, update block metadata
413 BlockAddress = blockAddress;
414 BlockOffset = blockOffset;
417 void BgzfStream::SetWriteCompressed(bool ok) {
418 IsWriteCompressed = ok;
421 // get file position in BGZF file
422 int64_t BgzfStream::Tell(void) const {
425 return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
428 // writes the supplied data into the BGZF buffer
429 size_t BgzfStream::Write(const char* data, const size_t dataLength) {
431 // skip if file not open for writing
432 if ( !IsOpen || !IsWriteOnly )
435 // write blocks as needed til all data is written
436 size_t numBytesWritten = 0;
437 const char* input = data;
438 const size_t blockLength = Constants::BGZF_DEFAULT_BLOCK_SIZE;
439 while ( numBytesWritten < dataLength ) {
441 // copy data contents to uncompressed output buffer
442 const size_t copyLength = min(blockLength - BlockOffset, dataLength - numBytesWritten);
443 char* buffer = Resources.UncompressedBlock;
444 memcpy(buffer + BlockOffset, input, copyLength);
447 BlockOffset += copyLength;
449 numBytesWritten += copyLength;
451 // flush (& compress) output buffer when full
452 if ( BlockOffset == blockLength )
456 // return actual number of bytes written
457 return numBytesWritten;