1 // ***************************************************************************
2 // BgzfStream_p.cpp (c) 2011 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 11 October 2011(DB)
6 // ---------------------------------------------------------------------------
7 // Based on BGZF routines developed at the Broad Institute.
8 // Provides the basic functionality for reading & writing BGZF files
9 // Replaces the old BGZF.* files to avoid clashing with other toolkits
10 // ***************************************************************************
12 #include "api/BamAux.h"
13 #include "api/BamConstants.h"
14 #include "api/internal/BamDeviceFactory_p.h"
15 #include "api/internal/BamException_p.h"
16 #include "api/internal/BgzfStream_p.h"
17 using namespace BamTools;
18 using namespace BamTools::Internal;
28 // ----------------------------
29 // RaiiWrapper implementation
30 // ----------------------------
// Constructor: allocates the two raw char buffers owned by the wrapper.
32 BgzfStream::RaiiWrapper::RaiiWrapper(void) {
// holds compressed block data; sized to Constants::BGZF_MAX_BLOCK_SIZE
33 CompressedBlock = new char[Constants::BGZF_MAX_BLOCK_SIZE];
// holds uncompressed block data; sized to Constants::BGZF_DEFAULT_BLOCK_SIZE
34 UncompressedBlock = new char[Constants::BGZF_DEFAULT_BLOCK_SIZE];
// Destructor: releases both buffers with delete[] (they were allocated with new[]).
37 BgzfStream::RaiiWrapper::~RaiiWrapper(void) {
40 delete[] CompressedBlock;
41 delete[] UncompressedBlock;
// null the pointer after release so it does not dangle
43 UncompressedBlock = 0;
46 // ---------------------------
47 // BgzfStream implementation
48 // ---------------------------
// Constructor. Among its member initializers, m_isWriteCompressed starts as
// true, so written blocks are compressed unless SetWriteCompressed(false)
// is called.
51 BgzfStream::BgzfStream(void)
55 , m_isWriteCompressed(true)
// Destructor of BgzfStream.
60 BgzfStream::~BgzfStream(void) {
64 // checks BGZF block header
// Returns true only when every test below passes:
//   - bytes 0 and 1 equal the gzip ID constants
//   - byte 2 equals Z_DEFLATED
//   - the FLG_FEXTRA bit is set in byte 3
//   - the unsigned short unpacked at offset 10 equals BGZF_XLEN
//   - bytes 12 and 13 equal the BGZF ID constants
//   - the unsigned short unpacked at offset 14 equals BGZF_LEN
// header: must be readable through the bytes unpacked at offset 14
//         (presumably offsets 0..15, assuming UnpackUnsignedShort reads
//         two bytes - confirm against BamAux.h).
65 bool BgzfStream::CheckBlockHeader(char* header) {
66 return (header[0] == Constants::GZIP_ID1 &&
67 header[1] == Constants::GZIP_ID2 &&
68 header[2] == Z_DEFLATED &&
69 (header[3] & Constants::FLG_FEXTRA) != 0 &&
70 BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN &&
71 header[12] == Constants::BGZF_ID1 &&
72 header[13] == Constants::BGZF_ID2 &&
73 BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN );
// Closes the stream. Does nothing when no IO device is attached.
77 void BgzfStream::Close(void) {
79 // skip if no device open
80 if ( m_device == 0 ) return;
82 // if writing to file, flush the current BGZF block,
83 // then write an empty block (as EOF marker)
84 if ( m_device->IsOpen() && (m_device->Mode() == IBamIODevice::WriteOnly) ) {
// compress whatever DeflateBlock() finds in the uncompressed buffer and
// write the resulting block; the byte count returned by Write() is not
// checked here (unlike in FlushBlock())
86 const size_t blockLength = DeflateBlock();
87 m_device->Write(Resources.CompressedBlock, blockLength);
// restore the constructor's initial write mode (compression enabled)
99 m_isWriteCompressed = true;
102 // compresses the current block
// Builds one BGZF block in Resources.CompressedBlock from data held in
// Resources.UncompressedBlock (initially the first m_blockOffset bytes).
// Layout written: gzip/BGZF header, deflated payload, then a footer with
// the CRC32 and the length of the input that was consumed.
// Returns the total size of the block (payload + header + footer).
// Any input not consumed is moved to the front of the uncompressed buffer
// and m_blockOffset is set to its size.
// Throws BamException when a zlib call fails, when the compressed block
// exceeds BGZF_MAX_BLOCK_SIZE, or when the leftover input is too large.
103 size_t BgzfStream::DeflateBlock(void) {
105 // initialize the gzip header
106 char* buffer = Resources.CompressedBlock;
// zero the first 18 bytes so every field not set explicitly below is 0
107 memset(buffer, 0, 18);
108 buffer[0] = Constants::GZIP_ID1;
109 buffer[1] = Constants::GZIP_ID2;
110 buffer[2] = Constants::CM_DEFLATE;
111 buffer[3] = Constants::FLG_FEXTRA;
112 buffer[9] = Constants::OS_UNKNOWN;
// only byte 10 is assigned; byte 11 keeps the 0 from the memset above
113 buffer[10] = Constants::BGZF_XLEN;
114 buffer[12] = Constants::BGZF_ID1;
115 buffer[13] = Constants::BGZF_ID2;
// only byte 14 is assigned; byte 15 keeps the 0 from the memset above
116 buffer[14] = Constants::BGZF_LEN;
118 // set compression level
// level 0 is used when write-compression has been switched off
119 const int compressionLevel = ( m_isWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
121 // loop to retry for blocks that do not compress enough
// start by trying to compress everything currently buffered
122 int inputLength = m_blockOffset;
123 size_t compressedLength = 0;
124 const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE;
128 // initialize zstream values
132 zs.next_in = (Bytef*)Resources.UncompressedBlock;
133 zs.avail_in = inputLength;
// deflated payload is written right after the header; the space offered to
// zlib leaves room for both the header and the footer
134 zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH];
135 zs.avail_out = bufferSize -
136 Constants::BGZF_BLOCK_HEADER_LENGTH -
137 Constants::BGZF_BLOCK_FOOTER_LENGTH;
139 // initialize the zlib compression algorithm
140 int status = deflateInit2(&zs,
143 Constants::GZIP_WINDOW_BITS,
144 Constants::Z_DEFAULT_MEM_LEVEL,
146 if ( status != Z_OK )
147 throw BamException("BgzfStream::DeflateBlock", "zlib deflateInit2 failed");
// Z_FINISH asks zlib to compress all of the input in this single call
150 status = deflate(&zs, Z_FINISH);
152 // if not at stream end
153 if ( status != Z_STREAM_END ) {
157 // there was not enough space available in buffer
158 // try to reduce the input length & re-start loop
159 if ( status == Z_OK ) {
// guard against the reduced input length dropping below zero
161 if ( inputLength < 0 )
162 throw BamException("BgzfStream::DeflateBlock", "input reduction failed");
166 throw BamException("BgzfStream::DeflateBlock", "zlib deflate failed");
169 // finalize the compression routine
170 status = deflateEnd(&zs);
171 if ( status != Z_OK )
172 throw BamException("BgzfStream::DeflateBlock", "zlib deflateEnd failed");
174 // update compressedLength
175 compressedLength = zs.total_out +
176 Constants::BGZF_BLOCK_HEADER_LENGTH +
177 Constants::BGZF_BLOCK_FOOTER_LENGTH;
178 if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE )
179 throw BamException("BgzfStream::DeflateBlock", "deflate overflow");
185 // store the compressed length
// stored as (length - 1) in an unsigned 16-bit field at offset 16;
// ReadBlock() adds the 1 back when it parses the header
186 BamTools::PackUnsignedShort(&buffer[16], static_cast<uint16_t>(compressedLength - 1));
188 // store the CRC32 checksum
// footer = last 8 bytes of the block: CRC32 of the consumed input, followed
// by the number of input bytes consumed
189 uint32_t crc = crc32(0, NULL, 0);
190 crc = crc32(crc, (Bytef*)Resources.UncompressedBlock, inputLength);
191 BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc);
192 BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
194 // ensure that we have less than a block of data left
195 int remaining = m_blockOffset - inputLength;
196 if ( remaining > 0 ) {
// because remaining <= inputLength is enforced here, the source range
// [inputLength, inputLength + remaining) cannot overlap the destination
// range [0, remaining), which keeps the memcpy below well-defined
197 if ( remaining > inputLength )
198 throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large");
199 memcpy(Resources.UncompressedBlock, Resources.UncompressedBlock + inputLength, remaining);
// the buffer now holds only the bytes that were not compressed
203 m_blockOffset = remaining;
206 return compressedLength;
209 // flushes the data in the BGZF block
// Compresses and writes blocks until no buffered data remains
// (m_blockOffset reaches 0). m_blockAddress is advanced by the compressed
// size of each block written.
// Throws BamException when the device writes fewer bytes than requested.
210 void BgzfStream::FlushBlock(void) {
212 BT_ASSERT_X( m_device, "BgzfStream::FlushBlock() - attempting to flush to null device" );
214 // flush all of the remaining blocks
// DeflateBlock() lowers m_blockOffset to the amount of input it left behind
215 while ( m_blockOffset > 0 ) {
217 // compress the data block
218 const size_t blockLength = DeflateBlock();
220 // flush the data to our output device
221 const size_t numBytesWritten = m_device->Write(Resources.CompressedBlock, blockLength);
222 if ( numBytesWritten != blockLength ) {
224 s << "expected to write " << blockLength
225 << " bytes during flushing, but wrote " << numBytesWritten;
226 throw BamException("BgzfStream::FlushBlock", s.str());
// track where the next block will start in the output
230 m_blockAddress += blockLength;
234 // decompresses the current block
// Inflates the block held in Resources.CompressedBlock into
// Resources.UncompressedBlock.
// blockLength: total length of the compressed block, as passed by ReadBlock().
// Throws BamException when any zlib call fails.
// NOTE(review): ReadBlock() stores the return value as m_blockLength, so it
// is presumably the number of uncompressed bytes produced - confirm.
235 size_t BgzfStream::InflateBlock(const size_t& blockLength) {
237 // setup zlib stream object
// input starts 18 bytes into the block, i.e. after the header fields that
// DeflateBlock() fills in
241 zs.next_in = (Bytef*)Resources.CompressedBlock + 18;
// NOTE(review): with next_in at offset 18, an avail_in of (blockLength - 16)
// spans up to offset blockLength + 2, i.e. 2 bytes beyond the block -
// confirm this is intended
242 zs.avail_in = blockLength - 16;
243 zs.next_out = (Bytef*)Resources.UncompressedBlock;
// output is capped at the size the uncompressed buffer was allocated with
244 zs.avail_out = Constants::BGZF_DEFAULT_BLOCK_SIZE;
247 int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
248 if ( status != Z_OK )
249 throw BamException("BgzfStream::InflateBlock", "zlib inflateInit failed");
// the whole block must inflate in one call; anything but Z_STREAM_END is an error
252 status = inflate(&zs, Z_FINISH);
253 if ( status != Z_STREAM_END ) {
255 throw BamException("BgzfStream::InflateBlock", "zlib inflate failed");
259 status = inflateEnd(&zs);
260 if ( status != Z_OK ) {
262 throw BamException("BgzfStream::InflateBlock", "zlib inflateEnd failed");
// Reports whether the underlying IO device is open by delegating to
// m_device->IsOpen().
269 bool BgzfStream::IsOpen(void) const {
272 return m_device->IsOpen();
// Opens a BGZF stream on an IO device created for 'filename'.
// filename: passed to BamDeviceFactory::CreateDevice() to select the device.
// mode:     open mode forwarded to the device's Open().
// Throws BamException, including the device's error string, when the
// device fails to open.
275 void BgzfStream::Open(const string& filename, const IBamIODevice::OpenMode mode) {
277 // close current device if necessary
279 BT_ASSERT_X( (m_device == 0), "BgzfStream::Open() - unable to properly close previous IO device" );
281 // retrieve new IO device depending on filename
282 m_device = BamDeviceFactory::CreateDevice(filename);
283 BT_ASSERT_X( m_device, "BgzfStream::Open() - unable to create IO device from filename" );
285 // if device fails to open
286 if ( !m_device->Open(mode) ) {
// surface the device's own explanation in the exception message
287 const string deviceError = m_device->GetErrorString();
288 const string message = string("could not open BGZF stream: \n\t") + deviceError;
289 throw BamException("BgzfStream::Open", message);
293 // reads BGZF data into a byte buffer
// data:       destination buffer supplied by the caller.
// dataLength: number of uncompressed bytes requested.
// Copies bytes out of Resources.UncompressedBlock, one block at a time,
// until dataLength bytes have been copied or no more data is available.
294 size_t BgzfStream::Read(char* data, const size_t dataLength) {
296 if ( dataLength == 0 )
299 // if stream not open for reading
300 BT_ASSERT_X( m_device, "BgzfStream::Read() - trying to read from null device");
301 if ( !m_device->IsOpen() || (m_device->Mode() != IBamIODevice::ReadOnly) )
304 // read blocks as needed until desired data length is retrieved
306 size_t numBytesRead = 0;
307 while ( numBytesRead < dataLength ) {
309 // determine bytes available in current block
310 int bytesAvailable = m_blockLength - m_blockOffset;
312 // read (and decompress) next block if needed
313 if ( bytesAvailable <= 0 ) {
// recompute after the refill attempt; still nothing available means
// there is no further data to hand out
315 bytesAvailable = m_blockLength - m_blockOffset;
316 if ( bytesAvailable <= 0 )
320 // copy data from uncompressed source buffer into data destination buffer
// never copy more than the caller still wants or than the block still holds
321 const size_t copyLength = min( (dataLength-numBytesRead), (size_t)bytesAvailable );
322 memcpy(output, Resources.UncompressedBlock + m_blockOffset, copyLength);
// advance the position in the block, the destination cursor and the total
325 m_blockOffset += copyLength;
326 output += copyLength;
327 numBytesRead += copyLength;
// current block fully consumed: record the device position, which is where
// the next block starts
331 if ( m_blockOffset == m_blockLength ) {
332 m_blockAddress = m_device->Tell();
338 // return actual number of bytes read
342 // reads a BGZF block
// Reads one compressed block from the device into Resources.CompressedBlock,
// validates its header, inflates it, and records the block's address and
// uncompressed length in m_blockAddress / m_blockLength.
// Throws BamException on a short or invalid header, or when the rest of the
// block cannot be read in full; InflateBlock() may throw as well.
343 void BgzfStream::ReadBlock(void) {
345 BT_ASSERT_X( m_device, "BgzfStream::ReadBlock() - trying to read from null IO device");
347 // store block's starting address
348 int64_t blockAddress = m_device->Tell();
350 // read block header from file
351 char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
352 size_t numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH);
354 // if block header empty
355 if ( numBytesRead == 0 ) {
360 // if block header invalid size
361 if ( numBytesRead != Constants::BGZF_BLOCK_HEADER_LENGTH )
362 throw BamException("BgzfStream::ReadBlock", "invalid block header size");
364 // validate block header contents
365 if ( !BgzfStream::CheckBlockHeader(header) )
366 throw BamException("BgzfStream::ReadBlock", "invalid block header contents");
368 // copy header contents to compressed buffer
// the header stores (total block length - 1) at offset 16, matching what
// DeflateBlock() writes, so add 1 to recover the total length
369 const size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
370 memcpy(Resources.CompressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
372 // read remainder of block
// NOTE(review): if the stored length is smaller than the header length this
// size_t subtraction wraps around; no check for that is visible here
373 const size_t remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
374 numBytesRead = m_device->Read(&Resources.CompressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining);
375 if ( numBytesRead != remaining )
376 throw BamException("BgzfStream::ReadBlock", "could not read data from block");
378 // decompress block data
379 numBytesRead = InflateBlock(blockLength);
382 if ( m_blockLength != 0 )
// remember where this block started and how many bytes it inflated to
384 m_blockAddress = blockAddress;
385 m_blockLength = numBytesRead;
388 // seek to position in BGZF file
// position: packed offset; the low 16 bits are the offset inside the
//           uncompressed block and the next 48 bits are the address passed
//           to the device's Seek() (the same packing Tell() produces).
// Does nothing when the stream is not open.
// Throws BamException naming the requested position; presumably this is
// the path taken when the device is not random-access or its Seek() fails.
389 void BgzfStream::Seek(const int64_t& position) {
391 BT_ASSERT_X( m_device, "BgzfStream::Seek() - trying to seek on null IO device");
393 // skip if device is not open
394 if ( !IsOpen() ) return;
396 // determine adjusted offset & address
397 int blockOffset = (position & 0xFFFF);
398 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
400 // attempt seek in file
401 if ( m_device->IsRandomAccess() && m_device->Seek(blockAddress) ) {
403 // update block data & return success
405 m_blockAddress = blockAddress;
406 m_blockOffset = blockOffset;
410 s << "unable to seek to position: " << position;
411 throw BamException("BgzfStream::Seek", s.str());
// Enables (true) or disables (false) compression of written blocks.
// DeflateBlock() consults this flag when it picks the zlib compression level.
415 void BgzfStream::SetWriteCompressed(bool ok) {
416 m_isWriteCompressed = ok;
419 // get file position in BGZF file
// Packs the current position into one 64-bit value: m_blockAddress shifted
// left by 16 bits, OR-ed with the low 16 bits of m_blockOffset. Seek()
// splits a position along the same boundary.
420 int64_t BgzfStream::Tell(void) const {
423 return ( (m_blockAddress << 16) | (m_blockOffset & 0xFFFF) );
426 // writes the supplied data into the BGZF buffer
// data:       bytes to write.
// dataLength: number of bytes in 'data'.
// Data is staged in Resources.UncompressedBlock in chunks of at most
// BGZF_DEFAULT_BLOCK_SIZE bytes. Returns the number of bytes accepted.
427 size_t BgzfStream::Write(const char* data, const size_t dataLength) {
429 BT_ASSERT_X( m_device, "BgzfStream::Write() - trying to write to null IO device");
430 BT_ASSERT_X( (m_device->Mode() == IBamIODevice::WriteOnly),
431 "BgzfStream::Write() - trying to write to non-writable IO device");
433 // skip if file not open for writing
437 // write blocks as needed until all data is written
438 size_t numBytesWritten = 0;
439 const char* input = data;
440 const size_t blockLength = Constants::BGZF_DEFAULT_BLOCK_SIZE;
441 while ( numBytesWritten < dataLength ) {
443 // copy data contents to uncompressed output buffer
// take the smaller of: space left in the current block, bytes still to write
444 unsigned int copyLength = min(blockLength - m_blockOffset, dataLength - numBytesWritten);
445 char* buffer = Resources.UncompressedBlock;
446 memcpy(buffer + m_blockOffset, input, copyLength);
// account for the bytes just staged
449 m_blockOffset += copyLength;
451 numBytesWritten += copyLength;
453 // flush (& compress) output buffer when full
454 if ( m_blockOffset == blockLength )
458 // return actual number of bytes written
459 return numBytesWritten;