1 // ***************************************************************************
2 // BgzfStream_p.cpp (c) 2011 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 9 September 2011(DB)
7 // ---------------------------------------------------------------------------
8 // Based on BGZF routines developed at the Broad Institute.
9 // Provides the basic functionality for reading & writing BGZF files
10 // Replaces the old BGZF.* files to avoid clashing with other toolkits
11 // ***************************************************************************
13 #include <api/internal/BamDeviceFactory_p.h>
14 #include <api/internal/BgzfStream_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
24 BgzfStream::BgzfStream(void)
25 : m_uncompressedBlockSize(Constants::BGZF_DEFAULT_BLOCK_SIZE)
26 , m_compressedBlockSize(Constants::BGZF_MAX_BLOCK_SIZE)
30 , m_uncompressedBlock(NULL)
31 , m_compressedBlock(NULL)
33 , m_isWriteOnly(false)
34 , m_isWriteCompressed(true)
39 m_compressedBlock = new char[m_compressedBlockSize];
40 m_uncompressedBlock = new char[m_uncompressedBlockSize];
41 } catch( std::bad_alloc& ba ) {
42 fprintf(stderr, "BgzfStream ERROR: unable to allocate memory\n");
48 BgzfStream::~BgzfStream(void) {
49 if( m_compressedBlock ) delete[] m_compressedBlock;
50 if( m_uncompressedBlock ) delete[] m_uncompressedBlock;
54 void BgzfStream::Close(void) {
56 // skip if no device open
60 // if writing to file, flush the current BGZF block,
61 // then write an empty block (as EOF marker)
62 if ( m_device->IsOpen() && (m_device->Mode() == IBamIODevice::WriteOnly) ) {
64 int blockLength = DeflateBlock();
65 m_device->Write(m_compressedBlock, blockLength);
71 // clean up & reset flags
74 m_isWriteCompressed = true;
78 // compresses the current block
79 unsigned int BgzfStream::DeflateBlock(void) {
81 // initialize the gzip header
82 char* buffer = m_compressedBlock;
83 memset(buffer, 0, 18);
84 buffer[0] = Constants::GZIP_ID1;
85 buffer[1] = (char)Constants::GZIP_ID2;
86 buffer[2] = Constants::CM_DEFLATE;
87 buffer[3] = Constants::FLG_FEXTRA;
88 buffer[9] = (char)Constants::OS_UNKNOWN;
89 buffer[10] = Constants::BGZF_XLEN;
90 buffer[12] = Constants::BGZF_ID1;
91 buffer[13] = Constants::BGZF_ID2;
92 buffer[14] = Constants::BGZF_LEN;
94 // set compression level
95 const int compressionLevel = ( m_isWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
97 // loop to retry for blocks that do not compress enough
98 int inputLength = m_blockOffset;
99 unsigned int compressedLength = 0;
100 unsigned int bufferSize = m_compressedBlockSize;
104 // initialize zstream values
108 zs.next_in = (Bytef*)m_uncompressedBlock;
109 zs.avail_in = inputLength;
110 zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH];
111 zs.avail_out = bufferSize - Constants::BGZF_BLOCK_HEADER_LENGTH - Constants::BGZF_BLOCK_FOOTER_LENGTH;
113 // initialize the zlib compression algorithm
114 if ( deflateInit2(&zs,
117 Constants::GZIP_WINDOW_BITS,
118 Constants::Z_DEFAULT_MEM_LEVEL,
119 Z_DEFAULT_STRATEGY) != Z_OK )
121 fprintf(stderr, "BgzfStream ERROR: zlib deflate initialization failed\n");
126 int status = deflate(&zs, Z_FINISH);
127 if ( status != Z_STREAM_END ) {
131 // reduce the input length and try again
132 if ( status == Z_OK ) {
134 if ( inputLength < 0 ) {
135 fprintf(stderr, "BgzfStream ERROR: input reduction failed\n");
141 fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n");
145 // finalize the compression routine
146 if ( deflateEnd(&zs) != Z_OK ) {
147 fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n");
151 compressedLength = zs.total_out;
152 compressedLength += Constants::BGZF_BLOCK_HEADER_LENGTH + Constants::BGZF_BLOCK_FOOTER_LENGTH;
153 if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE ) {
154 fprintf(stderr, "BgzfStream ERROR: deflate overflow\n");
161 // store the compressed length
162 BamTools::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
164 // store the CRC32 checksum
165 unsigned int crc = crc32(0, NULL, 0);
166 crc = crc32(crc, (Bytef*)m_uncompressedBlock, inputLength);
167 BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc);
168 BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
170 // ensure that we have less than a block of data left
171 int remaining = m_blockOffset - inputLength;
172 if ( remaining > 0 ) {
173 if ( remaining > inputLength ) {
174 fprintf(stderr, "BgzfStream ERROR: after deflate, remainder too large\n");
177 memcpy(m_uncompressedBlock, m_uncompressedBlock + inputLength, remaining);
181 m_blockOffset = remaining;
184 return compressedLength;
187 // flushes the data in the BGZF block
188 void BgzfStream::FlushBlock(void) {
190 BT_ASSERT_X( m_device, "BgzfStream::FlushBlock() - attempting to flush to null device" );
192 // flush all of the remaining blocks
193 while ( m_blockOffset > 0 ) {
195 // compress the data block
196 unsigned int blockLength = DeflateBlock();
198 // flush the data to our output stream
199 unsigned int numBytesWritten = m_device->Write(m_compressedBlock, blockLength);
200 if ( numBytesWritten != blockLength ) {
201 fprintf(stderr, "BgzfStream ERROR: expected to write %u bytes during flushing, but wrote %u bytes\n",
202 blockLength, numBytesWritten);
207 m_blockAddress += blockLength;
211 // decompresses the current block
212 int BgzfStream::InflateBlock(const int& blockLength) {
214 // inflate the data from compressed buffer into uncompressed buffer
218 zs.next_in = (Bytef*)m_compressedBlock + 18;
219 zs.avail_in = blockLength - 16;
220 zs.next_out = (Bytef*)m_uncompressedBlock;
221 zs.avail_out = m_uncompressedBlockSize;
223 int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
224 if ( status != Z_OK ) {
225 fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateInit() failed\n");
229 status = inflate(&zs, Z_FINISH);
230 if ( status != Z_STREAM_END ) {
232 fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflate() failed\n");
236 status = inflateEnd(&zs);
237 if ( status != Z_OK ) {
238 fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateEnd() failed\n");
246 bool BgzfStream::IsOpen(void) const {
249 return m_device->IsOpen();
252 bool BgzfStream::Open(const string& filename, const IBamIODevice::OpenMode mode) {
254 // close current device if necessary
258 BT_ASSERT_X( (m_device == 0), "BgzfStream::Open() - unable to properly close previous IO device" );
260 // retrieve new IO device depending on filename
261 m_device = BamDeviceFactory::CreateDevice(filename);
264 BT_ASSERT_X( m_device, "BgzfStream::Open() - unable to create IO device from filename" );
266 // if device fails to open
267 if ( !m_device->Open(mode) ) {
268 cerr << "BgzfStream::Open() - unable to open IO device:" << endl;
269 cerr << m_device->ErrorString();
273 // otherwise, set flag & return true
275 m_isWriteOnly = ( mode == IBamIODevice::WriteOnly );
280 // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)
281 bool BgzfStream::Open(const string& filename, const char* mode) {
283 // close current stream, if necessary, before opening next
284 if ( m_isOpen ) Close();
286 // determine open mode
287 if ( strcmp(mode, "rb") == 0 )
288 m_isWriteOnly = false;
289 else if ( strcmp(mode, "wb") == 0)
290 m_isWriteOnly = true;
292 fprintf(stderr, "BgzfStream ERROR: unknown file mode: %s\n", mode);
296 // open BGZF stream on a file
297 if ( (filename != "stdin") && (filename != "stdout") && (filename != "-"))
298 m_stream = fopen(filename.c_str(), mode);
300 // open BGZF stream on stdin
301 else if ( (filename == "stdin" || filename == "-") && (strcmp(mode, "rb") == 0 ) )
302 m_stream = freopen(NULL, mode, stdin);
304 // open BGZF stream on stdout
305 else if ( (filename == "stdout" || filename == "-") && (strcmp(mode, "wb") == 0) )
306 m_stream = freopen(NULL, mode, stdout);
309 fprintf(stderr, "BgzfStream ERROR: unable to open file %s\n", filename.c_str() );
313 // set flag & return success
318 // reads BGZF data into a byte buffer
319 unsigned int BgzfStream::Read(char* data, const unsigned int dataLength) {
321 if ( dataLength == 0 )
324 // if stream not open for reading
325 BT_ASSERT_X( m_device, "BgzfStream::Read() - trying to read from null device");
326 if ( !m_device->IsOpen() || (m_device->Mode() != IBamIODevice::ReadOnly) )
329 // read blocks as needed until desired data length is retrieved
331 unsigned int numBytesRead = 0;
332 while ( numBytesRead < dataLength ) {
334 // determine bytes available in current block
335 int bytesAvailable = m_blockLength - m_blockOffset;
337 // read (and decompress) next block if needed
338 if ( bytesAvailable <= 0 ) {
339 if ( !ReadBlock() ) return -1;
340 bytesAvailable = m_blockLength - m_blockOffset;
341 if ( bytesAvailable <= 0 ) break;
344 // copy data from uncompressed source buffer into data destination buffer
345 char* buffer = m_uncompressedBlock;
346 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
347 memcpy(output, buffer + m_blockOffset, copyLength);
350 m_blockOffset += copyLength;
351 output += copyLength;
352 numBytesRead += copyLength;
356 if ( m_blockOffset == m_blockLength ) {
357 m_blockAddress = m_device->Tell();
365 // reads a BGZF block
366 bool BgzfStream::ReadBlock(void) {
368 BT_ASSERT_X( m_device, "BgzfStream::ReadBlock() - trying to read from null IO device");
370 // store block's starting address
371 int64_t blockAddress = m_device->Tell();
373 // read block header from file
374 char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
375 int numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH);
377 // if block header empty
378 if ( numBytesRead == 0 ) {
383 // if block header invalid size
384 if ( numBytesRead != Constants::BGZF_BLOCK_HEADER_LENGTH ) {
385 fprintf(stderr, "BgzfStream ERROR: read block failed - could not read block header\n");
389 // validate block header contents
390 if ( !BgzfStream::CheckBlockHeader(header) ) {
391 fprintf(stderr, "BgzfStream ERROR: read block failed - invalid block header\n");
395 // copy header contents to compressed buffer
396 int blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
397 char* compressedBlock = m_compressedBlock;
398 memcpy(compressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
399 int remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
401 // read remainder of block
402 numBytesRead = m_device->Read(&compressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining);
403 if ( numBytesRead != remaining ) {
404 fprintf(stderr, "BgzfStream ERROR: read block failed - could not read data from block\n");
408 // decompress block data
409 numBytesRead = InflateBlock(blockLength);
410 if ( numBytesRead < 0 ) {
411 fprintf(stderr, "BgzfStream ERROR: read block failed - could not decompress block data\n");
416 if ( m_blockLength != 0 )
418 m_blockAddress = blockAddress;
419 m_blockLength = numBytesRead;
425 // seek to position in BGZF file
426 bool BgzfStream::Seek(const int64_t& position) {
428 BT_ASSERT_X( m_device, "BgzfStream::Seek() - trying to seek on null IO device");
430 // skip if not open or not seek-able
431 if ( !IsOpen() /*|| !m_device->IsRandomAccess()*/ ) {
432 cerr << "BgzfStream::Seek() - device not open" << endl;
436 // determine adjusted offset & address
437 int blockOffset = (position & 0xFFFF);
438 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
440 // attempt seek in file
441 if ( !m_device->Seek(blockAddress) ) {
442 cerr << "BgzfStream ERROR: unable to seek in file" << endl;
446 // update block data & return success
448 m_blockAddress = blockAddress;
449 m_blockOffset = blockOffset;
453 void BgzfStream::SetWriteCompressed(bool ok) {
454 m_isWriteCompressed = ok;
457 // get file position in BGZF file
458 int64_t BgzfStream::Tell(void) const {
459 if ( !m_isOpen ) return 0;
460 return ( (m_blockAddress << 16) | (m_blockOffset & 0xFFFF) );
463 // writes the supplied data into the BGZF buffer
464 unsigned int BgzfStream::Write(const char* data, const unsigned int dataLength) {
466 BT_ASSERT_X( m_device, "BgzfStream::Write() - trying to write to null IO device");
467 BT_ASSERT_X( (m_device->Mode() == IBamIODevice::WriteOnly),
468 "BgzfStream::Write() - trying to write to non-writable IO device");
470 // write blocks as needed til all data is written
471 unsigned int numBytesWritten = 0;
472 const char* input = data;
473 unsigned int blockLength = m_uncompressedBlockSize;
474 while ( numBytesWritten < dataLength ) {
476 // copy data contents to uncompressed output buffer
477 unsigned int copyLength = min(blockLength - m_blockOffset, dataLength - numBytesWritten);
478 char* buffer = m_uncompressedBlock;
479 memcpy(buffer + m_blockOffset, input, copyLength);
482 m_blockOffset += copyLength;
484 numBytesWritten += copyLength;
486 // flush (& compress) output buffer when full
487 if ( m_blockOffset == blockLength )
492 return numBytesWritten;