1 // ***************************************************************************
2 // BgzfStream_p.cpp (c) 2011 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 9 September 2011(DB)
6 // ---------------------------------------------------------------------------
7 // Based on BGZF routines developed at the Broad Institute.
8 // Provides the basic functionality for reading & writing BGZF files
9 // Replaces the old BGZF.* files to avoid clashing with other toolkits
10 // ***************************************************************************
12 #include <api/internal/BamDeviceFactory_p.h>
13 #include <api/internal/BgzfStream_p.h>
14 using namespace BamTools;
15 using namespace BamTools::Internal;
// Constructor.
// The visible initializers take the uncompressed/compressed block sizes from
// Constants::BGZF_DEFAULT_BLOCK_SIZE and Constants::BGZF_MAX_BLOCK_SIZE, null
// both buffer pointers, and default m_isWriteOnly to false and
// m_isWriteCompressed to true. Both block buffers are then allocated with
// new[]; std::bad_alloc is caught and reported on stderr.
// NOTE(review): parts of this definition are not visible in this listing
// (further initializers, the opening of the try block, and whatever follows
// the error message) - confirm against the complete file.
23 BgzfStream::BgzfStream(void)
24 : m_uncompressedBlockSize(Constants::BGZF_DEFAULT_BLOCK_SIZE)
25 , m_compressedBlockSize(Constants::BGZF_MAX_BLOCK_SIZE)
29 , m_uncompressedBlock(NULL)
30 , m_compressedBlock(NULL)
32 , m_isWriteOnly(false)
33 , m_isWriteCompressed(true)
38 m_compressedBlock = new char[m_compressedBlockSize];
39 m_uncompressedBlock = new char[m_uncompressedBlockSize];
// NOTE(review): 'ba' is not used by the visible handler code; catching by
// unnamed const reference would avoid an unused-variable warning.
40 } catch( std::bad_alloc& ba ) {
41 fprintf(stderr, "BgzfStream ERROR: unable to allocate memory\n");
// Destructor: releases the compressed and uncompressed block buffers that the
// constructor allocated with new[].
// The null checks are redundant (delete[] on a null pointer is a no-op) but
// harmless.
// NOTE(review): nothing visible here closes the IO device; whether a Close()
// call follows the deletes cannot be determined from this listing - confirm.
47 BgzfStream::~BgzfStream(void) {
48 if( m_compressedBlock ) delete[] m_compressedBlock;
49 if( m_uncompressedBlock ) delete[] m_uncompressedBlock;
// Closes the stream.
// When the device is open in WriteOnly mode, the result of a DeflateBlock()
// call is written to the device (per the comment below, this is the empty
// block that serves as the EOF marker, written after pending data has been
// flushed). Finally m_isWriteCompressed is reset to its default of true.
// NOTE(review): the null-device guard, the flush call and the device
// close/cleanup statements are not visible in this listing.
53 void BgzfStream::Close(void) {
55 // skip if no device open
59 // if writing to file, flush the current BGZF block,
60 // then write an empty block (as EOF marker)
61 if ( m_device->IsOpen() && (m_device->Mode() == IBamIODevice::WriteOnly) ) {
// NOTE(review): DeflateBlock() returns unsigned int but is stored in an int,
// and the byte count returned by Write() is ignored here, whereas
// FlushBlock() checks it - consider treating both call sites alike.
63 int blockLength = DeflateBlock();
64 m_device->Write(m_compressedBlock, blockLength);
70 // clean up & reset flags
73 m_isWriteCompressed = true;
// Compresses the first m_blockOffset bytes of m_uncompressedBlock into
// m_compressedBlock as one BGZF block and returns the total block length
// (header + deflated payload + footer).
//
// Block layout produced here:
//   bytes 0-17  : gzip header with the BGZF extra field; bytes 16-17 hold
//                 (block length - 1)
//   payload     : raw zlib output starting at BGZF_BLOCK_HEADER_LENGTH
//   last 8 bytes: CRC32 of the consumed input, then the consumed input length
//
// If the input does not fit, the code retries with a reduced inputLength (see
// the comments below); bytes that were not consumed are moved to the front of
// m_uncompressedBlock and m_blockOffset is set to that leftover count.
// NOTE(review): the retry loop skeleton, the z_stream declaration, the amount
// by which inputLength is reduced, and the statements following each error
// message are not visible in this listing.
77 // compresses the current block
78 unsigned int BgzfStream::DeflateBlock(void) {
80 // initialize the gzip header
81 char* buffer = m_compressedBlock;
// zero the 18 header bytes first, so every header byte that is not assigned
// explicitly below stays 0
82 memset(buffer, 0, 18);
83 buffer[0] = Constants::GZIP_ID1;
84 buffer[1] = (char)Constants::GZIP_ID2;
85 buffer[2] = Constants::CM_DEFLATE;
86 buffer[3] = Constants::FLG_FEXTRA;
87 buffer[9] = (char)Constants::OS_UNKNOWN;
88 buffer[10] = Constants::BGZF_XLEN;
89 buffer[12] = Constants::BGZF_ID1;
90 buffer[13] = Constants::BGZF_ID2;
91 buffer[14] = Constants::BGZF_LEN;
93 // set compression level
// level 0 is used when SetWriteCompressed(false) was requested
94 const int compressionLevel = ( m_isWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
96 // loop to retry for blocks that do not compress enough
97 int inputLength = m_blockOffset;
98 unsigned int compressedLength = 0;
99 unsigned int bufferSize = m_compressedBlockSize;
103 // initialize zstream values
107 zs.next_in = (Bytef*)m_uncompressedBlock;
108 zs.avail_in = inputLength;
// output goes right after the header; room for the footer is held back
109 zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH];
110 zs.avail_out = bufferSize - Constants::BGZF_BLOCK_HEADER_LENGTH - Constants::BGZF_BLOCK_FOOTER_LENGTH;
112 // initialize the zlib compression algorithm
113 if ( deflateInit2(&zs,
116 Constants::GZIP_WINDOW_BITS,
117 Constants::Z_DEFAULT_MEM_LEVEL,
118 Z_DEFAULT_STRATEGY) != Z_OK )
120 fprintf(stderr, "BgzfStream ERROR: zlib deflate initialization failed\n");
// Z_FINISH: the whole input is expected to be consumed in this single call,
// which zlib signals with Z_STREAM_END
125 int status = deflate(&zs, Z_FINISH);
126 if ( status != Z_STREAM_END ) {
130 // reduce the input length and try again
131 if ( status == Z_OK ) {
133 if ( inputLength < 0 ) {
134 fprintf(stderr, "BgzfStream ERROR: input reduction failed\n");
// NOTE(review): this message names deflateEnd(), but it appears to sit in the
// handling of a failed deflate() status (see the check above); it looks like a
// copy of the message used further down - confirm and reword.
140 fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n");
144 // finalize the compression routine
145 if ( deflateEnd(&zs) != Z_OK ) {
146 fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n");
// total block length = deflated payload + header + footer
150 compressedLength = zs.total_out;
151 compressedLength += Constants::BGZF_BLOCK_HEADER_LENGTH + Constants::BGZF_BLOCK_FOOTER_LENGTH;
152 if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE ) {
153 fprintf(stderr, "BgzfStream ERROR: deflate overflow\n");
160 // store the compressed length
// the stored value is (length - 1); ReadBlock() adds the 1 back when reading
161 BamTools::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
163 // store the CRC32 checksum
164 unsigned int crc = crc32(0, NULL, 0);
165 crc = crc32(crc, (Bytef*)m_uncompressedBlock, inputLength);
// footer: CRC32 in the 4 bytes at (length - 8), input size in the last 4 bytes
166 BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc);
167 BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
169 // ensure that we have less than a block of data left
170 int remaining = m_blockOffset - inputLength;
171 if ( remaining > 0 ) {
172 if ( remaining > inputLength ) {
173 fprintf(stderr, "BgzfStream ERROR: after deflate, remainder too large\n");
// move the bytes that were not compressed to the front of the buffer; the
// check above keeps remaining <= inputLength, so source and destination
// ranges do not overlap
176 memcpy(m_uncompressedBlock, m_uncompressedBlock + inputLength, remaining);
180 m_blockOffset = remaining;
183 return compressedLength;
// Compresses and writes buffered data until the uncompressed buffer is empty.
// Each iteration deflates one block, writes it to the device, compares the
// number of bytes written with the block length, and advances m_blockAddress
// by that block length. The loop ends once DeflateBlock() has brought
// m_blockOffset down to 0.
// NOTE(review): what happens after the short-write message is printed is not
// visible in this listing.
186 // flushes the data in the BGZF block
187 void BgzfStream::FlushBlock(void) {
189 BT_ASSERT_X( m_device, "BgzfStream::FlushBlock() - attempting to flush to null device" );
191 // flush all of the remaining blocks
192 while ( m_blockOffset > 0 ) {
194 // compress the data block
195 unsigned int blockLength = DeflateBlock();
197 // flush the data to our output stream
198 unsigned int numBytesWritten = m_device->Write(m_compressedBlock, blockLength);
199 if ( numBytesWritten != blockLength ) {
200 fprintf(stderr, "BgzfStream ERROR: expected to write %u bytes during flushing, but wrote %u bytes\n",
201 blockLength, numBytesWritten);
206 m_blockAddress += blockLength;
// Inflates the block held in m_compressedBlock into m_uncompressedBlock.
//   blockLength - total length in bytes of the compressed block, header included
// Each zlib stage (inflateInit2, inflate with Z_FINISH, inflateEnd) is checked
// and a failure is reported on stderr.
// NOTE(review): the z_stream declaration and every return statement are not
// visible in this listing; ReadBlock() treats a negative result as failure and
// otherwise stores the result as the uncompressed block length.
210 // decompresses the current block
211 int BgzfStream::InflateBlock(const int& blockLength) {
213 // inflate the data from compressed buffer into uncompressed buffer
// NOTE(review): input starts 18 bytes into the block, yet avail_in is
// blockLength - 16, i.e. 2 bytes more than remain in a block of blockLength
// bytes - confirm this is intended.
217 zs.next_in = (Bytef*)m_compressedBlock + 18;
218 zs.avail_in = blockLength - 16;
219 zs.next_out = (Bytef*)m_uncompressedBlock;
220 zs.avail_out = m_uncompressedBlockSize;
222 int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
223 if ( status != Z_OK ) {
224 fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateInit() failed\n");
// Z_FINISH: the whole block must inflate in this one call (Z_STREAM_END)
228 status = inflate(&zs, Z_FINISH);
229 if ( status != Z_STREAM_END ) {
231 fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflate() failed\n");
235 status = inflateEnd(&zs);
236 if ( status != Z_OK ) {
237 fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateEnd() failed\n");
// Reports whether the underlying IO device is open, as answered by the
// device itself.
// NOTE(review): two lines between the signature and the return are not visible
// in this listing - presumably a null-device guard; confirm, since m_device is
// dereferenced here.
245 bool BgzfStream::IsOpen(void) const {
248 return m_device->IsOpen();
// Opens a BGZF stream on the IO device that BamDeviceFactory creates for
// 'filename', using the requested IBamIODevice open mode.
//   filename - passed to BamDeviceFactory::CreateDevice() to select the device
//   mode     - open mode handed to the device; also decides m_isWriteOnly
// If the device fails to open, its ErrorString() is reported on cerr.
// NOTE(review): the close of any previous device and the return statements
// are not visible in this listing.
251 bool BgzfStream::Open(const string& filename, const IBamIODevice::OpenMode mode) {
253 // close current device if necessary
257 BT_ASSERT_X( (m_device == 0), "BgzfStream::Open() - unable to properly close previous IO device" );
259 // retrieve new IO device depending on filename
260 m_device = BamDeviceFactory::CreateDevice(filename);
263 BT_ASSERT_X( m_device, "BgzfStream::Open() - unable to create IO device from filename" );
265 // if device fails to open
266 if ( !m_device->Open(mode) ) {
267 cerr << "BgzfStream::Open() - unable to open IO device:" << endl;
268 cerr << m_device->ErrorString();
272 // otherwise, set flag & return true
274 m_isWriteOnly = ( mode == IBamIODevice::WriteOnly );
// FILE*-based overload: opens the stream with a C stdio mode string.
//   filename - a path, or "stdin" / "stdout" / "-" for the standard streams
//              ("-" means stdin with "rb" and stdout with "wb")
//   mode     - "rb" (read) or "wb" (write); anything else is reported as an
//              unknown file mode
// Unlike the OpenMode overload, this one works on m_stream / m_isOpen rather
// than on m_device.
// NOTE(review): the comment below says "for reading" although both modes are
// handled. Also, a standard-stream name combined with the opposite mode (e.g.
// "stdin" with "wb") matches none of the three branches, leaving m_stream at
// its previous value. The return statements and the check that guards the
// "unable to open file" message are not visible in this listing - confirm.
279 // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)
280 bool BgzfStream::Open(const string& filename, const char* mode) {
282 // close current stream, if necessary, before opening next
283 if ( m_isOpen ) Close();
285 // determine open mode
286 if ( strcmp(mode, "rb") == 0 )
287 m_isWriteOnly = false;
288 else if ( strcmp(mode, "wb") == 0)
289 m_isWriteOnly = true;
291 fprintf(stderr, "BgzfStream ERROR: unknown file mode: %s\n", mode);
295 // open BGZF stream on a file
296 if ( (filename != "stdin") && (filename != "stdout") && (filename != "-"))
297 m_stream = fopen(filename.c_str(), mode);
299 // open BGZF stream on stdin
300 else if ( (filename == "stdin" || filename == "-") && (strcmp(mode, "rb") == 0 ) )
// freopen with a NULL filename re-opens the existing standard stream with the
// requested mode
301 m_stream = freopen(NULL, mode, stdin);
303 // open BGZF stream on stdout
304 else if ( (filename == "stdout" || filename == "-") && (strcmp(mode, "wb") == 0) )
305 m_stream = freopen(NULL, mode, stdout);
308 fprintf(stderr, "BgzfStream ERROR: unable to open file %s\n", filename.c_str() );
312 // set flag & return success
// Copies up to dataLength uncompressed bytes from the stream into 'data'.
//   data       - destination buffer
//   dataLength - number of bytes requested
// Bytes are served from the current uncompressed block; ReadBlock() is called
// whenever that block is exhausted, and the loop stops early when a freshly
// read block provides no bytes. numBytesRead accumulates the copied count.
// Requires a non-null device that is open in ReadOnly mode.
// NOTE(review): the declaration of 'output', the early-return values and the
// final return are not visible in this listing.
317 // reads BGZF data into a byte buffer
318 unsigned int BgzfStream::Read(char* data, const unsigned int dataLength) {
320 if ( dataLength == 0 )
323 // if stream not open for reading
324 BT_ASSERT_X( m_device, "BgzfStream::Read() - trying to read from null device");
325 if ( !m_device->IsOpen() || (m_device->Mode() != IBamIODevice::ReadOnly) )
328 // read blocks as needed until desired data length is retrieved
330 unsigned int numBytesRead = 0;
331 while ( numBytesRead < dataLength ) {
333 // determine bytes available in current block
334 int bytesAvailable = m_blockLength - m_blockOffset;
336 // read (and decompress) next block if needed
337 if ( bytesAvailable <= 0 ) {
// NOTE(review): the return type is unsigned int, so this -1 reaches the caller
// as UINT_MAX rather than as a negative value - callers must not test for < 0.
338 if ( !ReadBlock() ) return -1;
339 bytesAvailable = m_blockLength - m_blockOffset;
340 if ( bytesAvailable <= 0 ) break;
343 // copy data from uncompressed source buffer into data destination buffer
344 char* buffer = m_uncompressedBlock;
// copy whichever is smaller: what is still requested or what the block holds
345 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
346 memcpy(output, buffer + m_blockOffset, copyLength);
349 m_blockOffset += copyLength;
350 output += copyLength;
351 numBytesRead += copyLength;
// when the current block has been fully consumed, record the device position
// (where the next block starts) as the new block address
355 if ( m_blockOffset == m_blockLength ) {
356 m_blockAddress = m_device->Tell();
// Reads one BGZF block from the device and inflates it into m_uncompressedBlock.
// Steps: remember the device position, read and validate the fixed-size block
// header, read the rest of the block into m_compressedBlock, inflate it, then
// record the block's address and uncompressed length.
// Each failure is reported on stderr.
// NOTE(review): the return statements, the handling of an empty header read
// and the statement controlled by the final 'if' are not visible in this
// listing.
364 // reads a BGZF block
365 bool BgzfStream::ReadBlock(void) {
367 BT_ASSERT_X( m_device, "BgzfStream::ReadBlock() - trying to read from null IO device");
369 // store block's starting address
370 int64_t blockAddress = m_device->Tell();
372 // read block header from file
373 char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
374 int numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH);
376 // if block header empty
377 if ( numBytesRead == 0 ) {
382 // if block header invalid size
383 if ( numBytesRead != Constants::BGZF_BLOCK_HEADER_LENGTH ) {
384 fprintf(stderr, "BgzfStream ERROR: read block failed - could not read block header\n");
388 // validate block header contents
389 if ( !BgzfStream::CheckBlockHeader(header) ) {
390 fprintf(stderr, "BgzfStream ERROR: read block failed - invalid block header\n");
394 // copy header contents to compressed buffer
// header bytes 16-17 store (block length - 1); DeflateBlock() writes them that way
// NOTE(review): no visible check that blockLength is at least the header
// length; a smaller stored value would make 'remaining' negative - confirm
// that CheckBlockHeader() or the device guards against this.
395 int blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
396 char* compressedBlock = m_compressedBlock;
397 memcpy(compressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
398 int remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
400 // read remainder of block
401 numBytesRead = m_device->Read(&compressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining);
402 if ( numBytesRead != remaining ) {
403 fprintf(stderr, "BgzfStream ERROR: read block failed - could not read data from block\n");
407 // decompress block data
// numBytesRead is reused from here on for the inflated (uncompressed) size
408 numBytesRead = InflateBlock(blockLength);
409 if ( numBytesRead < 0 ) {
410 fprintf(stderr, "BgzfStream ERROR: read block failed - could not decompress block data\n");
415 if ( m_blockLength != 0 )
417 m_blockAddress = blockAddress;
418 m_blockLength = numBytesRead;
// Moves the stream to a packed position of the form produced by Tell().
//   position - low 16 bits: offset within the uncompressed block;
//              next 48 bits: block address handed to the device's Seek()
// Failures are reported on cerr.
// NOTE(review): the return statements, and any statement between the last
// comment and the two assignments, are not visible in this listing.
424 // seek to position in BGZF file
425 bool BgzfStream::Seek(const int64_t& position) {
427 BT_ASSERT_X( m_device, "BgzfStream::Seek() - trying to seek on null IO device");
429 // skip if not open or not seek-able
// the random-access test is commented out, so only the open state is checked
430 if ( !IsOpen() /*|| !m_device->IsRandomAccess()*/ ) {
431 cerr << "BgzfStream::Seek() - device not open" << endl;
435 // determine adjusted offset & address
436 int blockOffset = (position & 0xFFFF);
437 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
439 // attempt seek in file
440 if ( !m_device->Seek(blockAddress) ) {
441 cerr << "BgzfStream ERROR: unable to seek in file" << endl;
445 // update block data & return success
447 m_blockAddress = blockAddress;
448 m_blockOffset = blockOffset;
452 void BgzfStream::SetWriteCompressed(bool ok) {
453 m_isWriteCompressed = ok;
456 // get file position in BGZF file
457 int64_t BgzfStream::Tell(void) const {
458 if ( !m_isOpen ) return 0;
459 return ( (m_blockAddress << 16) | (m_blockOffset & 0xFFFF) );
// Appends dataLength bytes from 'data' to the uncompressed block buffer and
// returns the number of bytes accepted.
//   data       - source bytes
//   dataLength - number of bytes to write
// Each pass copies as much as fits into the space left in the current block.
// Requires a non-null device opened in WriteOnly mode (both asserted).
// NOTE(review): the statement that advances 'input' and the statement
// controlled by the "buffer full" test are not visible in this listing (per
// the comment, the latter flushes the block).
462 // writes the supplied data into the BGZF buffer
463 unsigned int BgzfStream::Write(const char* data, const unsigned int dataLength) {
465 BT_ASSERT_X( m_device, "BgzfStream::Write() - trying to write to null IO device");
466 BT_ASSERT_X( (m_device->Mode() == IBamIODevice::WriteOnly),
467 "BgzfStream::Write() - trying to write to non-writable IO device");
469 // write blocks as needed til all data is written
470 unsigned int numBytesWritten = 0;
471 const char* input = data;
472 unsigned int blockLength = m_uncompressedBlockSize;
473 while ( numBytesWritten < dataLength ) {
475 // copy data contents to uncompressed output buffer
// copy whichever is smaller: free space in the block or bytes still to write
476 unsigned int copyLength = min(blockLength - m_blockOffset, dataLength - numBytesWritten);
477 char* buffer = m_uncompressedBlock;
478 memcpy(buffer + m_blockOffset, input, copyLength);
481 m_blockOffset += copyLength;
483 numBytesWritten += copyLength;
485 // flush (& compress) output buffer when full
486 if ( m_blockOffset == blockLength )
491 return numBytesWritten;