1 // ***************************************************************************
2 // BgzfStream_p.cpp (c) 2011 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 5 April 2011(DB)
7 // ---------------------------------------------------------------------------
8 // Based on BGZF routines developed at the Broad Institute.
9 // Provides the basic functionality for reading & writing BGZF files
10 // Replaces the old BGZF.* files to avoid clashing with other toolkits
11 // ***************************************************************************
13 #include <api/internal/BgzfStream_p.h>
14 using namespace BamTools;
15 using namespace BamTools::Internal;
// Default constructor: takes the block sizes from the Constants defaults,
// enables compressed output, and allocates the two working buffers.
// NOTE(review): this listing skips lines (see the embedded numbering), so
// further member initializers, the opening of the try block and the code
// that follows the error message are not visible here - confirm against the
// full file.
22 BgzfStream::BgzfStream(void)
23 : UncompressedBlockSize(Constants::BGZF_DEFAULT_BLOCK_SIZE)
24 , CompressedBlockSize(Constants::BGZF_MAX_BLOCK_SIZE)
// compressed output is the initial setting; SetWriteCompressed() changes it
30 , IsWriteCompressed(true)
// both buffer pointers start out NULL and are assigned in the body below
32 , UncompressedBlock(NULL)
33 , CompressedBlock(NULL)
// working buffers for compressed and uncompressed data, sized by the
// members initialized above; released with delete[] in the destructor
36 CompressedBlock = new char[CompressedBlockSize];
37 UncompressedBlock = new char[UncompressedBlockSize];
// new[] signals failure by throwing std::bad_alloc; report it on stderr
38 } catch( std::bad_alloc& ba ) {
39 fprintf(stderr, "BgzfStream ERROR: unable to allocate memory\n");
// Destructor: frees the two block buffers with delete[].
// The null checks are not required (delete[] on a null pointer does nothing)
// but they are harmless.
// NOTE(review): the end of this function is not visible in this listing.
45 BgzfStream::~BgzfStream(void) {
46 if( CompressedBlock ) delete[] CompressedBlock;
47 if( UncompressedBlock ) delete[] UncompressedBlock;
// Closes the stream. Returns immediately when no file is open.
// NOTE(review): this listing skips lines (see the embedded numbering); the
// write-mode test and the statements that actually flush and close Stream
// are not visible here - confirm against the full file.
51 void BgzfStream::Close(void) {
53 // skip if file not open
54 if ( !IsOpen ) return;
56 // if writing to file, flush the current BGZF block,
57 // then write an empty block (as EOF marker)
// DeflateBlock() leaves the compressed block in CompressedBlock and returns
// its length in bytes, which is the byte count handed to fwrite
60 int blockLength = DeflateBlock();
// NOTE(review): the fwrite result is discarded here, whereas FlushBlock()
// compares it against the block length - a short write would go unnoticed
61 fwrite(CompressedBlock, 1, blockLength, Stream);
64 // flush and close stream
// put the compression flag back to the value the constructor sets
69 IsWriteCompressed = true;
73 // compresses the current block
// Compresses the first BlockOffset bytes of UncompressedBlock into
// CompressedBlock as a single BGZF block and returns that block's total
// length in bytes (header + deflate data + footer).
//
// Layout written to CompressedBlock, as far as the visible lines show:
//   - bytes 0..17: zeroed, then filled with the gzip/BGZF header constants;
//     bytes 16..17 receive (total block length - 1)
//   - deflate output starting at Constants::BGZF_BLOCK_HEADER_LENGTH
//   - last 8 bytes: CRC32 of the consumed input, then the consumed input length
//
// Input that was not consumed is moved to the front of UncompressedBlock and
// BlockOffset is assigned its length.
// NOTE(review): this listing skips lines (see the embedded numbering): the
// z_stream declaration, the retry loop header, the statement that shrinks
// inputLength and the statements following each error message are not
// visible - confirm against the full file.
74 int BgzfStream::DeflateBlock(void) {
76 // initialize the gzip header
77 char* buffer = CompressedBlock;
// clear all 18 header bytes first so fields not assigned below stay zero
78 memset(buffer, 0, 18);
79 buffer[0] = Constants::GZIP_ID1;
80 buffer[1] = (char)Constants::GZIP_ID2;
81 buffer[2] = Constants::CM_DEFLATE;
82 buffer[3] = Constants::FLG_FEXTRA;
83 buffer[9] = (char)Constants::OS_UNKNOWN;
84 buffer[10] = Constants::BGZF_XLEN;
85 buffer[12] = Constants::BGZF_ID1;
86 buffer[13] = Constants::BGZF_ID2;
87 buffer[14] = Constants::BGZF_LEN;
89 // set compression level
// level 0 makes zlib emit the data without compressing it
90 const int compressionLevel = ( IsWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
92 // loop to retry for blocks that do not compress enough
// inputLength starts as everything buffered so far
93 int inputLength = BlockOffset;
94 int compressedLength = 0;
95 unsigned int bufferSize = CompressedBlockSize;
99 // initialize zstream values
103 zs.next_in = (Bytef*)UncompressedBlock;
104 zs.avail_in = inputLength;
// deflate output goes right after the header; the output budget leaves
// room for the header in front of it and the footer behind it
105 zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH];
106 zs.avail_out = bufferSize - Constants::BGZF_BLOCK_HEADER_LENGTH - Constants::BGZF_BLOCK_FOOTER_LENGTH;
108 // initialize the zlib compression algorithm
109 if ( deflateInit2(&zs,
112 Constants::GZIP_WINDOW_BITS,
113 Constants::Z_DEFAULT_MEM_LEVEL,
114 Z_DEFAULT_STRATEGY) != Z_OK )
116 fprintf(stderr, "BgzfStream ERROR: zlib deflate initialization failed\n");
// Z_FINISH asks zlib to compress all of the input in this one call;
// Z_STREAM_END is the only status that means it fully succeeded
121 int status = deflate(&zs, Z_FINISH);
122 if ( status != Z_STREAM_END ) {
126 // reduce the input length and try again
// Z_OK here means the output buffer filled up before the input was used up
127 if ( status == Z_OK ) {
129 if ( inputLength < 0 ) {
130 fprintf(stderr, "BgzfStream ERROR: input reduction failed\n");
// NOTE(review): same text as the deflateEnd() failure message further down;
// if this path is reached for a failed deflate() call the wording is
// misleading - confirm against the full file
136 fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n");
140 // finalize the compression routine
141 if ( deflateEnd(&zs) != Z_OK ) {
142 fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n");
// total block length = deflate output + header + footer
146 compressedLength = zs.total_out;
147 compressedLength += Constants::BGZF_BLOCK_HEADER_LENGTH + Constants::BGZF_BLOCK_FOOTER_LENGTH;
148 if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE ) {
149 fprintf(stderr, "BgzfStream ERROR: deflate overflow\n");
156 // store the compressed length
// stored as (length - 1); ReadBlock() adds the 1 back when parsing
157 BamTools::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
159 // store the CRC32 checksum
// crc32(0, NULL, 0) yields zlib's initial CRC value
160 unsigned int crc = crc32(0, NULL, 0);
161 crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);
// footer: CRC32 in the 4 bytes at (end - 8), input length in the last 4 bytes
162 BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc);
163 BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
165 // ensure that we have less than a block of data left
// bytes that were buffered but not included in this block
166 int remaining = BlockOffset - inputLength;
167 if ( remaining > 0 ) {
168 if ( remaining > inputLength ) {
169 fprintf(stderr, "BgzfStream ERROR: after deflate, remainder too large\n");
// move the leftover to the front of the buffer; with remaining <= inputLength
// the source and destination ranges do not overlap (this relies on the error
// branch above not falling through - its tail is not visible here)
172 memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);
176 BlockOffset = remaining;
179 return compressedLength;
182 // flushes the data in the BGZF block
// Writes out all buffered data: while BlockOffset is positive, compress one
// block with DeflateBlock(), write it to Stream and advance BlockAddress by
// the number of compressed bytes.
// More than one pass can be needed because DeflateBlock() may leave
// unconsumed input behind (it stores the leftover count in BlockOffset).
// NOTE(review): the code following the error message and the closing braces
// are not visible in this listing.
183 void BgzfStream::FlushBlock(void) {
185 // flush all of the remaining blocks
186 while ( BlockOffset > 0 ) {
188 // compress the data block
189 int blockLength = DeflateBlock();
191 // flush the data to our output stream
192 int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);
193 if ( numBytesWritten != blockLength ) {
// NOTE(review): both arguments are int but the format string uses %u
194 fprintf(stderr, "BgzfStream ERROR: expected to write %u bytes during flushing, but wrote %u bytes\n",
195 blockLength, numBytesWritten);
// BlockAddress tracks the file offset at which the next block will start
200 BlockAddress += blockLength;
204 // decompresses the current block
// Inflates the block held in CompressedBlock into UncompressedBlock.
// blockLength: length of the compressed block as passed by ReadBlock(),
//              which derives it from the block header.
// NOTE(review): the z_stream declaration, the statements following each
// error message and the return statement are not visible in this listing,
// so the return value cannot be documented from here.
205 int BgzfStream::InflateBlock(const int& blockLength) {
207 // inflate the data from compressed buffer into uncompressed buffer
// compressed input starts after the 18 header bytes
211 zs.next_in = (Bytef*)CompressedBlock + 18;
// NOTE(review): next_in skips 18 bytes but only 16 are subtracted here -
// confirm the intended input byte count
212 zs.avail_in = blockLength - 16;
213 zs.next_out = (Bytef*)UncompressedBlock;
214 zs.avail_out = UncompressedBlockSize;
216 int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
217 if ( status != Z_OK ) {
218 fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateInit() failed\n");
// Z_FINISH: the whole block is expected to inflate in this single call,
// so any status other than Z_STREAM_END is treated as a failure
222 status = inflate(&zs, Z_FINISH);
223 if ( status != Z_STREAM_END ) {
225 fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflate() failed\n");
229 status = inflateEnd(&zs);
230 if ( status != Z_OK ) {
231 fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateEnd() failed\n");
239 // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing)
// Opens a BGZF stream on `filename` using `mode`.
// filename: a file path, or the literal name "stdin" / "stdout" to use the
//           corresponding standard stream.
// mode:     "rb" or "wb"; any other string is reported as an unknown mode.
// A stream that is already open is closed first.
// NOTE(review): this listing skips lines (see the embedded numbering); the
// statements run for each mode, the condition guarding the "unable to open"
// message and all return statements are not visible - confirm against the
// full file.
240 bool BgzfStream::Open(const string& filename, const char* mode) {
242 // close current stream, if necessary, before opening next
243 if ( IsOpen ) Close();
245 // determine open mode
246 if ( strcmp(mode, "rb") == 0 )
248 else if ( strcmp(mode, "wb") == 0)
251 fprintf(stderr, "BgzfStream ERROR: unknown file mode: %s\n", mode);
255 // open BGZF stream on a file
256 if ( (filename != "stdin") && (filename != "stdout") )
257 Stream = fopen(filename.c_str(), mode);
259 // open BGZF stream on stdin
// freopen with a NULL filename asks the C library to switch the existing
// stream to `mode`; "stdin" is only accepted together with "rb"
260 else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) )
261 Stream = freopen(NULL, mode, stdin);
263 // open BGZF stream on stdout
// likewise, "stdout" is only accepted together with "wb"
264 else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) )
265 Stream = freopen(NULL, mode, stdout);
268 fprintf(stderr, "BgzfStream ERROR: unable to open file %s\n", filename.c_str() );
272 // set flag & return success
277 // reads BGZF data into a byte buffer
// Copies up to dataLength bytes of uncompressed data into `data`, loading
// and inflating further blocks through ReadBlock() as the current one runs
// out.
// data:       destination buffer supplied by the caller.
// dataLength: number of bytes requested.
// Returns -1 when ReadBlock() fails. The loop also stops early when a
// freshly read block yields no bytes.
// NOTE(review): this listing skips lines (see the embedded numbering); the
// value returned by the initial guard, the declaration of `output`, the
// statements that follow the BlockAddress update and the final return are
// not visible - confirm against the full file.
278 int BgzfStream::Read(char* data, const unsigned int dataLength) {
280 // if stream not open for reading (or empty request)
281 if ( !IsOpen || IsWriteOnly || dataLength == 0 )
284 // read blocks as needed until desired data length is retrieved
286 unsigned int numBytesRead = 0;
287 while ( numBytesRead < dataLength ) {
289 // determine bytes available in current block
290 int bytesAvailable = BlockLength - BlockOffset;
292 // read (and decompress) next block if needed
293 if ( bytesAvailable <= 0 ) {
294 if ( !ReadBlock() ) return -1;
// recompute after the refill; still nothing available ends the loop
295 bytesAvailable = BlockLength - BlockOffset;
296 if ( bytesAvailable <= 0 ) break;
299 // copy data from uncompressed source buffer into data destination buffer
300 char* buffer = UncompressedBlock;
// copy whichever is smaller: what is still requested or what the block holds
// NOTE(review): the remaining count is unsigned and is cast to int; a value
// above INT_MAX is not representable after the cast
301 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
302 memcpy(output, buffer + BlockOffset, copyLength);
305 BlockOffset += copyLength;
306 output += copyLength;
307 numBytesRead += copyLength;
// current block fully consumed: record the file position, i.e. where the
// next block begins
311 if ( BlockOffset == BlockLength ) {
312 BlockAddress = ftell64(Stream);
320 // reads a BGZF block
// Reads the next BGZF block from Stream into CompressedBlock and inflates it
// via InflateBlock().
// Steps visible here: remember the file position, read and validate the
// fixed-size header, take the total block length from header bytes 16..17
// (stored value + 1), read the rest of the block, inflate it, then record
// the remembered position in BlockAddress.
// NOTE(review): this listing skips lines (see the embedded numbering); the
// empty-header handling, the test on InflateBlock()'s result, the updates of
// BlockLength/BlockOffset and all return statements are not visible -
// confirm against the full file.
321 bool BgzfStream::ReadBlock(void) {
323 char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
// file offset of the block about to be read, stored in BlockAddress below
324 int64_t blockAddress = ftell64(Stream);
326 // read block header from file
327 int count = fread(header, 1, sizeof(header), Stream);
329 // if block header empty
335 // if block header invalid size
336 if ( count != sizeof(header) ) {
337 fprintf(stderr, "BgzfStream ERROR: read block failed - could not read block header\n");
341 // validate block header contents
342 if ( !BgzfStream::CheckBlockHeader(header) ) {
343 fprintf(stderr, "BgzfStream ERROR: read block failed - invalid block header\n");
347 // copy header contents to compressed buffer
// the header stores (block length - 1), hence the + 1
348 int blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
349 char* compressedBlock = CompressedBlock;
350 memcpy(compressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
// NOTE(review): negative if the stored length is smaller than the header
// length, unless CheckBlockHeader() rules that out - confirm
351 int remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
353 // read remainder of block
354 count = fread(&compressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], 1, remaining, Stream);
355 if ( count != remaining ) {
356 fprintf(stderr, "BgzfStream ERROR: read block failed - could not read data from block\n");
360 // decompress block data
361 count = InflateBlock(blockLength);
363 fprintf(stderr, "BgzfStream ERROR: read block failed - could not decompress block data\n");
368 if ( BlockLength != 0 )
370 BlockAddress = blockAddress;
377 // seek to position in BGZF file
// Moves the stream to the given BGZF position.
// position: packed value whose low 16 bits are an offset inside a block and
//           whose next 48 bits are the file address of that block - the same
//           packing Tell() produces.
// Returns false when the stream is not open.
// NOTE(review): this listing skips lines (see the embedded numbering); the
// return statements after the seek, and any further state updates, are not
// visible - confirm against the full file.
378 bool BgzfStream::Seek(const int64_t& position) {
381 if ( !IsOpen ) return false;
383 // determine adjusted offset & address
// low 16 bits: offset within the block
384 int blockOffset = (position & 0xFFFF);
// remaining bits, limited to 48 by the mask: file address of the block
385 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
387 // attempt seek in file
388 if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) {
389 fprintf(stderr, "BgzfStream ERROR: unable to seek in file\n");
393 // update block data & return success
395 BlockAddress = blockAddress;
396 BlockOffset = blockOffset;
// Sets the IsWriteCompressed flag.
// ok: true makes DeflateBlock() use zlib's default compression level,
//     false makes it use level 0.
400 void BgzfStream::SetWriteCompressed(bool ok) {
401 IsWriteCompressed = ok;
404 // get file position in BGZF file
// Returns the current position packed into one 64-bit value: BlockAddress
// shifted left by 16 bits, combined with the low 16 bits of BlockOffset.
// Seek() unpacks the same layout.
// NOTE(review): lines between the signature and the return are not visible
// in this listing.
405 int64_t BgzfStream::Tell(void) const {
408 return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
411 // writes the supplied data into the BGZF buffer
// Appends dataLen bytes from `data` to the uncompressed block buffer and
// calls FlushBlock() each time the buffer reaches UncompressedBlockSize.
// data:    source bytes.
// dataLen: number of bytes to write.
// Returns the number of bytes accepted, or 0 when the stream is not open
// for writing.
// Data that does not fill the buffer stays there until a later flush.
// NOTE(review): this listing skips lines (see the embedded numbering); the
// statement that advances `input` is not visible - confirm against the full
// file.
412 unsigned int BgzfStream::Write(const char* data, const unsigned int dataLen) {
414 // skip if file not open for writing
// NOTE(review): `false` converts to 0 here because the return type is
// unsigned int
415 if ( !IsOpen || !IsWriteOnly ) return false;
417 // write blocks as needed til all data is written
418 unsigned int numBytesWritten = 0;
419 const char* input = data;
420 unsigned int blockLength = UncompressedBlockSize;
421 while ( numBytesWritten < dataLen ) {
423 // copy data contents to uncompressed output buffer
// copy whichever is smaller: free space in the buffer or bytes still to write
424 unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);
425 char* buffer = UncompressedBlock;
426 memcpy(buffer + BlockOffset, input, copyLength);
429 BlockOffset += copyLength;
431 numBytesWritten += copyLength;
433 // flush (& compress) output buffer when full
434 if ( BlockOffset == blockLength ) FlushBlock();
438 return numBytesWritten;