1 // ***************************************************************************
2 // BgzfStream_p.cpp (c) 2011 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 2 September 2011(DB)
6 // ---------------------------------------------------------------------------
7 // Based on BGZF routines developed at the Broad Institute.
8 // Provides the basic functionality for reading & writing BGZF files
9 // Replaces the old BGZF.* files to avoid clashing with other toolkits
10 // ***************************************************************************
12 #include <api/internal/BgzfStream_p.h>
13 using namespace BamTools;
14 using namespace BamTools::Internal;
21 BgzfStream::BgzfStream(void)
22 : UncompressedBlockSize(Constants::BGZF_DEFAULT_BLOCK_SIZE)
23 , CompressedBlockSize(Constants::BGZF_MAX_BLOCK_SIZE)
29 , IsWriteCompressed(true)
31 , UncompressedBlock(NULL)
32 , CompressedBlock(NULL)
35 CompressedBlock = new char[CompressedBlockSize];
36 UncompressedBlock = new char[UncompressedBlockSize];
37 } catch( std::bad_alloc& ba ) {
38 fprintf(stderr, "BgzfStream ERROR: unable to allocate memory\n");
44 BgzfStream::~BgzfStream(void) {
45 if( CompressedBlock ) delete[] CompressedBlock;
46 if( UncompressedBlock ) delete[] UncompressedBlock;
50 void BgzfStream::Close(void) {
52 // skip if file not open
53 if ( !IsOpen ) return;
55 // if writing to file, flush the current BGZF block,
56 // then write an empty block (as EOF marker)
59 int blockLength = DeflateBlock();
60 fwrite(CompressedBlock, 1, blockLength, Stream);
63 // flush and close stream
68 IsWriteCompressed = true;
72 // compresses the current block
73 int BgzfStream::DeflateBlock(void) {
75 // initialize the gzip header
76 char* buffer = CompressedBlock;
77 memset(buffer, 0, 18);
78 buffer[0] = Constants::GZIP_ID1;
79 buffer[1] = (char)Constants::GZIP_ID2;
80 buffer[2] = Constants::CM_DEFLATE;
81 buffer[3] = Constants::FLG_FEXTRA;
82 buffer[9] = (char)Constants::OS_UNKNOWN;
83 buffer[10] = Constants::BGZF_XLEN;
84 buffer[12] = Constants::BGZF_ID1;
85 buffer[13] = Constants::BGZF_ID2;
86 buffer[14] = Constants::BGZF_LEN;
88 // set compression level
89 const int compressionLevel = ( IsWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
91 // loop to retry for blocks that do not compress enough
92 int inputLength = BlockOffset;
93 int compressedLength = 0;
94 unsigned int bufferSize = CompressedBlockSize;
98 // initialize zstream values
102 zs.next_in = (Bytef*)UncompressedBlock;
103 zs.avail_in = inputLength;
104 zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH];
105 zs.avail_out = bufferSize - Constants::BGZF_BLOCK_HEADER_LENGTH - Constants::BGZF_BLOCK_FOOTER_LENGTH;
107 // initialize the zlib compression algorithm
108 if ( deflateInit2(&zs,
111 Constants::GZIP_WINDOW_BITS,
112 Constants::Z_DEFAULT_MEM_LEVEL,
113 Z_DEFAULT_STRATEGY) != Z_OK )
115 fprintf(stderr, "BgzfStream ERROR: zlib deflate initialization failed\n");
120 int status = deflate(&zs, Z_FINISH);
121 if ( status != Z_STREAM_END ) {
125 // reduce the input length and try again
126 if ( status == Z_OK ) {
128 if ( inputLength < 0 ) {
129 fprintf(stderr, "BgzfStream ERROR: input reduction failed\n");
135 fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n");
139 // finalize the compression routine
140 if ( deflateEnd(&zs) != Z_OK ) {
141 fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n");
145 compressedLength = zs.total_out;
146 compressedLength += Constants::BGZF_BLOCK_HEADER_LENGTH + Constants::BGZF_BLOCK_FOOTER_LENGTH;
147 if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE ) {
148 fprintf(stderr, "BgzfStream ERROR: deflate overflow\n");
155 // store the compressed length
156 BamTools::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
158 // store the CRC32 checksum
159 unsigned int crc = crc32(0, NULL, 0);
160 crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);
161 BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc);
162 BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
164 // ensure that we have less than a block of data left
165 int remaining = BlockOffset - inputLength;
166 if ( remaining > 0 ) {
167 if ( remaining > inputLength ) {
168 fprintf(stderr, "BgzfStream ERROR: after deflate, remainder too large\n");
171 memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);
175 BlockOffset = remaining;
178 return compressedLength;
181 // flushes the data in the BGZF block
182 void BgzfStream::FlushBlock(void) {
184 // flush all of the remaining blocks
185 while ( BlockOffset > 0 ) {
187 // compress the data block
188 int blockLength = DeflateBlock();
190 // flush the data to our output stream
191 int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);
192 if ( numBytesWritten != blockLength ) {
193 fprintf(stderr, "BgzfStream ERROR: expected to write %u bytes during flushing, but wrote %u bytes\n",
194 blockLength, numBytesWritten);
199 BlockAddress += blockLength;
203 // decompresses the current block
204 int BgzfStream::InflateBlock(const int& blockLength) {
206 // inflate the data from compressed buffer into uncompressed buffer
210 zs.next_in = (Bytef*)CompressedBlock + 18;
211 zs.avail_in = blockLength - 16;
212 zs.next_out = (Bytef*)UncompressedBlock;
213 zs.avail_out = UncompressedBlockSize;
215 int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
216 if ( status != Z_OK ) {
217 fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateInit() failed\n");
221 status = inflate(&zs, Z_FINISH);
222 if ( status != Z_STREAM_END ) {
224 fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflate() failed\n");
228 status = inflateEnd(&zs);
229 if ( status != Z_OK ) {
230 fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateEnd() failed\n");
238 // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)
239 bool BgzfStream::Open(const string& filename, const char* mode) {
241 // close current stream, if necessary, before opening next
242 if ( IsOpen ) Close();
244 // determine open mode
245 if ( strcmp(mode, "rb") == 0 )
247 else if ( strcmp(mode, "wb") == 0)
250 fprintf(stderr, "BgzfStream ERROR: unknown file mode: %s\n", mode);
254 // open BGZF stream on a file
255 if ( (filename != "stdin") && (filename != "stdout") && (filename != "-"))
256 Stream = fopen(filename.c_str(), mode);
258 // open BGZF stream on stdin
259 else if ( (filename == "stdin" || filename == "-") && (strcmp(mode, "rb") == 0 ) )
260 Stream = freopen(NULL, mode, stdin);
262 // open BGZF stream on stdout
263 else if ( (filename == "stdout" || filename == "-") && (strcmp(mode, "wb") == 0) )
264 Stream = freopen(NULL, mode, stdout);
267 fprintf(stderr, "BgzfStream ERROR: unable to open file %s\n", filename.c_str() );
271 // set flag & return success
276 // reads BGZF data into a byte buffer
277 int BgzfStream::Read(char* data, const unsigned int dataLength) {
279 // if stream not open for reading (or empty request)
280 if ( !IsOpen || IsWriteOnly || dataLength == 0 )
283 // read blocks as needed until desired data length is retrieved
285 unsigned int numBytesRead = 0;
286 while ( numBytesRead < dataLength ) {
288 // determine bytes available in current block
289 int bytesAvailable = BlockLength - BlockOffset;
291 // read (and decompress) next block if needed
292 if ( bytesAvailable <= 0 ) {
293 if ( !ReadBlock() ) return -1;
294 bytesAvailable = BlockLength - BlockOffset;
295 if ( bytesAvailable <= 0 ) break;
298 // copy data from uncompressed source buffer into data destination buffer
299 char* buffer = UncompressedBlock;
300 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
301 memcpy(output, buffer + BlockOffset, copyLength);
304 BlockOffset += copyLength;
305 output += copyLength;
306 numBytesRead += copyLength;
310 if ( BlockOffset == BlockLength ) {
311 BlockAddress = ftell64(Stream);
319 // reads a BGZF block
320 bool BgzfStream::ReadBlock(void) {
322 char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
323 int64_t blockAddress = ftell64(Stream);
325 // read block header from file
326 int count = fread(header, 1, sizeof(header), Stream);
328 // if block header empty
334 // if block header invalid size
335 if ( count != sizeof(header) ) {
336 fprintf(stderr, "BgzfStream ERROR: read block failed - could not read block header\n");
340 // validate block header contents
341 if ( !BgzfStream::CheckBlockHeader(header) ) {
342 fprintf(stderr, "BgzfStream ERROR: read block failed - invalid block header\n");
346 // copy header contents to compressed buffer
347 int blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
348 char* compressedBlock = CompressedBlock;
349 memcpy(compressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
350 int remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
352 // read remainder of block
353 count = fread(&compressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], 1, remaining, Stream);
354 if ( count != remaining ) {
355 fprintf(stderr, "BgzfStream ERROR: read block failed - could not read data from block\n");
359 // decompress block data
360 count = InflateBlock(blockLength);
362 fprintf(stderr, "BgzfStream ERROR: read block failed - could not decompress block data\n");
367 if ( BlockLength != 0 )
369 BlockAddress = blockAddress;
376 // seek to position in BGZF file
377 bool BgzfStream::Seek(const int64_t& position) {
380 if ( !IsOpen ) return false;
382 // determine adjusted offset & address
383 int blockOffset = (position & 0xFFFF);
384 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
386 // attempt seek in file
387 if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) {
388 fprintf(stderr, "BgzfStream ERROR: unable to seek in file\n");
392 // update block data & return success
394 BlockAddress = blockAddress;
395 BlockOffset = blockOffset;
399 void BgzfStream::SetWriteCompressed(bool ok) {
400 IsWriteCompressed = ok;
403 // get file position in BGZF file
404 int64_t BgzfStream::Tell(void) const {
407 return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
410 // writes the supplied data into the BGZF buffer
411 unsigned int BgzfStream::Write(const char* data, const unsigned int dataLen) {
413 // skip if file not open for writing
414 if ( !IsOpen || !IsWriteOnly ) return false;
416 // write blocks as needed til all data is written
417 unsigned int numBytesWritten = 0;
418 const char* input = data;
419 unsigned int blockLength = UncompressedBlockSize;
420 while ( numBytesWritten < dataLen ) {
422 // copy data contents to uncompressed output buffer
423 unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);
424 char* buffer = UncompressedBlock;
425 memcpy(buffer + BlockOffset, input, copyLength);
428 BlockOffset += copyLength;
430 numBytesWritten += copyLength;
432 // flush (& compress) output buffer when full
433 if ( BlockOffset == blockLength ) FlushBlock();
437 return numBytesWritten;