1 // ***************************************************************************
\r
2 // BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 16 August 2010 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // BGZF routines were adapted from the bgzf.c code developed at the Broad
\r
10 // ---------------------------------------------------------------------------
\r
11 // Provides the basic functionality for reading & writing BGZF files
\r
12 // ***************************************************************************
\r
14 #include <algorithm>
\r
16 using namespace BamTools;
\r
20 BgzfData::BgzfData(void)
\r
21 : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
\r
22 , CompressedBlockSize(MAX_BLOCK_SIZE)
\r
27 , IsWriteOnly(false)
\r
28 , IsWriteUncompressed(false)
\r
30 , UncompressedBlock(NULL)
\r
31 , CompressedBlock(NULL)
\r
34 CompressedBlock = new char[CompressedBlockSize];
\r
35 UncompressedBlock = new char[UncompressedBlockSize];
\r
36 } catch( std::bad_alloc& ba ) {
\r
37 fprintf(stderr, "BGZF ERROR: unable to allocate memory for our BGZF object.\n");
\r
43 BgzfData::~BgzfData(void) {
\r
44 if( CompressedBlock ) delete[] CompressedBlock;
\r
45 if( UncompressedBlock ) delete[] UncompressedBlock;
\r
49 void BgzfData::Close(void) {
\r
51 // skip if file not open, otherwise set flag
\r
52 if ( !IsOpen ) return;
\r
54 // if writing to file, flush the current BGZF block,
\r
55 // then write an empty block (as EOF marker)
\r
56 if ( IsWriteOnly ) {
\r
58 int blockLength = DeflateBlock();
\r
59 fwrite(CompressedBlock, 1, blockLength, Stream);
\r
65 IsWriteUncompressed = false;
\r
69 // compresses the current block
\r
70 int BgzfData::DeflateBlock(void) {
\r
72 // initialize the gzip header
\r
73 char* buffer = CompressedBlock;
\r
74 memset(buffer, 0, 18);
\r
75 buffer[0] = GZIP_ID1;
\r
76 buffer[1] = (char)GZIP_ID2;
\r
77 buffer[2] = CM_DEFLATE;
\r
78 buffer[3] = FLG_FEXTRA;
\r
79 buffer[9] = (char)OS_UNKNOWN;
\r
80 buffer[10] = BGZF_XLEN;
\r
81 buffer[12] = BGZF_ID1;
\r
82 buffer[13] = BGZF_ID2;
\r
83 buffer[14] = BGZF_LEN;
\r
85 // set compression level
\r
86 const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION );
\r
88 // loop to retry for blocks that do not compress enough
\r
89 int inputLength = BlockOffset;
\r
90 int compressedLength = 0;
\r
91 unsigned int bufferSize = CompressedBlockSize;
\r
95 // initialize zstream values
\r
99 zs.next_in = (Bytef*)UncompressedBlock;
\r
100 zs.avail_in = inputLength;
\r
101 zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
\r
102 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
\r
104 // initialize the zlib compression algorithm
\r
105 if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) {
\r
106 fprintf(stderr, "BGZF ERROR: zlib deflate initialization failed.\n");
\r
110 // compress the data
\r
111 int status = deflate(&zs, Z_FINISH);
\r
112 if ( status != Z_STREAM_END ) {
\r
116 // reduce the input length and try again
\r
117 if ( status == Z_OK ) {
\r
118 inputLength -= 1024;
\r
119 if( inputLength < 0 ) {
\r
120 fprintf(stderr, "BGZF ERROR: input reduction failed.\n");
\r
126 fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n");
\r
130 // finalize the compression routine
\r
131 if ( deflateEnd(&zs) != Z_OK ) {
\r
132 fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n");
\r
136 compressedLength = zs.total_out;
\r
137 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
\r
138 if ( compressedLength > MAX_BLOCK_SIZE ) {
\r
139 fprintf(stderr, "BGZF ERROR: deflate overflow.\n");
\r
146 // store the compressed length
\r
147 BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
\r
149 // store the CRC32 checksum
\r
150 unsigned int crc = crc32(0, NULL, 0);
\r
151 crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);
\r
152 BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);
\r
153 BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
\r
155 // ensure that we have less than a block of data left
\r
156 int remaining = BlockOffset - inputLength;
\r
157 if ( remaining > 0 ) {
\r
158 if ( remaining > inputLength ) {
\r
159 fprintf(stderr, "BGZF ERROR: after deflate, remainder too large.\n");
\r
162 memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);
\r
165 BlockOffset = remaining;
\r
166 return compressedLength;
\r
169 // flushes the data in the BGZF block
\r
170 void BgzfData::FlushBlock(void) {
\r
172 // flush all of the remaining blocks
\r
173 while ( BlockOffset > 0 ) {
\r
175 // compress the data block
\r
176 int blockLength = DeflateBlock();
\r
178 // flush the data to our output stream
\r
179 int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);
\r
181 if ( numBytesWritten != blockLength ) {
\r
182 fprintf(stderr, "BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
\r
186 BlockAddress += blockLength;
\r
190 // de-compresses the current block
\r
191 int BgzfData::InflateBlock(const int& blockLength) {
\r
193 // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
\r
197 zs.next_in = (Bytef*)CompressedBlock + 18;
\r
198 zs.avail_in = blockLength - 16;
\r
199 zs.next_out = (Bytef*)UncompressedBlock;
\r
200 zs.avail_out = UncompressedBlockSize;
\r
202 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
\r
203 if ( status != Z_OK ) {
\r
204 fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n");
\r
208 status = inflate(&zs, Z_FINISH);
\r
209 if ( status != Z_STREAM_END ) {
\r
211 fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflate() failed\n");
\r
215 status = inflateEnd(&zs);
\r
216 if ( status != Z_OK ) {
\r
217 fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n");
\r
221 return zs.total_out;
\r
224 // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)
\r
225 bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) {
\r
227 // determine open mode
\r
228 if ( strcmp(mode, "rb") == 0 )
\r
229 IsWriteOnly = false;
\r
230 else if ( strcmp(mode, "wb") == 0)
\r
231 IsWriteOnly = true;
\r
233 fprintf(stderr, "BGZF ERROR: unknown file mode: %s\n", mode);
\r
237 // ----------------------------------------------------------------
\r
238 // open Stream to read to/write from file, stdin, or stdout
\r
239 // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03)
\r
241 // read/write BGZF data to/from a file
\r
242 if ( (filename != "stdin") && (filename != "stdout") )
\r
243 Stream = fopen(filename.c_str(), mode);
\r
245 // read BGZF data from stdin
\r
246 else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) )
\r
247 Stream = freopen(NULL, mode, stdin);
\r
249 // write BGZF data to stdout
\r
250 else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) )
\r
251 Stream = freopen(NULL, mode, stdout);
\r
254 fprintf(stderr, "BGZF ERROR: unable to open file %s\n", filename.c_str() );
\r
258 // set flags, return success
\r
260 IsWriteUncompressed = isWriteUncompressed;
\r
264 // reads BGZF data into a byte buffer
\r
265 int BgzfData::Read(char* data, const unsigned int dataLength) {
\r
267 if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0;
\r
269 char* output = data;
\r
270 unsigned int numBytesRead = 0;
\r
271 while ( numBytesRead < dataLength ) {
\r
273 int bytesAvailable = BlockLength - BlockOffset;
\r
274 if ( bytesAvailable <= 0 ) {
\r
275 if ( !ReadBlock() ) return -1;
\r
276 bytesAvailable = BlockLength - BlockOffset;
\r
277 if ( bytesAvailable <= 0 ) break;
\r
280 char* buffer = UncompressedBlock;
\r
281 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
\r
282 memcpy(output, buffer + BlockOffset, copyLength);
\r
284 BlockOffset += copyLength;
\r
285 output += copyLength;
\r
286 numBytesRead += copyLength;
\r
289 if ( BlockOffset == BlockLength ) {
\r
290 BlockAddress = ftell64(Stream);
\r
295 return numBytesRead;
\r
298 // reads a BGZF block
\r
299 bool BgzfData::ReadBlock(void) {
\r
301 char header[BLOCK_HEADER_LENGTH];
\r
302 int64_t blockAddress = ftell64(Stream);
\r
304 int count = fread(header, 1, sizeof(header), Stream);
\r
305 if ( count == 0 ) {
\r
310 if ( count != sizeof(header) ) {
\r
311 fprintf(stderr, "BGZF ERROR: read block failed - could not read block header\n");
\r
315 if ( !BgzfData::CheckBlockHeader(header) ) {
\r
316 fprintf(stderr, "BGZF ERROR: read block failed - invalid block header\n");
\r
320 int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;
\r
321 char* compressedBlock = CompressedBlock;
\r
322 memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
\r
323 int remaining = blockLength - BLOCK_HEADER_LENGTH;
\r
325 count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);
\r
326 if ( count != remaining ) {
\r
327 fprintf(stderr, "BGZF ERROR: read block failed - could not read data from block\n");
\r
331 count = InflateBlock(blockLength);
\r
332 if ( count < 0 ) {
\r
333 fprintf(stderr, "BGZF ERROR: read block failed - could not decompress block data\n");
\r
337 if ( BlockLength != 0 )
\r
340 BlockAddress = blockAddress;
\r
341 BlockLength = count;
\r
345 // seek to position in BGZF file
\r
346 bool BgzfData::Seek(int64_t position) {
\r
348 if ( !IsOpen ) return false;
\r
350 int blockOffset = (position & 0xFFFF);
\r
351 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
\r
353 if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) {
\r
354 fprintf(stderr, "BGZF ERROR: unable to seek in file\n");
\r
359 BlockAddress = blockAddress;
\r
360 BlockOffset = blockOffset;
\r
364 // get file position in BGZF file
\r
365 int64_t BgzfData::Tell(void) {
\r
369 return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
\r
372 // writes the supplied data into the BGZF buffer
\r
373 unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {
\r
375 if ( !IsOpen || !IsWriteOnly ) return false;
\r
378 unsigned int numBytesWritten = 0;
\r
379 const char* input = data;
\r
380 unsigned int blockLength = UncompressedBlockSize;
\r
382 // copy the data to the buffer
\r
383 while ( numBytesWritten < dataLen ) {
\r
385 unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);
\r
386 char* buffer = UncompressedBlock;
\r
387 memcpy(buffer + BlockOffset, input, copyLength);
\r
389 BlockOffset += copyLength;
\r
390 input += copyLength;
\r
391 numBytesWritten += copyLength;
\r
393 if ( BlockOffset == blockLength )
\r
397 return numBytesWritten;
\r