1 // ***************************************************************************
\r
2 // BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 19 July 2010 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // BGZF routines were adapted from the bgzf.c code developed at the Broad
\r
10 // ---------------------------------------------------------------------------
\r
11 // Provides the basic functionality for reading & writing BGZF files
\r
12 // ***************************************************************************
\r
14 #include <algorithm>
\r
16 using namespace BamTools;
\r
20 BgzfData::BgzfData(void)
\r
21 : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
\r
22 , CompressedBlockSize(MAX_BLOCK_SIZE)
\r
27 , IsWriteOnly(false)
\r
29 , UncompressedBlock(NULL)
\r
30 , CompressedBlock(NULL)
\r
33 CompressedBlock = new char[CompressedBlockSize];
\r
34 UncompressedBlock = new char[UncompressedBlockSize];
\r
35 } catch( std::bad_alloc& ba ) {
\r
36 printf("BGZF ERROR: unable to allocate memory for our BGZF object.\n");
\r
42 BgzfData::~BgzfData(void) {
\r
43 if( CompressedBlock ) { delete[] CompressedBlock; }
\r
44 if( UncompressedBlock ) { delete[] UncompressedBlock; }
\r
48 void BgzfData::Close(void) {
\r
50 // skip if file not open, otherwise set flag
\r
51 if ( !IsOpen ) return;
\r
53 // if writing to file, flush the current BGZF block,
\r
54 // then write an empty block (as EOF marker)
\r
55 if ( IsWriteOnly ) {
\r
57 int blockLength = DeflateBlock();
\r
58 fwrite(CompressedBlock, 1, blockLength, Stream);
\r
67 // compresses the current block
\r
68 int BgzfData::DeflateBlock(void) {
\r
70 // initialize the gzip header
\r
71 char* buffer = CompressedBlock;
\r
72 memset(buffer, 0, 18);
\r
73 buffer[0] = GZIP_ID1;
\r
74 buffer[1] = (char)GZIP_ID2;
\r
75 buffer[2] = CM_DEFLATE;
\r
76 buffer[3] = FLG_FEXTRA;
\r
77 buffer[9] = (char)OS_UNKNOWN;
\r
78 buffer[10] = BGZF_XLEN;
\r
79 buffer[12] = BGZF_ID1;
\r
80 buffer[13] = BGZF_ID2;
\r
81 buffer[14] = BGZF_LEN;
\r
83 // loop to retry for blocks that do not compress enough
\r
84 int inputLength = BlockOffset;
\r
85 int compressedLength = 0;
\r
86 unsigned int bufferSize = CompressedBlockSize;
\r
90 // initialize zstream values
\r
94 zs.next_in = (Bytef*)UncompressedBlock;
\r
95 zs.avail_in = inputLength;
\r
96 zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
\r
97 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
\r
99 // initialize the zlib compression algorithm
\r
100 if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
\r
101 printf("BGZF ERROR: zlib deflate initialization failed.\n");
\r
105 // compress the data
\r
106 int status = deflate(&zs, Z_FINISH);
\r
107 if(status != Z_STREAM_END) {
\r
111 // reduce the input length and try again
\r
112 if(status == Z_OK) {
\r
113 inputLength -= 1024;
\r
114 if(inputLength < 0) {
\r
115 printf("BGZF ERROR: input reduction failed.\n");
\r
121 printf("BGZF ERROR: zlib::deflateEnd() failed.\n");
\r
125 // finalize the compression routine
\r
126 if(deflateEnd(&zs) != Z_OK) {
\r
127 printf("BGZF ERROR: zlib::deflateEnd() failed.\n");
\r
131 compressedLength = zs.total_out;
\r
132 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
\r
133 if(compressedLength > MAX_BLOCK_SIZE) {
\r
134 printf("BGZF ERROR: deflate overflow.\n");
\r
141 // store the compressed length
\r
142 BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
\r
144 // store the CRC32 checksum
\r
145 unsigned int crc = crc32(0, NULL, 0);
\r
146 crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);
\r
147 BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);
\r
148 BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
\r
150 // ensure that we have less than a block of data left
\r
151 int remaining = BlockOffset - inputLength;
\r
152 if(remaining > 0) {
\r
153 if(remaining > inputLength) {
\r
154 printf("BGZF ERROR: after deflate, remainder too large.\n");
\r
157 memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);
\r
160 BlockOffset = remaining;
\r
161 return compressedLength;
\r
164 // flushes the data in the BGZF block
\r
165 void BgzfData::FlushBlock(void) {
\r
167 // flush all of the remaining blocks
\r
168 while(BlockOffset > 0) {
\r
170 // compress the data block
\r
171 int blockLength = DeflateBlock();
\r
173 // flush the data to our output stream
\r
174 int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);
\r
176 if(numBytesWritten != blockLength) {
\r
177 printf("BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
\r
181 BlockAddress += blockLength;
\r
185 // de-compresses the current block
\r
186 int BgzfData::InflateBlock(const int& blockLength) {
\r
188 // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
\r
192 zs.next_in = (Bytef*)CompressedBlock + 18;
\r
193 zs.avail_in = blockLength - 16;
\r
194 zs.next_out = (Bytef*)UncompressedBlock;
\r
195 zs.avail_out = UncompressedBlockSize;
\r
197 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
\r
198 if (status != Z_OK) {
\r
199 printf("BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n");
\r
203 status = inflate(&zs, Z_FINISH);
\r
204 if (status != Z_STREAM_END) {
\r
206 printf("BGZF ERROR: could not decompress block - zlib::inflate() failed\n");
\r
210 status = inflateEnd(&zs);
\r
211 if (status != Z_OK) {
\r
212 printf("BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n");
\r
216 return zs.total_out;
\r
219 // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)
\r
220 bool BgzfData::Open(const string& filename, const char* mode) {
\r
222 // determine open mode
\r
223 if ( strcmp(mode, "rb") == 0 ) {
\r
224 IsWriteOnly = false;
\r
225 } else if ( strcmp(mode, "wb") == 0) {
\r
226 IsWriteOnly = true;
\r
228 printf("BGZF ERROR: unknown file mode: %s\n", mode);
\r
232 // open Stream to read to/write from file, stdin, or stdout
\r
233 // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03)
\r
234 if ( (filename != "stdin") && (filename != "stdout") ) {
\r
235 // read/write BGZF data to/from a file
\r
236 // Stream = fopen64(filename.c_str(), mode);
\r
237 Stream = fopen(filename.c_str(), mode);
\r
239 else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) {
\r
240 // read BGZF data from stdin
\r
241 // Stream = freopen64(NULL, mode, stdin);
\r
242 Stream = freopen(NULL, mode, stdin);
\r
244 else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) {
\r
245 // write BGZF data to stdout
\r
246 // Stream = freopen64(NULL, mode, stdout);
\r
247 Stream = freopen(NULL, mode, stdout);
\r
251 printf("BGZF ERROR: unable to open file %s\n", filename.c_str() );
\r
255 // set flag, return success
\r
260 // reads BGZF data into a byte buffer
\r
261 int BgzfData::Read(char* data, const unsigned int dataLength) {
\r
263 if (dataLength == 0) return 0;
\r
265 char* output = data;
\r
266 unsigned int numBytesRead = 0;
\r
267 while (numBytesRead < dataLength) {
\r
269 int bytesAvailable = BlockLength - BlockOffset;
\r
270 if ( bytesAvailable <= 0 ) {
\r
271 if (!ReadBlock()) return -1;
\r
272 bytesAvailable = BlockLength - BlockOffset;
\r
273 if (bytesAvailable <= 0) break;
\r
276 char* buffer = UncompressedBlock;
\r
277 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
\r
278 memcpy(output, buffer + BlockOffset, copyLength);
\r
280 BlockOffset += copyLength;
\r
281 output += copyLength;
\r
282 numBytesRead += copyLength;
\r
285 if ( BlockOffset == BlockLength ) {
\r
286 BlockAddress = ftell64(Stream);
\r
291 return numBytesRead;
\r
294 // reads a BGZF block
\r
295 bool BgzfData::ReadBlock(void) {
\r
297 char header[BLOCK_HEADER_LENGTH];
\r
298 int64_t blockAddress = ftell64(Stream);
\r
300 int count = fread(header, 1, sizeof(header), Stream);
\r
306 if (count != sizeof(header)) {
\r
307 printf("BGZF ERROR: read block failed - could not read block header\n");
\r
311 if (!BgzfData::CheckBlockHeader(header)) {
\r
312 printf("BGZF ERROR: read block failed - invalid block header\n");
\r
316 int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;
\r
317 char* compressedBlock = CompressedBlock;
\r
318 memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
\r
319 int remaining = blockLength - BLOCK_HEADER_LENGTH;
\r
321 count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);
\r
322 if (count != remaining) {
\r
323 printf("BGZF ERROR: read block failed - could not read data from block\n");
\r
327 count = InflateBlock(blockLength);
\r
329 printf("BGZF ERROR: read block failed - could not decompress block data\n");
\r
333 if ( BlockLength != 0 )
\r
336 BlockAddress = blockAddress;
\r
337 BlockLength = count;
\r
341 // seek to position in BGZF file
\r
342 bool BgzfData::Seek(int64_t position) {
\r
344 int blockOffset = (position & 0xFFFF);
\r
345 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
\r
347 if (fseek64(Stream, blockAddress, SEEK_SET) != 0) {
\r
348 printf("BGZF ERROR: unable to seek in file\n");
\r
353 BlockAddress = blockAddress;
\r
354 BlockOffset = blockOffset;
\r
358 // get file position in BGZF file
\r
359 int64_t BgzfData::Tell(void) {
\r
360 return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
\r
363 // writes the supplied data into the BGZF buffer
\r
364 unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {
\r
367 unsigned int numBytesWritten = 0;
\r
368 const char* input = data;
\r
369 unsigned int blockLength = UncompressedBlockSize;
\r
371 // copy the data to the buffer
\r
372 while(numBytesWritten < dataLen) {
\r
374 unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);
\r
375 char* buffer = UncompressedBlock;
\r
376 memcpy(buffer + BlockOffset, input, copyLength);
\r
378 BlockOffset += copyLength;
\r
379 input += copyLength;
\r
380 numBytesWritten += copyLength;
\r
382 if(BlockOffset == blockLength)
\r
386 return numBytesWritten;
\r