1 // ***************************************************************************
\r
2 // BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 9 July 2010 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // BGZF routines were adapted from the bgzf.c code developed at the Broad
\r
10 // ---------------------------------------------------------------------------
\r
11 // Provides the basic functionality for reading & writing BGZF files
\r
12 // ***************************************************************************
\r
14 #include <algorithm>
\r
16 using namespace BamTools;
\r
20 BgzfData::BgzfData(void)
\r
21 : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
\r
22 , CompressedBlockSize(MAX_BLOCK_SIZE)
\r
27 , IsWriteOnly(false)
\r
29 , UncompressedBlock(NULL)
\r
30 , CompressedBlock(NULL)
\r
33 CompressedBlock = new char[CompressedBlockSize];
\r
34 UncompressedBlock = new char[UncompressedBlockSize];
\r
35 } catch( std::bad_alloc& ba ) {
\r
36 printf("BGZF ERROR: unable to allocate memory for our BGZF object.\n");
\r
42 BgzfData::~BgzfData(void) {
\r
43 if( CompressedBlock ) { delete[] CompressedBlock; }
\r
44 if( UncompressedBlock ) { delete[] UncompressedBlock; }
\r
48 void BgzfData::Close(void) {
\r
50 // skip if file not open, otherwise set flag
\r
51 if ( !IsOpen ) return;
\r
54 // flush the current BGZF block
\r
55 if ( IsWriteOnly ) FlushBlock();
\r
57 // write an empty block (as EOF marker)
\r
58 int blockLength = DeflateBlock();
\r
59 fwrite(CompressedBlock, 1, blockLength, Stream);
\r
66 // compresses the current block
\r
67 int BgzfData::DeflateBlock(void) {
\r
69 // initialize the gzip header
\r
70 char* buffer = CompressedBlock;
\r
71 memset(buffer, 0, 18);
\r
72 buffer[0] = GZIP_ID1;
\r
73 buffer[1] = (char)GZIP_ID2;
\r
74 buffer[2] = CM_DEFLATE;
\r
75 buffer[3] = FLG_FEXTRA;
\r
76 buffer[9] = (char)OS_UNKNOWN;
\r
77 buffer[10] = BGZF_XLEN;
\r
78 buffer[12] = BGZF_ID1;
\r
79 buffer[13] = BGZF_ID2;
\r
80 buffer[14] = BGZF_LEN;
\r
82 // loop to retry for blocks that do not compress enough
\r
83 int inputLength = BlockOffset;
\r
84 int compressedLength = 0;
\r
85 unsigned int bufferSize = CompressedBlockSize;
\r
89 // initialize zstream values
\r
93 zs.next_in = (Bytef*)UncompressedBlock;
\r
94 zs.avail_in = inputLength;
\r
95 zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
\r
96 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
\r
98 // initialize the zlib compression algorithm
\r
99 if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
\r
100 printf("BGZF ERROR: zlib deflate initialization failed.\n");
\r
104 // compress the data
\r
105 int status = deflate(&zs, Z_FINISH);
\r
106 if(status != Z_STREAM_END) {
\r
110 // reduce the input length and try again
\r
111 if(status == Z_OK) {
\r
112 inputLength -= 1024;
\r
113 if(inputLength < 0) {
\r
114 printf("BGZF ERROR: input reduction failed.\n");
\r
120 printf("BGZF ERROR: zlib::deflateEnd() failed.\n");
\r
124 // finalize the compression routine
\r
125 if(deflateEnd(&zs) != Z_OK) {
\r
126 printf("BGZF ERROR: zlib::deflateEnd() failed.\n");
\r
130 compressedLength = zs.total_out;
\r
131 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
\r
132 if(compressedLength > MAX_BLOCK_SIZE) {
\r
133 printf("BGZF ERROR: deflate overflow.\n");
\r
140 // store the compressed length
\r
141 BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
\r
143 // store the CRC32 checksum
\r
144 unsigned int crc = crc32(0, NULL, 0);
\r
145 crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);
\r
146 BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);
\r
147 BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
\r
149 // ensure that we have less than a block of data left
\r
150 int remaining = BlockOffset - inputLength;
\r
151 if(remaining > 0) {
\r
152 if(remaining > inputLength) {
\r
153 printf("BGZF ERROR: after deflate, remainder too large.\n");
\r
156 memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);
\r
159 BlockOffset = remaining;
\r
160 return compressedLength;
\r
163 // flushes the data in the BGZF block
\r
164 void BgzfData::FlushBlock(void) {
\r
166 // flush all of the remaining blocks
\r
167 while(BlockOffset > 0) {
\r
169 // compress the data block
\r
170 int blockLength = DeflateBlock();
\r
172 // flush the data to our output stream
\r
173 int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);
\r
175 if(numBytesWritten != blockLength) {
\r
176 printf("BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
\r
180 BlockAddress += blockLength;
\r
184 // de-compresses the current block
\r
185 int BgzfData::InflateBlock(const int& blockLength) {
\r
187 // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
\r
191 zs.next_in = (Bytef*)CompressedBlock + 18;
\r
192 zs.avail_in = blockLength - 16;
\r
193 zs.next_out = (Bytef*)UncompressedBlock;
\r
194 zs.avail_out = UncompressedBlockSize;
\r
196 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
\r
197 if (status != Z_OK) {
\r
198 printf("BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n");
\r
202 status = inflate(&zs, Z_FINISH);
\r
203 if (status != Z_STREAM_END) {
\r
205 printf("BGZF ERROR: could not decompress block - zlib::inflate() failed\n");
\r
209 status = inflateEnd(&zs);
\r
210 if (status != Z_OK) {
\r
211 printf("BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n");
\r
215 return zs.total_out;
\r
218 // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)
\r
219 bool BgzfData::Open(const string& filename, const char* mode) {
\r
221 // determine open mode
\r
222 if ( strcmp(mode, "rb") == 0 ) {
\r
223 IsWriteOnly = false;
\r
224 } else if ( strcmp(mode, "wb") == 0) {
\r
225 IsWriteOnly = true;
\r
227 printf("BGZF ERROR: unknown file mode: %s\n", mode);
\r
231 // open Stream to read to/write from file, stdin, or stdout
\r
232 // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03)
\r
233 if ( (filename != "stdin") && (filename != "stdout") ) {
\r
234 // read/write BGZF data to/from a file
\r
235 Stream = fopen64(filename.c_str(), mode);
\r
237 else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) {
\r
238 // read BGZF data from stdin
\r
239 Stream = freopen64(NULL, mode, stdin);
\r
241 else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) {
\r
242 // write BGZF data to stdout
\r
243 Stream = freopen64(NULL, mode, stdout);
\r
247 printf("BGZF ERROR: unable to open file %s\n", filename.c_str() );
\r
251 // set flag, return success
\r
256 // reads BGZF data into a byte buffer
\r
257 int BgzfData::Read(char* data, const unsigned int dataLength) {
\r
259 if (dataLength == 0) return 0;
\r
261 char* output = data;
\r
262 unsigned int numBytesRead = 0;
\r
263 while (numBytesRead < dataLength) {
\r
265 int bytesAvailable = BlockLength - BlockOffset;
\r
266 if ( bytesAvailable <= 0 ) {
\r
267 if (!ReadBlock()) return -1;
\r
268 bytesAvailable = BlockLength - BlockOffset;
\r
269 if (bytesAvailable <= 0) break;
\r
272 char* buffer = UncompressedBlock;
\r
273 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
\r
274 memcpy(output, buffer + BlockOffset, copyLength);
\r
276 BlockOffset += copyLength;
\r
277 output += copyLength;
\r
278 numBytesRead += copyLength;
\r
281 if ( BlockOffset == BlockLength ) {
\r
282 BlockAddress = ftello(Stream);
\r
287 return numBytesRead;
\r
290 // reads a BGZF block
\r
291 bool BgzfData::ReadBlock(void) {
\r
293 char header[BLOCK_HEADER_LENGTH];
\r
294 int64_t blockAddress = ftello(Stream);
\r
296 int count = fread(header, 1, sizeof(header), Stream);
\r
302 if (count != sizeof(header)) {
\r
303 printf("BGZF ERROR: read block failed - could not read block header\n");
\r
307 if (!BgzfData::CheckBlockHeader(header)) {
\r
308 printf("BGZF ERROR: read block failed - invalid block header\n");
\r
312 int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;
\r
313 char* compressedBlock = CompressedBlock;
\r
314 memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
\r
315 int remaining = blockLength - BLOCK_HEADER_LENGTH;
\r
317 count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);
\r
318 if (count != remaining) {
\r
319 printf("BGZF ERROR: read block failed - could not read data from block\n");
\r
323 count = InflateBlock(blockLength);
\r
325 printf("BGZF ERROR: read block failed - could not decompress block data\n");
\r
329 if ( BlockLength != 0 )
\r
332 BlockAddress = blockAddress;
\r
333 BlockLength = count;
\r
337 // seek to position in BGZF file
\r
338 bool BgzfData::Seek(int64_t position) {
\r
340 int blockOffset = (position & 0xFFFF);
\r
341 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
\r
343 if (fseeko(Stream, blockAddress, SEEK_SET) != 0) {
\r
344 printf("BGZF ERROR: unable to seek in file\n");
\r
349 BlockAddress = blockAddress;
\r
350 BlockOffset = blockOffset;
\r
354 // get file position in BGZF file
\r
355 int64_t BgzfData::Tell(void) {
\r
356 return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
\r
359 // writes the supplied data into the BGZF buffer
\r
360 unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {
\r
363 unsigned int numBytesWritten = 0;
\r
364 const char* input = data;
\r
365 unsigned int blockLength = UncompressedBlockSize;
\r
367 // copy the data to the buffer
\r
368 while(numBytesWritten < dataLen) {
\r
370 unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);
\r
371 char* buffer = UncompressedBlock;
\r
372 memcpy(buffer + BlockOffset, input, copyLength);
\r
374 BlockOffset += copyLength;
\r
375 input += copyLength;
\r
376 numBytesWritten += copyLength;
\r
378 if(BlockOffset == blockLength)
\r
382 return numBytesWritten;
\r