1 // ***************************************************************************
\r
2 // BGZF.cpp (c) 2009 Derek Barnett, Michael Strömberg
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 8 December 2009 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // BGZF routines were adapted from the bgzf.c code developed at the Broad
\r
10 // ---------------------------------------------------------------------------
\r
11 // Provides the basic functionality for reading & writing BGZF files
\r
12 // ***************************************************************************
\r
14 #include <algorithm>
\r
16 using namespace BamTools;
\r
20 BgzfData::BgzfData(void)
\r
21 : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
\r
22 , CompressedBlockSize(MAX_BLOCK_SIZE)
\r
27 , IsWriteOnly(false)
\r
29 , UncompressedBlock(NULL)
\r
30 , CompressedBlock(NULL)
\r
33 CompressedBlock = new char[CompressedBlockSize];
\r
34 UncompressedBlock = new char[UncompressedBlockSize];
\r
35 } catch( std::bad_alloc& ba ) {
\r
36 printf("ERROR: Unable to allocate memory for our BGZF object.\n");
\r
42 BgzfData::~BgzfData(void) {
\r
43 if(CompressedBlock) delete [] CompressedBlock;
\r
44 if(UncompressedBlock) delete [] UncompressedBlock;
\r
48 void BgzfData::Close(void) {
\r
50 if (!IsOpen) { return; }
\r
53 // flush the BGZF block
\r
54 if ( IsWriteOnly ) { FlushBlock(); }
\r
61 // compresses the current block
\r
62 int BgzfData::DeflateBlock(void) {
\r
64 // initialize the gzip header
\r
65 char* buffer = CompressedBlock;
\r
66 unsigned int bufferSize = CompressedBlockSize;
\r
68 memset(buffer, 0, 18);
\r
69 buffer[0] = GZIP_ID1;
\r
70 buffer[1] = (char)GZIP_ID2;
\r
71 buffer[2] = CM_DEFLATE;
\r
72 buffer[3] = FLG_FEXTRA;
\r
73 buffer[9] = (char)OS_UNKNOWN;
\r
74 buffer[10] = BGZF_XLEN;
\r
75 buffer[12] = BGZF_ID1;
\r
76 buffer[13] = BGZF_ID2;
\r
77 buffer[14] = BGZF_LEN;
\r
79 // loop to retry for blocks that do not compress enough
\r
80 int inputLength = BlockOffset;
\r
81 int compressedLength = 0;
\r
88 zs.next_in = (Bytef*)UncompressedBlock;
\r
89 zs.avail_in = inputLength;
\r
90 zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
\r
91 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
\r
93 // initialize the zlib compression algorithm
\r
94 if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
\r
95 printf("ERROR: zlib deflate initialization failed.\n");
\r
99 // compress the data
\r
100 int status = deflate(&zs, Z_FINISH);
\r
101 if(status != Z_STREAM_END) {
\r
105 // reduce the input length and try again
\r
106 if(status == Z_OK) {
\r
107 inputLength -= 1024;
\r
108 if(inputLength < 0) {
\r
109 printf("ERROR: input reduction failed.\n");
\r
115 printf("ERROR: zlib deflate failed.\n");
\r
119 // finalize the compression routine
\r
120 if(deflateEnd(&zs) != Z_OK) {
\r
121 printf("ERROR: deflate end failed.\n");
\r
125 compressedLength = zs.total_out;
\r
126 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
\r
128 if(compressedLength > MAX_BLOCK_SIZE) {
\r
129 printf("ERROR: deflate overflow.\n");
\r
136 // store the compressed length
\r
137 BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
\r
139 // store the CRC32 checksum
\r
140 unsigned int crc = crc32(0, NULL, 0);
\r
141 crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);
\r
142 BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);
\r
143 BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
\r
145 // ensure that we have less than a block of data left
\r
146 int remaining = BlockOffset - inputLength;
\r
147 if(remaining > 0) {
\r
148 if(remaining > inputLength) {
\r
149 printf("ERROR: remainder too large.\n");
\r
152 memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);
\r
155 BlockOffset = remaining;
\r
156 return compressedLength;
\r
159 // flushes the data in the BGZF block
\r
160 void BgzfData::FlushBlock(void) {
\r
162 // flush all of the remaining blocks
\r
163 while(BlockOffset > 0) {
\r
165 // compress the data block
\r
166 int blockLength = DeflateBlock();
\r
168 // flush the data to our output stream
\r
169 int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);
\r
171 if(numBytesWritten != blockLength) {
\r
172 printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
\r
176 BlockAddress += blockLength;
\r
180 // de-compresses the current block
\r
181 int BgzfData::InflateBlock(const int& blockLength) {
\r
183 // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
\r
187 zs.next_in = (Bytef*)CompressedBlock + 18;
\r
188 zs.avail_in = blockLength - 16;
\r
189 zs.next_out = (Bytef*)UncompressedBlock;
\r
190 zs.avail_out = UncompressedBlockSize;
\r
192 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
\r
193 if (status != Z_OK) {
\r
194 printf("inflateInit failed\n");
\r
198 status = inflate(&zs, Z_FINISH);
\r
199 if (status != Z_STREAM_END) {
\r
201 printf("inflate failed\n");
\r
205 status = inflateEnd(&zs);
\r
206 if (status != Z_OK) {
\r
207 printf("inflateEnd failed\n");
\r
211 return zs.total_out;
\r
214 void BgzfData::Open(const string& filename, const char* mode) {
\r
216 if ( strcmp(mode, "rb") == 0 ) {
\r
217 IsWriteOnly = false;
\r
218 } else if ( strcmp(mode, "wb") == 0) {
\r
219 IsWriteOnly = true;
\r
221 printf("ERROR: Unknown file mode: %s\n", mode);
\r
225 Stream = fopen(filename.c_str(), mode);
\r
227 printf("ERROR: Unable to open the BAM file %s\n", filename.c_str() );
\r
233 int BgzfData::Read(char* data, const unsigned int dataLength) {
\r
235 if (dataLength == 0) { return 0; }
\r
237 char* output = data;
\r
238 unsigned int numBytesRead = 0;
\r
239 while (numBytesRead < dataLength) {
\r
241 int bytesAvailable = BlockLength - BlockOffset;
\r
242 if (bytesAvailable <= 0) {
\r
243 if ( ReadBlock() != 0 ) { return -1; }
\r
244 bytesAvailable = BlockLength - BlockOffset;
\r
245 if ( bytesAvailable <= 0 ) { break; }
\r
248 char* buffer = UncompressedBlock;
\r
249 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
\r
250 memcpy(output, buffer + BlockOffset, copyLength);
\r
252 BlockOffset += copyLength;
\r
253 output += copyLength;
\r
254 numBytesRead += copyLength;
\r
257 if ( BlockOffset == BlockLength ) {
\r
258 BlockAddress = ftell(Stream);
\r
263 return numBytesRead;
\r
266 int BgzfData::ReadBlock(void) {
\r
268 char header[BLOCK_HEADER_LENGTH];
\r
269 int64_t blockAddress = ftell(Stream);
\r
271 int count = fread(header, 1, sizeof(header), Stream);
\r
277 if (count != sizeof(header)) {
\r
278 printf("read block failed - count != sizeof(header)\n");
\r
282 if (!BgzfData::CheckBlockHeader(header)) {
\r
283 printf("read block failed - CheckBlockHeader() returned false\n");
\r
287 int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;
\r
288 char* compressedBlock = CompressedBlock;
\r
289 memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
\r
290 int remaining = blockLength - BLOCK_HEADER_LENGTH;
\r
292 count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);
\r
293 if (count != remaining) {
\r
294 printf("read block failed - count != remaining\n");
\r
298 count = InflateBlock(blockLength);
\r
299 if (count < 0) { return -1; }
\r
301 if ( BlockLength != 0 ) {
\r
305 BlockAddress = blockAddress;
\r
306 BlockLength = count;
\r
310 bool BgzfData::Seek(int64_t position) {
\r
312 int blockOffset = (position & 0xFFFF);
\r
313 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
\r
315 if (fseek(Stream, blockAddress, SEEK_SET) != 0) {
\r
316 printf("ERROR: Unable to seek in BAM file\n");
\r
321 BlockAddress = blockAddress;
\r
322 BlockOffset = blockOffset;
\r
326 int64_t BgzfData::Tell(void) {
\r
327 return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
\r
330 // writes the supplied data into the BGZF buffer
\r
331 unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {
\r
334 unsigned int numBytesWritten = 0;
\r
335 const char* input = data;
\r
336 unsigned int blockLength = UncompressedBlockSize;
\r
338 // copy the data to the buffer
\r
339 while(numBytesWritten < dataLen) {
\r
340 unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);
\r
341 char* buffer = UncompressedBlock;
\r
342 memcpy(buffer + BlockOffset, input, copyLength);
\r
344 BlockOffset += copyLength;
\r
345 input += copyLength;
\r
346 numBytesWritten += copyLength;
\r
348 if(BlockOffset == blockLength) {
\r
353 return numBytesWritten;
\r