1 // ***************************************************************************
\r
2 // BGZF.cpp (c) 2009 Derek Barnett, Michael Strömberg
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 11 January 2010 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // BGZF routines were adapted from the bgzf.c code developed at the Broad
\r
10 // ---------------------------------------------------------------------------
\r
11 // Provides the basic functionality for reading & writing BGZF files
\r
12 // ***************************************************************************
\r
14 #include <algorithm>
\r
16 using namespace BamTools;
\r
20 BgzfData::BgzfData(void)
\r
21 : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
\r
22 , CompressedBlockSize(MAX_BLOCK_SIZE)
\r
27 , IsWriteOnly(false)
\r
29 , UncompressedBlock(NULL)
\r
30 , CompressedBlock(NULL)
\r
33 CompressedBlock = new char[CompressedBlockSize];
\r
34 UncompressedBlock = new char[UncompressedBlockSize];
\r
35 } catch( std::bad_alloc& ba ) {
\r
36 printf("ERROR: Unable to allocate memory for our BGZF object.\n");
\r
42 BgzfData::~BgzfData(void) {
\r
43 if(CompressedBlock) { delete[] CompressedBlock; }
\r
44 if(UncompressedBlock) { delete[] UncompressedBlock; }
\r
48 void BgzfData::Close(void) {
\r
50 // skip if file not open, otherwise set flag
\r
51 if (!IsOpen) { return; }
\r
54 // flush the current BGZF block
\r
55 if (IsWriteOnly) { FlushBlock(); }
\r
57 // write an empty block (as EOF marker)
\r
58 int blockLength = DeflateBlock();
\r
59 fwrite(CompressedBlock, 1, blockLength, Stream);
\r
66 // compresses the current block
\r
67 int BgzfData::DeflateBlock(void) {
\r
69 // initialize the gzip header
\r
70 char* buffer = CompressedBlock;
\r
71 memset(buffer, 0, 18);
\r
72 buffer[0] = GZIP_ID1;
\r
73 buffer[1] = (char)GZIP_ID2;
\r
74 buffer[2] = CM_DEFLATE;
\r
75 buffer[3] = FLG_FEXTRA;
\r
76 buffer[9] = (char)OS_UNKNOWN;
\r
77 buffer[10] = BGZF_XLEN;
\r
78 buffer[12] = BGZF_ID1;
\r
79 buffer[13] = BGZF_ID2;
\r
80 buffer[14] = BGZF_LEN;
\r
82 // loop to retry for blocks that do not compress enough
\r
83 int inputLength = BlockOffset;
\r
84 int compressedLength = 0;
\r
85 unsigned int bufferSize = CompressedBlockSize;
\r
89 // initialize zstream values
\r
93 zs.next_in = (Bytef*)UncompressedBlock;
\r
94 zs.avail_in = inputLength;
\r
95 zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
\r
96 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
\r
98 // initialize the zlib compression algorithm
\r
99 if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
\r
100 printf("ERROR: zlib deflate initialization failed.\n");
\r
104 // compress the data
\r
105 int status = deflate(&zs, Z_FINISH);
\r
106 if(status != Z_STREAM_END) {
\r
110 // reduce the input length and try again
\r
111 if(status == Z_OK) {
\r
112 inputLength -= 1024;
\r
113 if(inputLength < 0) {
\r
114 printf("ERROR: input reduction failed.\n");
\r
120 printf("ERROR: zlib deflate failed.\n");
\r
124 // finalize the compression routine
\r
125 if(deflateEnd(&zs) != Z_OK) {
\r
126 printf("ERROR: deflate end failed.\n");
\r
130 compressedLength = zs.total_out;
\r
131 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
\r
133 if(compressedLength > MAX_BLOCK_SIZE) {
\r
134 printf("ERROR: deflate overflow.\n");
\r
141 // store the compressed length
\r
142 BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
\r
144 // store the CRC32 checksum
\r
145 unsigned int crc = crc32(0, NULL, 0);
\r
146 crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);
\r
147 BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);
\r
148 BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
\r
150 // ensure that we have less than a block of data left
\r
151 int remaining = BlockOffset - inputLength;
\r
152 if(remaining > 0) {
\r
153 if(remaining > inputLength) {
\r
154 printf("ERROR: remainder too large.\n");
\r
157 memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);
\r
160 BlockOffset = remaining;
\r
161 return compressedLength;
\r
164 // flushes the data in the BGZF block
\r
165 void BgzfData::FlushBlock(void) {
\r
167 // flush all of the remaining blocks
\r
168 while(BlockOffset > 0) {
\r
170 // compress the data block
\r
171 int blockLength = DeflateBlock();
\r
173 // flush the data to our output stream
\r
174 int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);
\r
176 if(numBytesWritten != blockLength) {
\r
177 printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
\r
181 BlockAddress += blockLength;
\r
185 // de-compresses the current block
\r
186 int BgzfData::InflateBlock(const int& blockLength) {
\r
188 // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
\r
192 zs.next_in = (Bytef*)CompressedBlock + 18;
\r
193 zs.avail_in = blockLength - 16;
\r
194 zs.next_out = (Bytef*)UncompressedBlock;
\r
195 zs.avail_out = UncompressedBlockSize;
\r
197 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
\r
198 if (status != Z_OK) {
\r
199 printf("inflateInit failed\n");
\r
203 status = inflate(&zs, Z_FINISH);
\r
204 if (status != Z_STREAM_END) {
\r
206 printf("inflate failed\n");
\r
210 status = inflateEnd(&zs);
\r
211 if (status != Z_OK) {
\r
212 printf("inflateEnd failed\n");
\r
216 return zs.total_out;
\r
219 void BgzfData::Open(const string& filename, const char* mode) {
\r
221 // determine open mode
\r
222 if ( strcmp(mode, "rb") == 0 ) {
\r
223 IsWriteOnly = false;
\r
224 } else if ( strcmp(mode, "wb") == 0) {
\r
225 IsWriteOnly = true;
\r
227 printf("ERROR: Unknown file mode: %s\n", mode);
\r
231 // open Stream to read to/write from file, stdin, or stdout
\r
232 // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03)
\r
233 if ( (filename != "stdin") && (filename != "stdout") ) {
\r
234 // read/wrtie BGZF data to/from a file
\r
235 Stream = fopen(filename.c_str(), mode);
\r
237 else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) {
\r
238 // read BGZF data from stdin
\r
239 Stream = freopen(NULL, mode, stdin);
\r
241 else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) {
\r
242 // write BGZF data to stdout
\r
243 Stream = freopen(NULL, mode, stdout);
\r
247 printf("ERROR: Unable to open the BAM file %s\n", filename.c_str() );
\r
253 int BgzfData::Read(char* data, const unsigned int dataLength) {
\r
255 if (dataLength == 0) { return 0; }
\r
257 char* output = data;
\r
258 unsigned int numBytesRead = 0;
\r
259 while (numBytesRead < dataLength) {
\r
261 int bytesAvailable = BlockLength - BlockOffset;
\r
262 if (bytesAvailable <= 0) {
\r
263 if ( ReadBlock() != 0 ) { return -1; }
\r
264 bytesAvailable = BlockLength - BlockOffset;
\r
265 if ( bytesAvailable <= 0 ) { break; }
\r
268 char* buffer = UncompressedBlock;
\r
269 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
\r
270 memcpy(output, buffer + BlockOffset, copyLength);
\r
272 BlockOffset += copyLength;
\r
273 output += copyLength;
\r
274 numBytesRead += copyLength;
\r
277 if ( BlockOffset == BlockLength ) {
\r
278 BlockAddress = ftell(Stream);
\r
283 return numBytesRead;
\r
286 int BgzfData::ReadBlock(void) {
\r
288 char header[BLOCK_HEADER_LENGTH];
\r
289 int64_t blockAddress = ftell(Stream);
\r
291 int count = fread(header, 1, sizeof(header), Stream);
\r
297 if (count != sizeof(header)) {
\r
298 printf("read block failed - count != sizeof(header)\n");
\r
302 if (!BgzfData::CheckBlockHeader(header)) {
\r
303 printf("read block failed - CheckBlockHeader() returned false\n");
\r
307 int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;
\r
308 char* compressedBlock = CompressedBlock;
\r
309 memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
\r
310 int remaining = blockLength - BLOCK_HEADER_LENGTH;
\r
312 count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);
\r
313 if (count != remaining) {
\r
314 printf("read block failed - count != remaining\n");
\r
318 count = InflateBlock(blockLength);
\r
319 if (count < 0) { return -1; }
\r
321 if ( BlockLength != 0 ) {
\r
325 BlockAddress = blockAddress;
\r
326 BlockLength = count;
\r
330 bool BgzfData::Seek(int64_t position) {
\r
332 int blockOffset = (position & 0xFFFF);
\r
333 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
\r
335 if (fseek(Stream, blockAddress, SEEK_SET) != 0) {
\r
336 printf("ERROR: Unable to seek in BAM file\n");
\r
341 BlockAddress = blockAddress;
\r
342 BlockOffset = blockOffset;
\r
346 int64_t BgzfData::Tell(void) {
\r
347 return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
\r
350 // writes the supplied data into the BGZF buffer
\r
351 unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {
\r
354 unsigned int numBytesWritten = 0;
\r
355 const char* input = data;
\r
356 unsigned int blockLength = UncompressedBlockSize;
\r
358 // copy the data to the buffer
\r
359 while(numBytesWritten < dataLen) {
\r
360 unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);
\r
361 char* buffer = UncompressedBlock;
\r
362 memcpy(buffer + BlockOffset, input, copyLength);
\r
364 BlockOffset += copyLength;
\r
365 input += copyLength;
\r
366 numBytesWritten += copyLength;
\r
368 if(BlockOffset == blockLength) {
\r
373 return numBytesWritten;
\r