]> git.donarmstrong.com Git - bamtools.git/blob - src/api/internal/BgzfStream_p.cpp
Removed STDERR pollution by API
[bamtools.git] / src / api / internal / BgzfStream_p.cpp
1 // ***************************************************************************
2 // BgzfStream_p.cpp (c) 2011 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 6 October 2011(DB)
6 // ---------------------------------------------------------------------------
7 // Based on BGZF routines developed at the Broad Institute.
8 // Provides the basic functionality for reading & writing BGZF files
9 // Replaces the old BGZF.* files to avoid clashing with other toolkits
10 // ***************************************************************************
11
12 #include <api/internal/BamException_p.h>
13 #include <api/internal/BgzfStream_p.h>
14 using namespace BamTools;
15 using namespace BamTools::Internal;
16
17 #include <cstring>
18 #include <algorithm>
19 #include <sstream>
20 using namespace std;
21
22 // ----------------------------
23 // RaiiWrapper implementation
24 // ----------------------------
25
26 BgzfStream::RaiiWrapper::RaiiWrapper(void)
27     : Stream(0)
28 {
29     CompressedBlock   = new char[Constants::BGZF_MAX_BLOCK_SIZE];
30     UncompressedBlock = new char[Constants::BGZF_DEFAULT_BLOCK_SIZE];
31 }
32
33 BgzfStream::RaiiWrapper::~RaiiWrapper(void) {
34
35     // clean up buffers
36     delete[] CompressedBlock;
37     delete[] UncompressedBlock;
38     CompressedBlock = 0;
39     UncompressedBlock = 0;
40
41     if ( Stream ) {
42         fflush(Stream);
43         fclose(Stream);
44         Stream = 0;
45     }
46 }
47
48 // ---------------------------
49 // BgzfStream implementation
50 // ---------------------------
51
52 // constructor
53 BgzfStream::BgzfStream(void)
54     : BlockLength(0)
55     , BlockOffset(0)
56     , BlockAddress(0)
57     , IsOpen(false)
58     , IsWriteOnly(false)
59     , IsWriteCompressed(true)
60 { }
61
62 // destructor
63 BgzfStream::~BgzfStream(void) {
64     Close();
65 }
66
67 // checks BGZF block header
68 bool BgzfStream::CheckBlockHeader(char* header) {
69     return (header[0] == Constants::GZIP_ID1 &&
70             header[1] == Constants::GZIP_ID2 &&
71             header[2] == Z_DEFLATED &&
72             (header[3] & Constants::FLG_FEXTRA) != 0 &&
73             BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN &&
74             header[12] == Constants::BGZF_ID1 &&
75             header[13] == Constants::BGZF_ID2 &&
76             BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN );
77 }
78
79 // closes BGZF file
80 void BgzfStream::Close(void) {
81
82     // skip if file not open
83     if ( !IsOpen )
84         return;
85
86     // if writing to file, flush the current BGZF block,
87     // then write an empty block (as EOF marker)
88     if ( IsWriteOnly ) {
89         FlushBlock();
90         const size_t blockLength = DeflateBlock();
91         fwrite(Resources.CompressedBlock, 1, blockLength, Resources.Stream);
92     }
93
94     // flush and close stream
95     fflush(Resources.Stream);
96     fclose(Resources.Stream);
97     Resources.Stream = 0;
98
99     // reset initial state
100     BlockLength = 0;
101     BlockOffset = 0;
102     BlockAddress = 0;
103     IsOpen = false;
104     IsWriteOnly = false;
105     IsWriteCompressed = true;
106 }
107
108 // compresses the current block
109 size_t BgzfStream::DeflateBlock(void) {
110
111     // initialize the gzip header
112     char* buffer = Resources.CompressedBlock;
113     memset(buffer, 0, 18);
114     buffer[0]  = Constants::GZIP_ID1;
115     buffer[1]  = Constants::GZIP_ID2;
116     buffer[2]  = Constants::CM_DEFLATE;
117     buffer[3]  = Constants::FLG_FEXTRA;
118     buffer[9]  = Constants::OS_UNKNOWN;
119     buffer[10] = Constants::BGZF_XLEN;
120     buffer[12] = Constants::BGZF_ID1;
121     buffer[13] = Constants::BGZF_ID2;
122     buffer[14] = Constants::BGZF_LEN;
123
124     // set compression level
125     const int compressionLevel = ( IsWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
126
127     // loop to retry for blocks that do not compress enough
128     int inputLength = BlockOffset;
129     size_t compressedLength = 0;
130     const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE;
131
132     while ( true ) {
133
134         // initialize zstream values
135         z_stream zs;
136         zs.zalloc    = NULL;
137         zs.zfree     = NULL;
138         zs.next_in   = (Bytef*)Resources.UncompressedBlock;
139         zs.avail_in  = inputLength;
140         zs.next_out  = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH];
141         zs.avail_out = bufferSize -
142                        Constants::BGZF_BLOCK_HEADER_LENGTH -
143                        Constants::BGZF_BLOCK_FOOTER_LENGTH;
144
145         // initialize the zlib compression algorithm
146         int status = deflateInit2(&zs,
147                                   compressionLevel,
148                                   Z_DEFLATED,
149                                   Constants::GZIP_WINDOW_BITS,
150                                   Constants::Z_DEFAULT_MEM_LEVEL,
151                                   Z_DEFAULT_STRATEGY);
152         if ( status != Z_OK )
153             throw BamException("BgzfStream::DeflateBlock", "zlib deflateInit2 failed");
154
155         // compress the data
156         status = deflate(&zs, Z_FINISH);
157
158         // if not at stream end
159         if ( status != Z_STREAM_END ) {
160
161             deflateEnd(&zs);
162
163             // if error status
164             if ( status != Z_OK )
165                 throw BamException("BgzfStream::DeflateBlock", "zlib deflate failed");
166
167             // not enough space available in buffer
168             // try to reduce the input length & re-start loop
169             inputLength -= 1024;
170             if ( inputLength <= 0 )
171                 throw BamException("BgzfStream::DeflateBlock", "input reduction failed");
172             continue;
173         }
174
175         // finalize the compression routine
176         status = deflateEnd(&zs);
177         if ( status != Z_OK )
178             throw BamException("BgzfStream::DeflateBlock", "zlib deflateEnd failed");
179
180         // update compressedLength
181         compressedLength = zs.total_out +
182                            Constants::BGZF_BLOCK_HEADER_LENGTH +
183                            Constants::BGZF_BLOCK_FOOTER_LENGTH;
184         if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE )
185             throw BamException("BgzfStream::DeflateBlock", "deflate overflow");
186
187         // quit while loop
188         break;
189     }
190
191     // store the compressed length
192     BamTools::PackUnsignedShort(&buffer[16], static_cast<uint16_t>(compressedLength - 1));
193
194     // store the CRC32 checksum
195     uint32_t crc = crc32(0, NULL, 0);
196     crc = crc32(crc, (Bytef*)Resources.UncompressedBlock, inputLength);
197     BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc);
198     BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
199
200     // ensure that we have less than a block of data left
201     int remaining = BlockOffset - inputLength;
202     if ( remaining > 0 ) {
203         if ( remaining > inputLength )
204             throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large");
205         memcpy(Resources.UncompressedBlock, Resources.UncompressedBlock + inputLength, remaining);
206     }
207
208     // update block data & return compressedlength
209     BlockOffset = remaining;
210     return compressedLength;
211 }
212
213 // flushes the data in the BGZF block
214 void BgzfStream::FlushBlock(void) {
215
216     // flush all of the remaining blocks
217     while ( BlockOffset > 0 ) {
218
219         // compress the data block
220         const size_t blockLength = DeflateBlock();
221
222         // flush the data to our output stream
223         const size_t numBytesWritten = fwrite(Resources.CompressedBlock, 1, blockLength, Resources.Stream);
224         if ( numBytesWritten != blockLength ) {
225             stringstream s("");
226             s << "expected to write " << blockLength
227               << " bytes during flushing, but wrote " << numBytesWritten;
228             throw BamException("BgzfStream::FlushBlock", s.str());
229         }
230
231         // update block data
232         BlockAddress += blockLength;
233     }
234 }
235
236 // decompresses the current block
237 size_t BgzfStream::InflateBlock(const size_t& blockLength) {
238
239     // setup zlib stream object
240     z_stream zs;
241     zs.zalloc    = NULL;
242     zs.zfree     = NULL;
243     zs.next_in   = (Bytef*)Resources.CompressedBlock + 18;
244     zs.avail_in  = blockLength - 16;
245     zs.next_out  = (Bytef*)Resources.UncompressedBlock;
246     zs.avail_out = Constants::BGZF_DEFAULT_BLOCK_SIZE;
247
248     // initialize
249     int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
250     if ( status != Z_OK )
251         throw BamException("BgzfStream::InflateBlock", "zlib inflateInit failed");
252
253     // decompress
254     status = inflate(&zs, Z_FINISH);
255     if ( status != Z_STREAM_END ) {
256         inflateEnd(&zs);
257         throw BamException("BgzfStream::InflateBlock", "zlib inflate failed");
258     }
259
260     // finalize
261     status = inflateEnd(&zs);
262     if ( status != Z_OK ) {
263         inflateEnd(&zs);
264         throw BamException("BgzfStream::InflateBlock", "zlib inflateEnd failed");
265     }
266
267     // return result
268     return zs.total_out;
269 }
270
271 // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)
272 void BgzfStream::Open(const string& filename, const char* mode) {
273
274     // make sure we're starting with fresh state
275     if ( IsOpen )
276         Close();
277
278     // determine open mode
279     if ( strcmp(mode, "rb") == 0 )
280         IsWriteOnly = false;
281     else if ( strcmp(mode, "wb") == 0)
282         IsWriteOnly = true;
283     else {
284         const string message = string("unknown file mode: ") + mode;
285         throw BamException("BgzfStream::Open", message);
286     }
287
288     // open BGZF stream on a file
289     if ( (filename != "stdin") && (filename != "stdout") && (filename != "-"))
290         Resources.Stream = fopen(filename.c_str(), mode);
291
292     // open BGZF stream on stdin
293     else if ( (filename == "stdin" || filename == "-") && (strcmp(mode, "rb") == 0 ) )
294         Resources.Stream = freopen(NULL, mode, stdin);
295
296     // open BGZF stream on stdout
297     else if ( (filename == "stdout" || filename == "-") && (strcmp(mode, "wb") == 0) )
298         Resources.Stream = freopen(NULL, mode, stdout);
299
300     // ensure valid Stream
301     if ( !Resources.Stream ) {
302         const string message = string("unable to open file: ") + filename;
303         throw BamException("BgzfStream::Open", message);
304     }
305
306     // set flag
307     IsOpen = true;
308 }
309
310 // reads BGZF data into a byte buffer
311 size_t BgzfStream::Read(char* data, const size_t dataLength) {
312
313     // if stream not open for reading (or empty request)
314     if ( !IsOpen || IsWriteOnly || dataLength == 0 )
315         return 0;
316
317     // read blocks as needed until desired data length is retrieved
318     size_t numBytesRead = 0;
319     while ( numBytesRead < dataLength ) {
320
321         // determine bytes available in current block
322         int bytesAvailable = BlockLength - BlockOffset;
323
324         // read (and decompress) next block if needed
325         if ( bytesAvailable <= 0 ) {
326             ReadBlock();
327             bytesAvailable = BlockLength - BlockOffset;
328             if ( bytesAvailable <= 0 )
329                 break;
330         }
331
332         // copy data from uncompressed source buffer into data destination buffer
333         const size_t copyLength = min( (dataLength-numBytesRead), (size_t)bytesAvailable );
334         memcpy(data, Resources.UncompressedBlock + BlockOffset, copyLength);
335
336         // update counters
337         BlockOffset  += copyLength;
338         data         += copyLength;
339         numBytesRead += copyLength;
340     }
341
342     // update block data
343     if ( BlockOffset == BlockLength ) {
344         BlockAddress = ftell64(Resources.Stream);
345         BlockOffset  = 0;
346         BlockLength  = 0;
347     }
348
349     // return actual number of bytes read
350     return numBytesRead;
351 }
352
353 // reads a BGZF block
354 void BgzfStream::ReadBlock(void) {
355
356     // store block start
357     int64_t blockAddress = ftell64(Resources.Stream);
358
359     // read block header from file
360     char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
361     size_t count = fread(header, 1, Constants::BGZF_BLOCK_HEADER_LENGTH, Resources.Stream);
362
363     // if block header empty, set marker & skip rest of method
364     if ( count == 0 ) {
365         BlockLength = 0;
366         return;
367     }
368
369     // if block header invalid size
370     if ( count != sizeof(header) )
371         throw BamException("BgzfStream::ReadBlock", "invalid block header size");
372
373     // validate block header contents
374     if ( !BgzfStream::CheckBlockHeader(header) )
375         throw BamException("BgzfStream::ReadBlock", "invalid block header contents");
376
377     // copy header contents to compressed buffer
378     const size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
379     memcpy(Resources.CompressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
380
381     // read remainder of block
382     const size_t remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
383     count = fread(&Resources.CompressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], 1, remaining, Resources.Stream);
384     if ( count != remaining )
385         throw BamException("BgzfStream::ReadBlock", "could not read data from block");
386
387     // decompress block data
388     count = InflateBlock(blockLength);
389
390     // update block metadata
391     if ( BlockLength != 0 )
392         BlockOffset = 0;
393     BlockAddress = blockAddress;
394     BlockLength  = count;
395 }
396
397 // seek to position in BGZF file
398 void BgzfStream::Seek(const int64_t& position) {
399
400     // determine adjusted offset & address
401     int     blockOffset  = (position & 0xFFFF);
402     int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
403
404     // attempt seek in file
405     if ( fseek64(Resources.Stream, blockAddress, SEEK_SET) != 0 ) {
406         stringstream s("");
407         s << "unable to seek to position: " << position;
408         throw BamException("BgzfStream::Seek", s.str());
409     }
410
411     // if successful, update block metadata
412     BlockLength  = 0;
413     BlockAddress = blockAddress;
414     BlockOffset  = blockOffset;
415 }
416
417 void BgzfStream::SetWriteCompressed(bool ok) {
418     IsWriteCompressed = ok;
419 }
420
421 // get file position in BGZF file
422 int64_t BgzfStream::Tell(void) const {
423     if ( !IsOpen )
424         return 0;
425     return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
426 }
427
428 // writes the supplied data into the BGZF buffer
429 size_t BgzfStream::Write(const char* data, const size_t dataLength) {
430
431     // skip if file not open for writing
432     if ( !IsOpen || !IsWriteOnly )
433         return false;
434
435     // write blocks as needed til all data is written
436     size_t numBytesWritten = 0;
437     const char* input = data;
438     const size_t blockLength = Constants::BGZF_DEFAULT_BLOCK_SIZE;
439     while ( numBytesWritten < dataLength ) {
440
441         // copy data contents to uncompressed output buffer
442         const size_t copyLength = min(blockLength - BlockOffset, dataLength - numBytesWritten);
443         char* buffer = Resources.UncompressedBlock;
444         memcpy(buffer + BlockOffset, input, copyLength);
445
446         // update counters
447         BlockOffset     += copyLength;
448         input           += copyLength;
449         numBytesWritten += copyLength;
450
451         // flush (& compress) output buffer when full
452         if ( BlockOffset == blockLength )
453             FlushBlock();
454     }
455
456     // return actual number of bytes written
457     return numBytesWritten;
458 }