1 // ***************************************************************************
2 // BamWriter (c) 2009 Michael Strömberg
3 // Marth Lab, Deptartment of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Provides the basic functionality for producing BAM files
7 // ***************************************************************************
12 BamWriter::BamWriter(void)
16 BamWriter::~BamWriter(void) {
17 if(mBGZF.IsOpen) BgzfClose();
20 // closes the BAM file
21 void BamWriter::BgzfClose(void) {
25 // flush the BGZF block
33 // compresses the current block
34 int BamWriter::BgzfDeflateBlock(void) {
36 // initialize the gzip header
37 char* buffer = mBGZF.CompressedBlock;
38 unsigned int bufferSize = mBGZF.CompressedBlockSize;
40 memset(buffer, 0, 18);
42 buffer[1] = (char)GZIP_ID2;
43 buffer[2] = CM_DEFLATE;
44 buffer[3] = FLG_FEXTRA;
45 buffer[9] = (char)OS_UNKNOWN;
46 buffer[10] = BGZF_XLEN;
47 buffer[12] = BGZF_ID1;
48 buffer[13] = BGZF_ID2;
49 buffer[14] = BGZF_LEN;
51 // loop to retry for blocks that do not compress enough
52 int inputLength = mBGZF.BlockOffset;
53 int compressedLength = 0;
60 zs.next_in = (Bytef*)mBGZF.UncompressedBlock;
61 zs.avail_in = inputLength;
62 zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
63 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
65 // initialize the zlib compression algorithm
66 if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
67 printf("ERROR: zlib deflate initialization failed.\n");
72 int status = deflate(&zs, Z_FINISH);
73 if(status != Z_STREAM_END) {
76 // reduce the input length and try again
80 printf("ERROR: input reduction failed.\n");
86 printf("ERROR: zlib deflate failed.\n");
90 // finalize the compression routine
91 if(deflateEnd(&zs) != Z_OK) {
92 printf("ERROR: deflate end failed.\n");
96 compressedLength = zs.total_out;
97 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
99 if(compressedLength > MAX_BLOCK_SIZE) {
100 printf("ERROR: deflate overflow.\n");
107 // store the compressed length
108 BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
110 // store the CRC32 checksum
111 unsigned int crc = crc32(0, NULL, 0);
112 crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength);
113 BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc);
114 BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength);
116 // ensure that we have less than a block of data left
117 int remaining = mBGZF.BlockOffset - inputLength;
119 if(remaining > inputLength) {
120 printf("ERROR: remainder too large.\n");
124 memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining);
127 mBGZF.BlockOffset = remaining;
128 return compressedLength;
131 // flushes the data in the BGZF block
132 void BamWriter::BgzfFlushBlock(void) {
134 // flush all of the remaining blocks
135 while(mBGZF.BlockOffset > 0) {
137 // compress the data block
138 int blockLength = BgzfDeflateBlock();
140 // flush the data to our output stream
141 int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream);
143 if(numBytesWritten != blockLength) {
144 printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
148 mBGZF.BlockAddress += blockLength;
152 // opens the BAM file for writing
153 void BamWriter::BgzfOpen(const string& filename) {
155 mBGZF.Stream = fopen(filename.c_str(), "wb");
158 printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str());
165 // writes the supplied data into the BGZF buffer
166 unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) {
169 unsigned int numBytesWritten = 0;
170 const char* input = data;
171 unsigned int blockLength = mBGZF.UncompressedBlockSize;
173 // copy the data to the buffer
174 while(numBytesWritten < dataLen) {
175 unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten);
176 char* buffer = mBGZF.UncompressedBlock;
177 memcpy(buffer + mBGZF.BlockOffset, input, copyLength);
179 mBGZF.BlockOffset += copyLength;
181 numBytesWritten += copyLength;
183 if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock();
186 return numBytesWritten;
189 // closes the alignment archive
190 void BamWriter::Close(void) {
191 if(mBGZF.IsOpen) BgzfClose();
194 // creates a cigar string from the supplied alignment
195 void BamWriter::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
198 const unsigned int numCigarOperations = cigarOperations.size();
199 packedCigar.resize(numCigarOperations * SIZEOF_INT);
201 // pack the cigar data into the string
202 unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
204 unsigned int cigarOp;
205 vector<CigarOp>::const_iterator coIter;
206 for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
208 switch(coIter->Type) {
210 cigarOp = BAM_CMATCH;
219 cigarOp = BAM_CREF_SKIP;
222 cigarOp = BAM_CSOFT_CLIP;
225 cigarOp = BAM_CHARD_CLIP;
231 printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
235 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
240 // encodes the supplied query sequence into 4-bit notation
241 void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) {
243 // prepare the encoded query string
244 const unsigned int queryLen = query.size();
245 const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
246 encodedQuery.resize(encodedQueryLen);
247 char* pEncodedQuery = (char*)encodedQuery.data();
248 const char* pQuery = (const char*)query.data();
250 unsigned char nucleotideCode;
251 bool useHighWord = true;
275 printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
279 // pack the nucleotide code
281 *pEncodedQuery = nucleotideCode << 4;
284 *pEncodedQuery |= nucleotideCode;
289 // increment the query position
294 // opens the alignment archive
295 void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
297 // open the BGZF file for writing
304 // write the BAM signature
305 const unsigned char SIGNATURE_LENGTH = 4;
306 const char* BAM_SIGNATURE = "BAM\1";
307 BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH);
309 // write the SAM header text length
310 const unsigned int samHeaderLen = samHeader.size();
311 BgzfWrite((char*)&samHeaderLen, SIZEOF_INT);
313 // write the SAM header text
314 if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen);
316 // write the number of reference sequences
317 const unsigned int numReferenceSequences = referenceSequences.size();
318 BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT);
320 // =============================
321 // write the sequence dictionary
322 // =============================
324 RefVector::const_iterator rsIter;
325 for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
327 // write the reference sequence name length
328 const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;
329 BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT);
331 // write the reference sequence name
332 BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen);
334 // write the reference sequence length
335 BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT);
339 // saves the alignment to the alignment archive
340 void BamWriter::SaveAlignment(const BamAlignment& al) {
343 const unsigned int nameLen = al.Name.size() + 1;
344 const unsigned int queryLen = al.QueryBases.size();
345 const unsigned int numCigarOperations = al.CigarData.size();
347 // create our packed cigar string
349 CreatePackedCigar(al.CigarData, packedCigar);
350 const unsigned int packedCigarLen = packedCigar.size();
354 EncodeQuerySequence(al.QueryBases, encodedQuery);
355 const unsigned int encodedQueryLen = encodedQuery.size();
357 // create our read group tag
358 // TODO: add read group tag support when it shows up in the BamAlignment struct
359 //string readGroupTag;
360 //const unsigned int readGroupTagLen = 3 + mReadGroupLUT[al.ReadGroupCode].NameLen + 1;
361 //readGroupTag.resize(readGroupTagLen);
362 //char* pReadGroupTag = (char*)readGroupTag.data();
363 //sprintf(pReadGroupTag, "RGZ%s", mReadGroupLUT[al.ReadGroupCode].Name);
365 // assign the BAM core data
366 unsigned int buffer[8];
367 buffer[0] = al.RefID;
368 buffer[1] = al.Position;
369 buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
370 buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
371 buffer[4] = queryLen;
372 buffer[5] = al.MateRefID;
373 buffer[6] = al.MatePosition;
374 buffer[7] = al.InsertSize;
376 // write the block size
377 const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen /* + readGroupTagLen*/;
378 const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
380 BgzfWrite((char*)&blockSize, SIZEOF_INT);
382 // write the BAM core
383 BgzfWrite((char*)&buffer, BAM_CORE_SIZE);
385 // write the query name
386 BgzfWrite(al.Name.c_str(), nameLen);
388 // write the packed cigar
389 BgzfWrite(packedCigar.data(), packedCigarLen);
391 // write the encoded query sequence
392 BgzfWrite(encodedQuery.data(), encodedQueryLen);
394 // write the base qualities
395 string baseQualities = al.Qualities;
396 char* pBaseQualities = (char*)al.Qualities.data();
397 for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33;
398 BgzfWrite(pBaseQualities, queryLen);
400 // write the read group tag
401 // TODO: add read group tag support when it shows up in the BamAlignment struct
402 //BgzfWrite(readGroupTag.c_str(), readGroupTagLen);