BamWriter.cpp

   1 // ***************************************************************************
   2 // BamWriter (c) 2009 Michael Strömberg
   3 // Marth Lab, Deptartment of Biology, Boston College
   4 // All rights reserved.
   5 // ---------------------------------------------------------------------------
   6 // The BGZF routines were adapted from the bgzf.c code developed at the Broad
   7 // Institute.
   8 // ---------------------------------------------------------------------------
   9 // Provides the basic functionality for producing BAM files
  10 // ***************************************************************************
  11
  12 #include "BamWriter.h"
  13
  14 // constructor
  15 BamWriter::BamWriter(void)
  16 {}
  17
  18 // destructor
  19 BamWriter::~BamWriter(void) {
  20         if(mBGZF.IsOpen) BgzfClose();
  21 }
  22
  23 // closes the BAM file
  24 void BamWriter::BgzfClose(void) {
  25
  26         mBGZF.IsOpen = false;
  27
  28         // flush the BGZF block
  29         BgzfFlushBlock();
  30
  31         // flush and close
  32         fflush(mBGZF.Stream);
  33         fclose(mBGZF.Stream);
  34 }
  35
  36 // compresses the current block
  37 int BamWriter::BgzfDeflateBlock(void) {
  38
  39         // initialize the gzip header
  40         char* buffer = mBGZF.CompressedBlock;
  41         unsigned int bufferSize = mBGZF.CompressedBlockSize;
  42
  43         memset(buffer, 0, 18);
  44         buffer[0]  = GZIP_ID1;
  45         buffer[1]  = (char)GZIP_ID2;
  46         buffer[2]  = CM_DEFLATE;
  47         buffer[3]  = FLG_FEXTRA;
  48         buffer[9]  = (char)OS_UNKNOWN;
  49         buffer[10] = BGZF_XLEN;
  50         buffer[12] = BGZF_ID1;
  51         buffer[13] = BGZF_ID2;
  52         buffer[14] = BGZF_LEN;
  53
  54         // loop to retry for blocks that do not compress enough
  55         int inputLength = mBGZF.BlockOffset;
  56         int compressedLength = 0;
  57
  58         while(true) {
  59
  60                 z_stream zs;
  61                 zs.zalloc    = NULL;
  62                 zs.zfree     = NULL;
  63                 zs.next_in   = (Bytef*)mBGZF.UncompressedBlock;
  64                 zs.avail_in  = inputLength;
  65                 zs.next_out  = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
  66                 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
  67
  68                 // initialize the zlib compression algorithm
  69                 if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
  70                         printf("ERROR: zlib deflate initialization failed.\n");
  71                         exit(1);
  72                 }
  73
  74                 // compress the data
  75                 int status = deflate(&zs, Z_FINISH);
  76                 if(status != Z_STREAM_END) {
  77                         deflateEnd(&zs);
  78
  79                         // reduce the input length and try again
  80                         if(status == Z_OK) {
  81                                 inputLength -= 1024;
  82                                 if(inputLength < 0) {
  83                                         printf("ERROR: input reduction failed.\n");
  84                                         exit(1);
  85                                 }
  86                                 continue;
  87                         }
  88
  89                         printf("ERROR: zlib deflate failed.\n");
  90                         exit(1);
  91                 }
  92
  93                 // finalize the compression routine
  94                 if(deflateEnd(&zs) != Z_OK) {
  95                         printf("ERROR: deflate end failed.\n");
  96                         exit(1);
  97                 }
  98
  99                 compressedLength = zs.total_out;
 100                 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
 101
 102                 if(compressedLength > MAX_BLOCK_SIZE) {
 103                         printf("ERROR: deflate overflow.\n");
 104                         exit(1);
 105                 }
 106
 107                 break;
 108         }
 109
 110         // store the compressed length
 111         BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
 112
 113         // store the CRC32 checksum
 114         unsigned int crc = crc32(0, NULL, 0);
 115         crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength);
 116         BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc);
 117         BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength);
 118
 119         // ensure that we have less than a block of data left
 120         int remaining = mBGZF.BlockOffset - inputLength;
 121         if(remaining > 0) {
 122                 if(remaining > inputLength) {
 123                         printf("ERROR: remainder too large.\n");
 124                         exit(1);
 125                 }
 126
 127                 memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining);
 128         }
 129
 130         mBGZF.BlockOffset = remaining;
 131         return compressedLength;
 132 }
 133
 134 // flushes the data in the BGZF block
 135 void BamWriter::BgzfFlushBlock(void) {
 136
 137         // flush all of the remaining blocks
 138         while(mBGZF.BlockOffset > 0) {
 139
 140                 // compress the data block
 141                 int blockLength = BgzfDeflateBlock();
 142
 143                 // flush the data to our output stream
 144                 int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream);
 145
 146                 if(numBytesWritten != blockLength) {
 147                         printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
 148                         exit(1);
 149                 }
 150
 151                 mBGZF.BlockAddress += blockLength;
 152         }
 153 }
 154
 155 // opens the BAM file for writing
 156 void BamWriter::BgzfOpen(const string& filename) {
 157
 158         mBGZF.Stream = fopen(filename.c_str(), "wb");
 159
 160         if(!mBGZF.Stream) {
 161                 printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str());
 162                 exit(1);
 163         }
 164
 165         mBGZF.IsOpen = true;
 166 }
 167
 168 // writes the supplied data into the BGZF buffer
 169 unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) {
 170
 171         // initialize
 172         unsigned int numBytesWritten = 0;
 173         const char* input = data;
 174         unsigned int blockLength = mBGZF.UncompressedBlockSize;
 175
 176         // copy the data to the buffer
 177         while(numBytesWritten < dataLen) {
 178                 unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten);
 179                 char* buffer = mBGZF.UncompressedBlock;
 180                 memcpy(buffer + mBGZF.BlockOffset, input, copyLength);
 181
 182                 mBGZF.BlockOffset += copyLength;
 183                 input             += copyLength;
 184                 numBytesWritten   += copyLength;
 185
 186                 if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock();
 187         }
 188
 189         return numBytesWritten;
 190 }
 191
 192 // closes the alignment archive
 193 void BamWriter::Close(void) {
 194         if(mBGZF.IsOpen) BgzfClose();
 195 }
 196
 197 // creates a cigar string from the supplied alignment
 198 void BamWriter::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
 199
 200         // initialize
 201         const unsigned int numCigarOperations = cigarOperations.size();
 202         packedCigar.resize(numCigarOperations * SIZEOF_INT);
 203
 204         // pack the cigar data into the string
 205         unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
 206
 207         unsigned int cigarOp;
 208         vector<CigarOp>::const_iterator coIter;
 209         for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
 210
 211                 switch(coIter->Type) {
 212           case 'M':
 213                   cigarOp = BAM_CMATCH;
 214                   break;
 215           case 'I':
 216                   cigarOp = BAM_CINS;
 217                   break;
 218           case 'D':
 219                   cigarOp = BAM_CDEL;
 220                   break;
 221           case 'N':
 222                   cigarOp = BAM_CREF_SKIP;
 223                   break;
 224           case 'S':
 225                   cigarOp = BAM_CSOFT_CLIP;
 226                   break;
 227           case 'H':
 228                   cigarOp = BAM_CHARD_CLIP;
 229                   break;
 230           case 'P':
 231                   cigarOp = BAM_CPAD;
 232                   break;
 233           default:
 234                   printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
 235                   exit(1);
 236                 }
 237
 238                 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
 239                 pPackedCigar++;
 240         }
 241 }
 242
 243 // encodes the supplied query sequence into 4-bit notation
 244 void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) {
 245
 246         // prepare the encoded query string
 247         const unsigned int queryLen = query.size();
 248         const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
 249         encodedQuery.resize(encodedQueryLen);
 250         char* pEncodedQuery = (char*)encodedQuery.data();
 251         const char* pQuery = (const char*)query.data();
 252
 253         unsigned char nucleotideCode;
 254         bool useHighWord = true;
 255
 256         while(*pQuery) {
 257
 258                 switch(*pQuery) {
 259                         case '=':
 260                                 nucleotideCode = 0;
 261                                 break;
 262                         case 'A':
 263                                 nucleotideCode = 1;
 264                                 break;
 265                         case 'C':
 266                                 nucleotideCode = 2;
 267                                 break;
 268                         case 'G':
 269                                 nucleotideCode = 4;
 270                                 break;
 271                         case 'T':
 272                                 nucleotideCode = 8;
 273                                 break;
 274                         case 'N':
 275                                 nucleotideCode = 15;
 276                                 break;
 277                         default:
 278                                 printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
 279                                 exit(1);
 280                 }
 281
 282                 // pack the nucleotide code
 283                 if(useHighWord) {
 284                         *pEncodedQuery = nucleotideCode << 4;
 285                         useHighWord = false;
 286                 } else {
 287                         *pEncodedQuery |= nucleotideCode;
 288                         pEncodedQuery++;
 289                         useHighWord = true;
 290                 }
 291
 292                 // increment the query position
 293                 pQuery++;
 294         }
 295 }
 296
 297 // opens the alignment archive
 298 void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
 299
 300         // open the BGZF file for writing
 301         BgzfOpen(filename);
 302
 303         // ================
 304         // write the header
 305         // ================
 306
 307         // write the BAM signature
 308         const unsigned char SIGNATURE_LENGTH = 4;
 309         const char* BAM_SIGNATURE = "BAM\1";
 310         BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH);
 311
 312         // write the SAM header text length
 313         const unsigned int samHeaderLen = samHeader.size();
 314         BgzfWrite((char*)&samHeaderLen, SIZEOF_INT);
 315
 316         // write the SAM header text
 317         if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen);
 318
 319         // write the number of reference sequences
 320         const unsigned int numReferenceSequences = referenceSequences.size();
 321         BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT);
 322
 323         // =============================
 324         // write the sequence dictionary
 325         // =============================
 326
 327         RefVector::const_iterator rsIter;
 328         for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
 329
 330                 // write the reference sequence name length
 331                 const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;
 332                 BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT);
 333
 334                 // write the reference sequence name
 335                 BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen);
 336
 337                 // write the reference sequence length
 338                 BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT);
 339         }
 340 }
 341
 342 // saves the alignment to the alignment archive
 343 void BamWriter::SaveAlignment(const BamAlignment& al) {
 344
 345         // initialize
 346         const unsigned int nameLen            = al.Name.size() + 1;
 347         const unsigned int queryLen           = al.QueryBases.size();
 348         const unsigned int numCigarOperations = al.CigarData.size();
 349
 350         // create our packed cigar string
 351         string packedCigar;
 352         CreatePackedCigar(al.CigarData, packedCigar);
 353         const unsigned int packedCigarLen = packedCigar.size();
 354
 355         // encode the query
 356         string encodedQuery;
 357         EncodeQuerySequence(al.QueryBases, encodedQuery);
 358         const unsigned int encodedQueryLen = encodedQuery.size();
 359
 360         // store the tag data length
 361         const unsigned int tagDataLength = al.TagData.size();
 362
 363         // assign the BAM core data
 364         unsigned int buffer[8];
 365         buffer[0] = al.RefID;
 366         buffer[1] = al.Position;
 367         buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
 368         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
 369         buffer[4] = queryLen;
 370         buffer[5] = al.MateRefID;
 371         buffer[6] = al.MatePosition;
 372         buffer[7] = al.InsertSize;
 373
 374         // write the block size
 375         const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;
 376         const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
 377
 378         BgzfWrite((char*)&blockSize, SIZEOF_INT);
 379
 380         // write the BAM core
 381         BgzfWrite((char*)&buffer, BAM_CORE_SIZE);
 382
 383         // write the query name
 384         BgzfWrite(al.Name.c_str(), nameLen);
 385
 386         // write the packed cigar
 387         BgzfWrite(packedCigar.data(), packedCigarLen);
 388
 389         // write the encoded query sequence
 390         BgzfWrite(encodedQuery.data(), encodedQueryLen);
 391
 392         // write the base qualities
 393         string baseQualities = al.Qualities;
 394         char* pBaseQualities = (char*)al.Qualities.data();
 395         for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33;
 396         BgzfWrite(pBaseQualities, queryLen);
 397
 398         // write the read group tag
 399         BgzfWrite(al.TagData.data(), tagDataLength);
 400 }